1 /*****************************************************************************
2 
3 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2015, 2021, MariaDB Corporation.
5 
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9 
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 
18 *****************************************************************************/
19 
20 /**************************************************//**
21 @file trx/trx0trx.cc
22 The transaction
23 
24 Created 3/26/1996 Heikki Tuuri
25 *******************************************************/
26 
27 #include "trx0trx.h"
28 
29 #ifdef WITH_WSREP
30 #include <mysql/service_wsrep.h>
31 #endif
32 
33 #include <mysql/service_thd_error_context.h>
34 
35 #include "btr0sea.h"
36 #include "lock0lock.h"
37 #include "log0log.h"
38 #include "que0que.h"
39 #include "srv0mon.h"
40 #include "srv0srv.h"
41 #include "srv0start.h"
42 #include "trx0purge.h"
43 #include "trx0rec.h"
44 #include "trx0roll.h"
45 #include "trx0rseg.h"
46 #include "trx0undo.h"
47 #include "trx0xa.h"
48 #include "ut0pool.h"
49 #include "ut0vec.h"
50 
51 #include <set>
52 #include <new>
53 
54 /** The bit pattern corresponding to TRX_ID_MAX */
55 const byte trx_id_max_bytes[8] = {
56 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
57 };
58 
59 /** The bit pattern corresponding to max timestamp */
60 const byte timestamp_max_bytes[7] = {
61 	0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f
62 };
63 
64 
65 static const ulint MAX_DETAILED_ERROR_LEN = 256;
66 
67 /** Set of table_id */
68 typedef std::set<
69 	table_id_t,
70 	std::less<table_id_t>,
71 	ut_allocator<table_id_t> >	table_id_set;
72 
73 /*************************************************************//**
74 Set detailed error message for the transaction. */
75 void
trx_set_detailed_error(trx_t * trx,const char * msg)76 trx_set_detailed_error(
77 /*===================*/
78 	trx_t*		trx,	/*!< in: transaction struct */
79 	const char*	msg)	/*!< in: detailed error message */
80 {
81 	strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1);
82 	trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0';
83 }
84 
85 /*************************************************************//**
86 Set detailed error message for the transaction from a file. Note that the
87 file is rewinded before reading from it. */
88 void
trx_set_detailed_error_from_file(trx_t * trx,FILE * file)89 trx_set_detailed_error_from_file(
90 /*=============================*/
91 	trx_t*	trx,	/*!< in: transaction struct */
92 	FILE*	file)	/*!< in: file to read message from */
93 {
94 	os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN);
95 }
96 
97 /********************************************************************//**
98 Initialize transaction object.
99 @param trx trx to initialize */
100 static
101 void
trx_init(trx_t * trx)102 trx_init(
103 /*=====*/
104 	trx_t*	trx)
105 {
106 	trx->state = TRX_STATE_NOT_STARTED;
107 
108 	trx->is_recovered = false;
109 
110 	trx->op_info = "";
111 
112 	trx->active_commit_ordered = false;
113 
114 	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
115 
116 	trx->check_foreigns = true;
117 
118 	trx->check_unique_secondary = true;
119 
120 	trx->lock.n_rec_locks = 0;
121 
122 	trx->dict_operation = TRX_DICT_OP_NONE;
123 
124 	trx->table_id = 0;
125 
126 	trx->error_state = DB_SUCCESS;
127 
128 	trx->error_key_num = ULINT_UNDEFINED;
129 
130 	trx->undo_no = 0;
131 
132 	trx->rsegs.m_redo.rseg = NULL;
133 
134 	trx->rsegs.m_noredo.rseg = NULL;
135 
136 	trx->read_only = false;
137 
138 	trx->auto_commit = false;
139 
140 	trx->will_lock = false;
141 
142 	trx->ddl = false;
143 
144 	trx->internal = false;
145 
146 	ut_d(trx->start_file = 0);
147 
148 	ut_d(trx->start_line = 0);
149 
150 	trx->magic_n = TRX_MAGIC_N;
151 
152 	trx->lock.que_state = TRX_QUE_RUNNING;
153 
154 	trx->last_sql_stat_start.least_undo_no = 0;
155 
156 	ut_ad(!trx->read_view.is_open());
157 
158 	trx->lock.rec_cached = 0;
159 
160 	trx->lock.table_cached = 0;
161 #ifdef WITH_WSREP
162 	ut_ad(!trx->wsrep);
163 	ut_ad(!trx->wsrep_UK_scan);
164 #endif /* WITH_WSREP */
165 }
166 
167 /** For managing the life-cycle of the trx_t instance that we get
168 from the pool. */
169 struct TrxFactory {
170 
171 	/** Initializes a transaction object. It must be explicitly started
172 	with trx_start_if_not_started() before using it. The default isolation
173 	level is TRX_ISO_REPEATABLE_READ.
174 	@param trx Transaction instance to initialise */
initTrxFactory175 	static void init(trx_t* trx)
176 	{
177 		/* Explicitly call the constructor of the already
178 		allocated object. trx_t objects are allocated by
179 		ut_zalloc_nokey() in Pool::Pool() which would not call
180 		the constructors of the trx_t members. */
181 		new(&trx->mod_tables) trx_mod_tables_t();
182 
183 		new(&trx->lock.table_locks) lock_list();
184 
185 		new(&trx->read_view) ReadView();
186 
187 		trx->rw_trx_hash_pins = 0;
188 		trx_init(trx);
189 
190 		trx->dict_operation_lock_mode = 0;
191 
192 		trx->xid = UT_NEW_NOKEY(xid_t());
193 
194 		trx->detailed_error = reinterpret_cast<char*>(
195 			ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN));
196 
197 		trx->lock.lock_heap = mem_heap_create_typed(
198 			1024, MEM_HEAP_FOR_LOCK_HEAP);
199 
200 		lock_trx_lock_list_init(&trx->lock.trx_locks);
201 
202 		UT_LIST_INIT(trx->lock.evicted_tables,
203 			     &dict_table_t::table_LRU);
204 
205 		UT_LIST_INIT(
206 			trx->trx_savepoints,
207 			&trx_named_savept_t::trx_savepoints);
208 
209 		mutex_create(LATCH_ID_TRX, &trx->mutex);
210 	}
211 
212 	/** Release resources held by the transaction object.
213 	@param trx the transaction for which to release resources */
destroyTrxFactory214 	static void destroy(trx_t* trx)
215 	{
216 #ifdef __SANITIZE_ADDRESS__
217 		/* Unpoison the memory for AddressSanitizer */
218 		MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
219 #elif !__has_feature(memory_sanitizer)
220 		/* In Valgrind, we cannot cancel MEM_NOACCESS() without
221 		changing the state of the V bits (which indicate
222 		which bits are initialized).
223 		We will declare the contents as initialized.
224 		We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
225 		MEM_MAKE_DEFINED(trx, sizeof *trx);
226 #endif
227 
228 		ut_a(trx->magic_n == TRX_MAGIC_N);
229 		ut_ad(!trx->mysql_thd);
230 
231 		ut_a(trx->lock.wait_lock == NULL);
232 		ut_a(trx->lock.wait_thr == NULL);
233 		ut_a(trx->dict_operation_lock_mode == 0);
234 
235 		if (trx->lock.lock_heap != NULL) {
236 			mem_heap_free(trx->lock.lock_heap);
237 			trx->lock.lock_heap = NULL;
238 		}
239 
240 		ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
241 		ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
242 
243 		UT_DELETE(trx->xid);
244 		ut_free(trx->detailed_error);
245 
246 		mutex_free(&trx->mutex);
247 
248 		trx->mod_tables.~trx_mod_tables_t();
249 
250 		ut_ad(!trx->read_view.is_open());
251 
252 		trx->lock.table_locks.~lock_list();
253 
254 		trx->read_view.~ReadView();
255 	}
256 };
257 
258 /** The lock strategy for TrxPool */
259 struct TrxPoolLock {
TrxPoolLockTrxPoolLock260 	TrxPoolLock() { }
261 
262 	/** Create the mutex */
createTrxPoolLock263 	void create()
264 	{
265 		mutex_create(LATCH_ID_TRX_POOL, &m_mutex);
266 	}
267 
268 	/** Acquire the mutex */
enterTrxPoolLock269 	void enter() { mutex_enter(&m_mutex); }
270 
271 	/** Release the mutex */
exitTrxPoolLock272 	void exit() { mutex_exit(&m_mutex); }
273 
274 	/** Free the mutex */
destroyTrxPoolLock275 	void destroy() { mutex_free(&m_mutex); }
276 
277 	/** Mutex to use */
278 	ib_mutex_t	m_mutex;
279 };
280 
281 /** The lock strategy for the TrxPoolManager */
282 struct TrxPoolManagerLock {
TrxPoolManagerLockTrxPoolManagerLock283 	TrxPoolManagerLock() { }
284 
285 	/** Create the mutex */
createTrxPoolManagerLock286 	void create()
287 	{
288 		mutex_create(LATCH_ID_TRX_POOL_MANAGER, &m_mutex);
289 	}
290 
291 	/** Acquire the mutex */
enterTrxPoolManagerLock292 	void enter() { mutex_enter(&m_mutex); }
293 
294 	/** Release the mutex */
exitTrxPoolManagerLock295 	void exit() { mutex_exit(&m_mutex); }
296 
297 	/** Free the mutex */
destroyTrxPoolManagerLock298 	void destroy() { mutex_free(&m_mutex); }
299 
300 	/** Mutex to use */
301 	ib_mutex_t	m_mutex;
302 };
303 
304 /** Use explicit mutexes for the trx_t pool and its manager. */
305 typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t;
306 typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t;
307 
308 /** The trx_t pool manager */
309 static trx_pools_t* trx_pools;
310 
311 /** Size of on trx_t pool in bytes. */
312 static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4;
313 
314 /** Create the trx_t pool */
315 void
trx_pool_init()316 trx_pool_init()
317 {
318 	trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE));
319 
320 	ut_a(trx_pools != 0);
321 }
322 
323 /** Destroy the trx_t pool */
324 void
trx_pool_close()325 trx_pool_close()
326 {
327 	UT_DELETE(trx_pools);
328 
329 	trx_pools = 0;
330 }
331 
332 /** @return an allocated transaction */
trx_create()333 trx_t *trx_create()
334 {
335 	trx_t*	trx = trx_pools->get();
336 
337 #ifdef __SANITIZE_ADDRESS__
338 	/* Unpoison the memory for AddressSanitizer.
339 	It may have been poisoned in trx_t::free().*/
340 	MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
341 #elif !__has_feature(memory_sanitizer)
342 	/* In Valgrind, we cannot cancel MEM_NOACCESS() without
343 	changing the state of the V bits (which indicate
344 	which bits are initialized).
345 	We will declare the contents as initialized.
346 	We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
347 	MEM_MAKE_DEFINED(trx, sizeof *trx);
348 #endif
349 
350 	trx->assert_freed();
351 
352 	mem_heap_t*	heap;
353 	ib_alloc_t*	alloc;
354 
355 	/* We just got trx from pool, it should be non locking */
356 	ut_ad(!trx->will_lock);
357 	ut_ad(!trx->rw_trx_hash_pins);
358 
359 	DBUG_LOG("trx", "Create: " << trx);
360 
361 	heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
362 
363 	alloc = ib_heap_allocator_create(heap);
364 
365 	trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4);
366 
367 	ut_ad(trx->mod_tables.empty());
368 	ut_ad(trx->lock.n_rec_locks == 0);
369 	ut_ad(trx->lock.table_cached == 0);
370 	ut_ad(trx->lock.rec_cached == 0);
371 	ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
372 
373 #ifdef WITH_WSREP
374 	ut_ad(!trx->wsrep_UK_scan);
375 #endif /* WITH_WSREP */
376 
377 	trx_sys.register_trx(trx);
378 
379 	return(trx);
380 }
381 
382 /** Free the memory to trx_pools */
free()383 void trx_t::free()
384 {
385   MEM_CHECK_DEFINED(this, sizeof *this);
386 
387   ut_ad(!n_mysql_tables_in_use);
388   ut_ad(!mysql_log_file_name);
389   ut_ad(!mysql_n_tables_locked);
390   ut_ad(!internal);
391   ut_ad(!will_lock);
392   ut_ad(error_state == DB_SUCCESS);
393   ut_ad(magic_n == TRX_MAGIC_N);
394   ut_ad(!read_only);
395   ut_ad(!lock.wait_lock);
396 
397   dict_operation= TRX_DICT_OP_NONE;
398   trx_sys.deregister_trx(this);
399   assert_freed();
400   trx_sys.rw_trx_hash.put_pins(this);
401 
402   mysql_thd= nullptr;
403 
404   // FIXME: We need to avoid this heap free/alloc for each commit.
405   if (autoinc_locks)
406   {
407     ut_ad(ib_vector_is_empty(autoinc_locks));
408     /* We allocated a dedicated heap for the vector. */
409     ib_vector_free(autoinc_locks);
410     autoinc_locks= NULL;
411   }
412 
413   mod_tables.clear();
414 
415   MEM_NOACCESS(&n_ref, sizeof n_ref);
416   /* do not poison mutex */
417   MEM_NOACCESS(&id, sizeof id);
418   MEM_NOACCESS(&state, sizeof state);
419   MEM_NOACCESS(&is_recovered, sizeof is_recovered);
420 #ifdef WITH_WSREP
421   MEM_NOACCESS(&wsrep, sizeof wsrep);
422 #endif
423   read_view.mem_noaccess();
424   MEM_NOACCESS(&lock, sizeof lock);
425   MEM_NOACCESS(&op_info, sizeof op_info);
426   MEM_NOACCESS(&isolation_level, sizeof isolation_level);
427   MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
428   MEM_NOACCESS(&is_registered, sizeof is_registered);
429   MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
430   MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
431   MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
432   MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later);
433   MEM_NOACCESS(&duplicates, sizeof duplicates);
434   MEM_NOACCESS(&dict_operation, sizeof dict_operation);
435   MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
436   MEM_NOACCESS(&start_time, sizeof start_time);
437   MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
438   MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
439   MEM_NOACCESS(&table_id, sizeof table_id);
440   MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
441   MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
442   MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
443   MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use);
444   MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked);
445   MEM_NOACCESS(&error_state, sizeof error_state);
446   MEM_NOACCESS(&error_info, sizeof error_info);
447   MEM_NOACCESS(&error_key_num, sizeof error_key_num);
448   MEM_NOACCESS(&graph, sizeof graph);
449   MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints);
450   MEM_NOACCESS(&undo_no, sizeof undo_no);
451   MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start);
452   MEM_NOACCESS(&rsegs, sizeof rsegs);
453   MEM_NOACCESS(&roll_limit, sizeof roll_limit);
454   MEM_NOACCESS(&in_rollback, sizeof in_rollback);
455   MEM_NOACCESS(&pages_undone, sizeof pages_undone);
456   MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows);
457   MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks);
458   MEM_NOACCESS(&read_only, sizeof read_only);
459   MEM_NOACCESS(&auto_commit, sizeof auto_commit);
460   MEM_NOACCESS(&will_lock, sizeof will_lock);
461   MEM_NOACCESS(&fts_trx, sizeof fts_trx);
462   MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
463   MEM_NOACCESS(&flush_tables, sizeof flush_tables);
464   MEM_NOACCESS(&ddl, sizeof ddl);
465   MEM_NOACCESS(&internal, sizeof internal);
466 #ifdef UNIV_DEBUG
467   MEM_NOACCESS(&start_line, sizeof start_line);
468   MEM_NOACCESS(&start_file, sizeof start_file);
469 #endif /* UNIV_DEBUG */
470   MEM_NOACCESS(&xid, sizeof xid);
471   MEM_NOACCESS(&mod_tables, sizeof mod_tables);
472   MEM_NOACCESS(&detailed_error, sizeof detailed_error);
473 #ifdef WITH_WSREP
474   ut_ad(!wsrep_UK_scan);
475   MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
476 #endif /* WITH_WSREP */
477   MEM_NOACCESS(&magic_n, sizeof magic_n);
478   trx_pools->mem_free(this);
479 }
480 
481 /** Transition to committed state, to release implicit locks. */
commit_state()482 inline void trx_t::commit_state()
483 {
484   ut_ad(state == TRX_STATE_PREPARED
485 	|| state == TRX_STATE_PREPARED_RECOVERED
486 	|| state == TRX_STATE_ACTIVE);
487   /* This makes the transaction committed in memory and makes its
488   changes to data visible to other transactions. NOTE that there is a
489   small discrepancy from the strict formal visibility rules here: a
490   user of the database can see modifications made by another
491   transaction T even before the necessary redo log segment has been
492   flushed to the disk. If the database happens to crash before the
493   flush, the user has seen modifications from T which will never be a
494   committed transaction. However, any transaction T2 which sees the
495   modifications of the committing transaction T, and which also itself
496   makes modifications to the database, will get an lsn larger than the
497   committing transaction T. In the case where the log flush fails, and
498   T never gets committed, also T2 will never get committed. */
499   trx_mutex_enter(this);
500   state= TRX_STATE_COMMITTED_IN_MEMORY;
501   trx_mutex_exit(this);
502   ut_ad(id || !is_referenced());
503 }
504 
505 /** Release any explicit locks of a committing transaction. */
release_locks()506 inline void trx_t::release_locks()
507 {
508   DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
509   DBUG_ASSERT(!is_referenced());
510 
511   if (UT_LIST_GET_LEN(lock.trx_locks))
512   {
513     lock_release(this);
514     lock.n_rec_locks = 0;
515     ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
516     ut_ad(ib_vector_is_empty(autoinc_locks));
517     mem_heap_empty(lock.lock_heap);
518   }
519 
520   lock.table_locks.clear();
521 }
522 
523 /** At shutdown, frees a transaction object. */
524 void
trx_free_at_shutdown(trx_t * trx)525 trx_free_at_shutdown(trx_t *trx)
526 {
527 	ut_ad(trx->is_recovered);
528 	ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
529 	     || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
530 	     || (trx_state_eq(trx, TRX_STATE_ACTIVE)
531 		 && (!srv_was_started
532 		     || srv_operation == SRV_OPERATION_RESTORE
533 		     || srv_operation == SRV_OPERATION_RESTORE_EXPORT
534 		     || srv_read_only_mode
535 		     || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
536 		     || (!srv_is_being_started
537 		         && !srv_undo_sources && srv_fast_shutdown))));
538 	ut_a(trx->magic_n == TRX_MAGIC_N);
539 
540 	trx->commit_state();
541 	trx->release_locks();
542 	trx_undo_free_at_shutdown(trx);
543 
544 	ut_a(!trx->read_only);
545 
546 	DBUG_LOG("trx", "Free prepared: " << trx);
547 	trx->state = TRX_STATE_NOT_STARTED;
548 	ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
549 	trx->id = 0;
550 	trx->free();
551 }
552 
553 
554 /**
555   Disconnect a prepared transaction from MySQL
556   @param[in,out] trx transaction
557 */
trx_disconnect_prepared(trx_t * trx)558 void trx_disconnect_prepared(trx_t *trx)
559 {
560   ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
561   ut_ad(trx->mysql_thd);
562   ut_ad(!trx->mysql_log_file_name);
563   trx->read_view.close();
564   trx->is_recovered= true;
565   trx->mysql_thd= NULL;
566   /* todo/fixme: suggest to do it at innodb prepare */
567   trx->will_lock= false;
568   trx_sys.rw_trx_hash.put_pins(trx);
569 }
570 
571 /****************************************************************//**
572 Resurrect the table locks for a resurrected transaction. */
573 static
574 void
trx_resurrect_table_locks(trx_t * trx,const trx_undo_t * undo)575 trx_resurrect_table_locks(
576 /*======================*/
577 	trx_t*			trx,	/*!< in/out: transaction */
578 	const trx_undo_t*	undo)	/*!< in: undo log */
579 {
580 	mtr_t			mtr;
581 	table_id_set		tables;
582 
583 	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
584 	      trx_state_eq(trx, TRX_STATE_PREPARED));
585 	ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
586 
587 	if (undo->empty()) {
588 		return;
589 	}
590 
591 	mtr_start(&mtr);
592 
593 	/* trx_rseg_mem_create() may have acquired an X-latch on this
594 	page, so we cannot acquire an S-latch. */
595 	buf_block_t* block = trx_undo_page_get(
596 		page_id_t(trx->rsegs.m_redo.rseg->space->id,
597 			  undo->top_page_no), &mtr);
598 	buf_block_t* undo_block = block;
599 	trx_undo_rec_t* undo_rec = block->frame + undo->top_offset;
600 
601 	do {
602 		ulint		type;
603 		undo_no_t	undo_no;
604 		table_id_t	table_id;
605 		ulint		cmpl_info;
606 		bool		updated_extern;
607 
608 		if (undo_block != block) {
609 			mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX);
610 			undo_block = block;
611 		}
612 
613 		trx_undo_rec_get_pars(
614 			undo_rec, &type, &cmpl_info,
615 			&updated_extern, &undo_no, &table_id);
616 		tables.insert(table_id);
617 
618 		undo_rec = trx_undo_get_prev_rec(
619 			block, page_offset(undo_rec), undo->hdr_page_no,
620 			undo->hdr_offset, false, &mtr);
621 	} while (undo_rec);
622 
623 	mtr_commit(&mtr);
624 
625 	for (table_id_set::const_iterator i = tables.begin();
626 	     i != tables.end(); i++) {
627 		if (dict_table_t* table = dict_table_open_on_id(
628 			    *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
629 			if (!table->is_readable()) {
630 				mutex_enter(&dict_sys.mutex);
631 				dict_table_close(table, TRUE, FALSE);
632 				dict_sys.remove(table);
633 				mutex_exit(&dict_sys.mutex);
634 				continue;
635 			}
636 
637 			if (trx->state == TRX_STATE_PREPARED) {
638 				trx->mod_tables.insert(
639 					trx_mod_tables_t::value_type(table,
640 								     0));
641 			}
642 			lock_table_ix_resurrect(table, trx);
643 
644 			DBUG_LOG("ib_trx",
645 				 "resurrect " << ib::hex(trx->id)
646 				 << " IX lock on " << table->name);
647 
648 			dict_table_close(table, FALSE, FALSE);
649 		}
650 	}
651 }
652 
653 
654 /**
655   Resurrect the transactions that were doing inserts/updates the time of the
656   crash, they need to be undone.
657 */
658 
trx_resurrect(trx_undo_t * undo,trx_rseg_t * rseg,time_t start_time,ulonglong start_time_micro,uint64_t * rows_to_undo)659 static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
660                           time_t start_time, ulonglong start_time_micro,
661                           uint64_t *rows_to_undo)
662 {
663   trx_state_t state;
664   /*
665     This is single-threaded startup code, we do not need the
666     protection of trx->mutex here.
667   */
668   switch (undo->state)
669   {
670   case TRX_UNDO_ACTIVE:
671     state= TRX_STATE_ACTIVE;
672     break;
673   case TRX_UNDO_PREPARED:
674     /*
675       Prepared transactions are left in the prepared state
676       waiting for a commit or abort decision from MySQL
677     */
678     ib::info() << "Transaction " << undo->trx_id
679                << " was in the XA prepared state.";
680 
681     state= TRX_STATE_PREPARED;
682     break;
683   default:
684     return;
685   }
686 
687   trx_t *trx= trx_create();
688   trx->state= state;
689   ut_d(trx->start_file= __FILE__);
690   ut_d(trx->start_line= __LINE__);
691 
692   trx->rsegs.m_redo.undo= undo;
693   trx->undo_no= undo->top_undo_no + 1;
694   trx->rsegs.m_redo.rseg= rseg;
695   /*
696     For transactions with active data will not have rseg size = 1
697     or will not qualify for purge limit criteria. So it is safe to increment
698     this trx_ref_count w/o mutex protection.
699   */
700   ++trx->rsegs.m_redo.rseg->trx_ref_count;
701   *trx->xid= undo->xid;
702   trx->id= undo->trx_id;
703   trx->is_recovered= true;
704   trx->start_time= start_time;
705   trx->start_time_micro= start_time_micro;
706 
707   if (undo->dict_operation)
708   {
709     trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
710     if (!trx->table_id)
711       trx->table_id= undo->table_id;
712   }
713 
714   trx_sys.rw_trx_hash.insert(trx);
715   trx_sys.rw_trx_hash.put_pins(trx);
716   trx_resurrect_table_locks(trx, undo);
717   if (trx_state_eq(trx, TRX_STATE_ACTIVE))
718     *rows_to_undo+= trx->undo_no;
719 }
720 
721 
722 /** Initialize (resurrect) transactions at startup. */
trx_lists_init_at_db_start()723 dberr_t trx_lists_init_at_db_start()
724 {
725 	ut_a(srv_is_being_started);
726 	ut_ad(!srv_was_started);
727 
728 	if (srv_operation == SRV_OPERATION_RESTORE) {
729 		/* mariabackup --prepare only deals with
730 		the redo log and the data files, not with
731 		transactions or the data dictionary. */
732 		return trx_rseg_array_init();
733 	}
734 
735 	if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
736 		return DB_SUCCESS;
737 	}
738 
739 	purge_sys.create();
740 	if (dberr_t err = trx_rseg_array_init()) {
741 		ib::info() << "Retry with innodb_force_recovery=5";
742 		return err;
743 	}
744 
745 	/* Look from the rollback segments if there exist undo logs for
746 	transactions. */
747 	const time_t	start_time	= time(NULL);
748 	const ulonglong	start_time_micro= microsecond_interval_timer();
749 	uint64_t	rows_to_undo	= 0;
750 
751 	for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
752 		trx_undo_t*	undo;
753 		trx_rseg_t*	rseg = trx_sys.rseg_array[i];
754 
755 		/* Some rollback segment may be unavailable,
756 		especially if the server was previously run with a
757 		non-default value of innodb_undo_logs. */
758 		if (rseg == NULL) {
759 			continue;
760 		}
761 		/* Ressurrect other transactions. */
762 		for (undo = UT_LIST_GET_FIRST(rseg->undo_list);
763 		     undo != NULL;
764 		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
765 			trx_t *trx = trx_sys.find(0, undo->trx_id, false);
766 			if (!trx) {
767 				trx_resurrect(undo, rseg, start_time,
768 					      start_time_micro, &rows_to_undo);
769 			} else {
770 				ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
771 				      trx_state_eq(trx, TRX_STATE_PREPARED));
772 				ut_ad(trx->start_time == start_time);
773 				ut_ad(trx->is_recovered);
774 				ut_ad(trx->rsegs.m_redo.rseg == rseg);
775 				ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count);
776 
777 				trx->rsegs.m_redo.undo = undo;
778 				if (undo->top_undo_no >= trx->undo_no) {
779 					if (trx_state_eq(trx,
780 							 TRX_STATE_ACTIVE)) {
781 						rows_to_undo -= trx->undo_no;
782 						rows_to_undo +=
783 							undo->top_undo_no + 1;
784 					}
785 
786 					trx->undo_no = undo->top_undo_no + 1;
787 				}
788 				trx_resurrect_table_locks(trx, undo);
789 			}
790 		}
791 	}
792 
793 	if (const auto size = trx_sys.rw_trx_hash.size()) {
794 		ib::info() << size
795 			<< " transaction(s) which must be rolled back or"
796 			" cleaned up in total " << rows_to_undo
797 			<< " row operations to undo";
798 		ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
799 	}
800 
801 	purge_sys.clone_oldest_view();
802 	return DB_SUCCESS;
803 }
804 
805 /** Assign a persistent rollback segment in a round-robin fashion,
806 evenly distributed between 0 and innodb_undo_logs-1
807 @return	persistent rollback segment
808 @retval	NULL	if innodb_read_only */
trx_assign_rseg_low()809 static trx_rseg_t* trx_assign_rseg_low()
810 {
811 	if (high_level_read_only) {
812 		ut_ad(!srv_available_undo_logs);
813 		return(NULL);
814 	}
815 
816 	ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
817 
818 	/* The first slot is always assigned to the system tablespace. */
819 	ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space);
820 
821 	/* Choose a rollback segment evenly distributed between 0 and
822 	innodb_undo_logs-1 in a round-robin fashion, skipping those
823 	undo tablespaces that are scheduled for truncation. */
824 	static Atomic_counter<unsigned>	rseg_slot;
825 	unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
826 	ut_d(if (trx_rseg_n_slots_debug) slot = 0);
827 	trx_rseg_t*	rseg;
828 
829 #ifdef UNIV_DEBUG
830 	ulint	start_scan_slot = slot;
831 	bool	look_for_rollover = false;
832 #endif /* UNIV_DEBUG */
833 
834 	bool	allocated = false;
835 
836 	do {
837 		for (;;) {
838 			rseg = trx_sys.rseg_array[slot];
839 
840 #ifdef UNIV_DEBUG
841 			/* Ensure that we are not revisiting the same
842 			slot that we have already inspected. */
843 			if (look_for_rollover) {
844 				ut_ad(start_scan_slot != slot);
845 			}
846 			look_for_rollover = true;
847 #endif /* UNIV_DEBUG */
848 
849 			ut_d(if (!trx_rseg_n_slots_debug))
850 			slot = (slot + 1) % TRX_SYS_N_RSEGS;
851 
852 			if (rseg == NULL) {
853 				continue;
854 			}
855 
856 			ut_ad(rseg->is_persistent());
857 
858 			if (rseg->space != fil_system.sys_space) {
859 				if (rseg->skip_allocation
860 				    || !srv_undo_tablespaces) {
861 					continue;
862 				}
863 			} else if (trx_rseg_t* next
864 				   = trx_sys.rseg_array[slot]) {
865 				if (next->space != fil_system.sys_space
866 				    && srv_undo_tablespaces > 0) {
867 					/** If dedicated
868 					innodb_undo_tablespaces have
869 					been configured, try to use them
870 					instead of the system tablespace. */
871 					continue;
872 				}
873 			}
874 
875 			break;
876 		}
877 
878 		/* By now we have only selected the rseg but not marked it
879 		allocated. By marking it allocated we are ensuring that it will
880 		never be selected for UNDO truncate purge. */
881 		mutex_enter(&rseg->mutex);
882 		if (!rseg->skip_allocation) {
883 			rseg->trx_ref_count++;
884 			allocated = true;
885 		}
886 		mutex_exit(&rseg->mutex);
887 	} while (!allocated);
888 
889 	ut_ad(rseg->trx_ref_count > 0);
890 	ut_ad(rseg->is_persistent());
891 	return(rseg);
892 }
893 
894 /** Assign a rollback segment for modifying temporary tables.
895 @return the assigned rollback segment */
assign_temp_rseg()896 trx_rseg_t *trx_t::assign_temp_rseg()
897 {
898 	ut_ad(!rsegs.m_noredo.rseg);
899 	ut_ad(!is_autocommit_non_locking());
900 	compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS));
901 
902 	/* Choose a temporary rollback segment between 0 and 127
903 	in a round-robin fashion. */
904 	static Atomic_counter<unsigned> rseg_slot;
905 	trx_rseg_t*	rseg = trx_sys.temp_rsegs[
906 		rseg_slot++ & (TRX_SYS_N_RSEGS - 1)];
907 	ut_ad(!rseg->is_persistent());
908 	rsegs.m_noredo.rseg = rseg;
909 
910 	if (id == 0) {
911 		trx_sys.register_rw(this);
912 	}
913 
914 	ut_ad(!rseg->is_persistent());
915 	return(rseg);
916 }
917 
918 /****************************************************************//**
919 Starts a transaction. */
920 static
921 void
trx_start_low(trx_t * trx,bool read_write)922 trx_start_low(
923 /*==========*/
924 	trx_t*	trx,		/*!< in: transaction */
925 	bool	read_write)	/*!< in: true if read-write transaction */
926 {
927 	ut_ad(!trx->in_rollback);
928 	ut_ad(!trx->is_recovered);
929 	ut_ad(trx->start_line != 0);
930 	ut_ad(trx->start_file != 0);
931 	ut_ad(trx->roll_limit == 0);
932 	ut_ad(trx->error_state == DB_SUCCESS);
933 	ut_ad(trx->rsegs.m_redo.rseg == NULL);
934 	ut_ad(trx->rsegs.m_noredo.rseg == NULL);
935 	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
936 	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
937 
938 	/* Check whether it is an AUTOCOMMIT SELECT */
939 	trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
940 
941 	trx->read_only = srv_read_only_mode
942 		|| (!trx->ddl && !trx->internal
943 		    && thd_trx_is_read_only(trx->mysql_thd));
944 
945 	if (!trx->auto_commit) {
946 		trx->will_lock = true;
947 	} else if (!trx->will_lock) {
948 		trx->read_only = true;
949 	}
950 
951 #ifdef WITH_WSREP
952 	trx->xid->null();
953 #endif /* WITH_WSREP */
954 
955 	ut_a(ib_vector_is_empty(trx->autoinc_locks));
956 	ut_a(trx->lock.table_locks.empty());
957 
958 	/* No other thread can access this trx object through rw_trx_hash,
959 	still it can be found through trx_sys.trx_list. Sometimes it's
960 	possible to indirectly protect trx_t::state by freezing
961 	trx_sys.trx_list.
962 
963 	For now we update it without mutex protection, because original code
964 	did it this way. It has to be reviewed and fixed properly. */
965 	trx->state = TRX_STATE_ACTIVE;
966 
967 	/* By default all transactions are in the read-only list unless they
968 	are non-locking auto-commit read only transactions or background
969 	(internal) transactions. Note: Transactions marked explicitly as
970 	read only can write to temporary tables, we put those on the RO
971 	list too. */
972 
973 	if (!trx->read_only
974 	    && (trx->mysql_thd == 0 || read_write || trx->ddl)) {
975 
976 		/* Temporary rseg is assigned only if the transaction
977 		updates a temporary table */
978 		trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
979 		ut_ad(trx->rsegs.m_redo.rseg != 0
980 		      || srv_read_only_mode
981 		      || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
982 
983 		trx_sys.register_rw(trx);
984 	} else {
985 		if (!trx->is_autocommit_non_locking()) {
986 
987 			/* If this is a read-only transaction that is writing
988 			to a temporary table then it needs a transaction id
989 			to write to the temporary table. */
990 
991 			if (read_write) {
992 				ut_ad(!srv_read_only_mode);
993 				trx_sys.register_rw(trx);
994 			}
995 		} else {
996 			ut_ad(!read_write);
997 		}
998 	}
999 
1000 	trx->start_time = time(NULL);
1001 	trx->start_time_micro = trx->mysql_thd
1002 		? thd_query_start_micro(trx->mysql_thd)
1003 		: microsecond_interval_timer();
1004 
1005 	ut_a(trx->error_state == DB_SUCCESS);
1006 
1007 	MONITOR_INC(MONITOR_TRX_ACTIVE);
1008 }
1009 
1010 /** Set the serialisation number for a persistent committed transaction.
1011 @param[in,out]	trx	committed transaction with persistent changes */
1012 static
1013 void
trx_serialise(trx_t * trx)1014 trx_serialise(trx_t* trx)
1015 {
1016 	trx_rseg_t *rseg = trx->rsegs.m_redo.rseg;
1017 	ut_ad(rseg);
1018 	ut_ad(mutex_own(&rseg->mutex));
1019 
1020 	if (rseg->last_page_no == FIL_NULL) {
1021 		mutex_enter(&purge_sys.pq_mutex);
1022 	}
1023 
1024 	trx_sys.assign_new_trx_no(trx);
1025 
1026 	/* If the rollback segment is not empty then the
1027 	new trx_t::no can't be less than any trx_t::no
1028 	already in the rollback segment. User threads only
1029 	produce events when a rollback segment is empty. */
1030 	if (rseg->last_page_no == FIL_NULL) {
1031 		purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
1032 							*rseg));
1033 		mutex_exit(&purge_sys.pq_mutex);
1034 	}
1035 }
1036 
1037 /****************************************************************//**
1038 Assign the transaction its history serialisation number and write the
1039 update UNDO log record to the assigned rollback segment. */
1040 static
1041 void
trx_write_serialisation_history(trx_t * trx,mtr_t * mtr)1042 trx_write_serialisation_history(
1043 /*============================*/
1044 	trx_t*		trx,	/*!< in/out: transaction */
1045 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
1046 {
1047 	/* Change the undo log segment states from TRX_UNDO_ACTIVE to some
1048 	other state: these modifications to the file data structure define
1049 	the transaction as committed in the file based domain, at the
1050 	serialization point of the log sequence number lsn obtained below. */
1051 
1052 	/* We have to hold the rseg mutex because update log headers have
1053 	to be put to the history list in the (serialisation) order of the
1054 	UNDO trx number. This is required for the purge in-memory data
1055 	structures too. */
1056 
1057 	if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
1058 		/* Undo log for temporary tables is discarded at transaction
1059 		commit. There is no purge for temporary tables, and also no
1060 		MVCC, because they are private to a session. */
1061 
1062 		mtr_t	temp_mtr;
1063 		temp_mtr.start();
1064 		temp_mtr.set_log_mode(MTR_LOG_NO_REDO);
1065 
1066 		mutex_enter(&trx->rsegs.m_noredo.rseg->mutex);
1067 		trx_undo_set_state_at_finish(undo, &temp_mtr);
1068 		mutex_exit(&trx->rsegs.m_noredo.rseg->mutex);
1069 		temp_mtr.commit();
1070 	}
1071 
1072 	trx_rseg_t*	rseg = trx->rsegs.m_redo.rseg;
1073 	if (!rseg) {
1074 		ut_ad(!trx->rsegs.m_redo.undo);
1075 		return;
1076 	}
1077 
1078 	trx_undo_t*& undo = trx->rsegs.m_redo.undo;
1079 
1080 	if (!undo) {
1081 		return;
1082 	}
1083 
1084 	ut_ad(!trx->read_only);
1085 	ut_ad(!undo || undo->rseg == rseg);
1086 	mutex_enter(&rseg->mutex);
1087 
1088 	/* Assign the transaction serialisation number and add any
1089 	undo log to the purge queue. */
1090 	trx_serialise(trx);
1091 	if (undo) {
1092 		UT_LIST_REMOVE(rseg->undo_list, undo);
1093 		trx_purge_add_undo_to_history(trx, undo, mtr);
1094 	}
1095 
1096 	mutex_exit(&rseg->mutex);
1097 
1098 	MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
1099 }
1100 
1101 /********************************************************************
1102 Finalize a transaction containing updates for a FTS table. */
1103 static
1104 void
trx_finalize_for_fts_table(fts_trx_table_t * ftt)1105 trx_finalize_for_fts_table(
1106 /*=======================*/
1107 	fts_trx_table_t*	ftt)	    /* in: FTS trx table */
1108 {
1109 	fts_t*		  fts = ftt->table->fts;
1110 	fts_doc_ids_t*	  doc_ids = ftt->added_doc_ids;
1111 
1112 	ut_a(fts->add_wq);
1113 
1114 	mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
1115 
1116 	ib_wqueue_add(fts->add_wq, doc_ids, heap);
1117 
1118 	/* fts_trx_table_t no longer owns the list. */
1119 	ftt->added_doc_ids = NULL;
1120 }
1121 
1122 /******************************************************************//**
1123 Finalize a transaction containing updates to FTS tables. */
1124 static
1125 void
trx_finalize_for_fts(trx_t * trx,bool is_commit)1126 trx_finalize_for_fts(
1127 /*=================*/
1128 	trx_t*	trx,		/*!< in/out: transaction */
1129 	bool	is_commit)	/*!< in: true if the transaction was
1130 				committed, false if it was rolled back. */
1131 {
1132 	if (is_commit) {
1133 		const ib_rbt_node_t*	node;
1134 		ib_rbt_t*		tables;
1135 		fts_savepoint_t*	savepoint;
1136 
1137 		savepoint = static_cast<fts_savepoint_t*>(
1138 			ib_vector_last(trx->fts_trx->savepoints));
1139 
1140 		tables = savepoint->tables;
1141 
1142 		for (node = rbt_first(tables);
1143 		     node;
1144 		     node = rbt_next(tables, node)) {
1145 			fts_trx_table_t**	ftt;
1146 
1147 			ftt = rbt_value(fts_trx_table_t*, node);
1148 
1149 			if ((*ftt)->added_doc_ids) {
1150 				trx_finalize_for_fts_table(*ftt);
1151 			}
1152 		}
1153 	}
1154 
1155 	fts_trx_free(trx->fts_trx);
1156 	trx->fts_trx = NULL;
1157 }
1158 
1159 /**********************************************************************//**
1160 If required, flushes the log to disk based on the value of
1161 innodb_flush_log_at_trx_commit. */
1162 static
1163 void
trx_flush_log_if_needed_low(lsn_t lsn)1164 trx_flush_log_if_needed_low(
1165 /*========================*/
1166 	lsn_t	lsn)	/*!< in: lsn up to which logs are to be
1167 			flushed. */
1168 {
1169 	bool	flush = srv_file_flush_method != SRV_NOSYNC;
1170 
1171 	switch (srv_flush_log_at_trx_commit) {
1172 	case 2:
1173 		/* Write the log but do not flush it to disk */
1174 		flush = false;
1175 		/* fall through */
1176 	case 1:
1177 	case 3:
1178 		/* Write the log and optionally flush it to disk */
1179 		log_write_up_to(lsn, flush);
1180 		srv_inc_activity_count();
1181 		return;
1182 	case 0:
1183 		/* Do nothing */
1184 		return;
1185 	}
1186 
1187 	ut_error;
1188 }
1189 
1190 /**********************************************************************//**
1191 If required, flushes the log to disk based on the value of
1192 innodb_flush_log_at_trx_commit. */
1193 static
1194 void
trx_flush_log_if_needed(lsn_t lsn,trx_t * trx)1195 trx_flush_log_if_needed(
1196 /*====================*/
1197 	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
1198 			flushed. */
1199 	trx_t*	trx)	/*!< in/out: transaction */
1200 {
1201 	trx->op_info = "flushing log";
1202 	trx_flush_log_if_needed_low(lsn);
1203 	trx->op_info = "";
1204 }
1205 
1206 /**********************************************************************//**
1207 For each table that has been modified by the given transaction: update
1208 its dict_table_t::update_time with the current timestamp. Clear the list
1209 of the modified tables at the end. */
1210 static
1211 void
trx_update_mod_tables_timestamp(trx_t * trx)1212 trx_update_mod_tables_timestamp(
1213 /*============================*/
1214 	trx_t*	trx)	/*!< in: transaction */
1215 {
1216 	/* consider using trx->start_time if calling time() is too
1217 	expensive here */
1218 	const time_t now = time(NULL);
1219 
1220 	trx_mod_tables_t::const_iterator	end = trx->mod_tables.end();
1221 
1222 	for (trx_mod_tables_t::const_iterator it = trx->mod_tables.begin();
1223 	     it != end;
1224 	     ++it) {
1225 
1226 		/* This could be executed by multiple threads concurrently
1227 		on the same table object. This is fine because time_t is
1228 		word size or less. And _purely_ _theoretically_, even if
1229 		time_t write is not atomic, likely the value of 'now' is
1230 		the same in all threads and even if it is not, getting a
1231 		"garbage" in table->update_time is justified because
1232 		protecting it with a latch here would be too performance
1233 		intrusive. */
1234 		dict_table_t* table = it->first;
1235 		table->update_time = now;
1236 	}
1237 
1238 	trx->mod_tables.clear();
1239 }
1240 
1241 /** Evict a table definition due to the rollback of ALTER TABLE.
1242 @param[in]	table_id	table identifier */
evict_table(table_id_t table_id)1243 void trx_t::evict_table(table_id_t table_id)
1244 {
1245 	ut_ad(in_rollback);
1246 
1247 	dict_table_t* table = dict_table_open_on_id(
1248 		table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
1249 	if (!table) {
1250 		return;
1251 	}
1252 
1253 	if (!table->release()) {
1254 		/* This must be a DDL operation that is being rolled
1255 		back in an active connection. */
1256 		ut_a(table->get_ref_count() == 1);
1257 		ut_ad(!is_recovered);
1258 		ut_ad(mysql_thd);
1259 		return;
1260 	}
1261 
1262 	/* This table should only be locked by this transaction, if at all. */
1263 	ut_ad(UT_LIST_GET_LEN(table->locks) <= 1);
1264 	const bool locked = UT_LIST_GET_LEN(table->locks);
1265 	ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this);
1266 	dict_sys.remove(table, true, locked);
1267 	if (locked) {
1268 		UT_LIST_ADD_FIRST(lock.evicted_tables, table);
1269 	}
1270 }
1271 
1272 /** Mark a transaction committed in the main memory data structures. */
commit_in_memory(const mtr_t * mtr)1273 inline void trx_t::commit_in_memory(const mtr_t *mtr)
1274 {
1275   must_flush_log_later= false;
1276   read_view.close();
1277 
1278   if (is_autocommit_non_locking())
1279   {
1280     ut_ad(id == 0);
1281     ut_ad(read_only);
1282     ut_ad(!will_lock);
1283     ut_a(!is_recovered);
1284     ut_ad(!rsegs.m_redo.rseg);
1285     ut_ad(mysql_thd);
1286     ut_ad(state == TRX_STATE_ACTIVE);
1287 
1288     /* Note: We are asserting without holding the lock mutex. But
1289     that is OK because this transaction is not waiting and cannot
1290     be rolled back and no new locks can (or should) be added
1291     because it is flagged as a non-locking read-only transaction. */
1292     ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);
1293 
1294     /* This state change is not protected by any mutex, therefore
1295     there is an inherent race here around state transition during
1296     printouts. We ignore this race for the sake of efficiency.
1297     However, the freezing of trx_sys.trx_list will protect the trx_t
1298     instance and it cannot be removed from the trx_list and freed
1299     without first unfreezing trx_list. */
1300     state= TRX_STATE_NOT_STARTED;
1301 
1302     MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
1303 
1304     DBUG_LOG("trx", "Autocommit in memory: " << this);
1305   }
1306   else
1307   {
1308 #ifdef UNIV_DEBUG
1309     if (!UT_LIST_GET_LEN(lock.trx_locks))
1310       for (auto l : lock.table_locks)
1311         ut_ad(!l);
1312 #endif /* UNIV_DEBUG */
1313     commit_state();
1314 
1315     if (id)
1316     {
1317       trx_sys.deregister_rw(this);
1318 
1319       /* Wait for any implicit-to-explicit lock conversions to cease,
1320       so that there will be no race condition in lock_release(). */
1321       while (UNIV_UNLIKELY(is_referenced()))
1322         ut_delay(srv_spin_wait_delay);
1323     }
1324     else
1325       ut_ad(read_only || !rsegs.m_redo.rseg);
1326 
1327     if (read_only || !rsegs.m_redo.rseg)
1328     {
1329       MONITOR_INC(MONITOR_TRX_RO_COMMIT);
1330     }
1331     else
1332     {
1333       trx_update_mod_tables_timestamp(this);
1334       MONITOR_INC(MONITOR_TRX_RW_COMMIT);
1335       is_recovered= false;
1336     }
1337 
1338     release_locks();
1339     id= 0;
1340     DEBUG_SYNC_C("after_trx_committed_in_memory");
1341 
1342     while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
1343     {
1344       UT_LIST_REMOVE(lock.evicted_tables, table);
1345       dict_mem_table_free(table);
1346     }
1347   }
1348 
1349   ut_ad(!rsegs.m_redo.undo);
1350   ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
1351 
1352   if (mtr)
1353   {
1354     if (trx_undo_t *&undo= rsegs.m_noredo.undo)
1355     {
1356       ut_ad(undo->rseg == rsegs.m_noredo.rseg);
1357       trx_undo_commit_cleanup(undo);
1358       undo= nullptr;
1359     }
1360 
1361     /* NOTE that we could possibly make a group commit more efficient
1362     here: call os_thread_yield here to allow also other trxs to come
1363     to commit! */
1364 
1365     /*-------------------------------------*/
1366 
1367     /* Depending on the my.cnf options, we may now write the log
1368     buffer to the log files, making the transaction durable if the OS
1369     does not crash. We may also flush the log files to disk, making
1370     the transaction durable also at an OS crash or a power outage.
1371 
1372     The idea in InnoDB's group commit is that a group of transactions
1373     gather behind a trx doing a physical disk write to log files, and
1374     when that physical write has been completed, one of those
1375     transactions does a write which commits the whole group. Note that
1376     this group commit will only bring benefit if there are > 2 users
1377     in the database. Then at least 2 users can gather behind one doing
1378     the physical log write to disk.
1379 
1380     If we are calling trx_t::commit() under prepare_commit_mutex, we
1381     will delay possible log write and flush to a separate function
1382     trx_commit_complete_for_mysql(), which is only called when the
1383     thread has released the mutex. This is to make the group commit
1384     algorithm to work. Otherwise, the prepare_commit mutex would
1385     serialize all commits and prevent a group of transactions from
1386     gathering. */
1387 
1388     commit_lsn= mtr->commit_lsn();
1389     if (!commit_lsn)
1390       /* Nothing to be done. */;
1391     else if (flush_log_later)
1392       /* Do nothing yet */
1393       must_flush_log_later= true;
1394     else if (srv_flush_log_at_trx_commit)
1395       trx_flush_log_if_needed(commit_lsn, this);
1396   }
1397 
1398   ut_ad(!rsegs.m_noredo.undo);
1399 
1400   /* Only after trx_undo_commit_cleanup() it is safe to release
1401   our rseg reference. */
1402   if (trx_rseg_t *rseg= rsegs.m_redo.rseg)
1403   {
1404     mutex_enter(&rseg->mutex);
1405     ut_ad(rseg->trx_ref_count > 0);
1406     --rseg->trx_ref_count;
1407     mutex_exit(&rseg->mutex);
1408   }
1409 
1410   /* Free all savepoints, starting from the first. */
1411   trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints);
1412 
1413   trx_roll_savepoints_free(this, savep);
1414 
1415   if (fts_trx)
1416     trx_finalize_for_fts(this, undo_no != 0);
1417 
1418 #ifdef WITH_WSREP
1419   /* Serialization history has been written and the transaction is
1420   committed in memory, which makes this commit ordered. Release commit
1421   order critical section. */
1422   if (wsrep)
1423   {
1424     wsrep= false;
1425     wsrep_commit_ordered(mysql_thd);
1426   }
1427   lock.was_chosen_as_wsrep_victim= false;
1428 #endif /* WITH_WSREP */
1429   trx_mutex_enter(this);
1430   dict_operation= TRX_DICT_OP_NONE;
1431 
1432   DBUG_LOG("trx", "Commit in memory: " << this);
1433   state= TRX_STATE_NOT_STARTED;
1434 
1435   assert_freed();
1436   trx_init(this);
1437   trx_mutex_exit(this);
1438 
1439   ut_a(error_state == DB_SUCCESS);
1440   if (!srv_read_only_mode)
1441     srv_wake_purge_thread_if_not_active();
1442 }
1443 
1444 /** Commit the transaction in a mini-transaction.
1445 @param mtr  mini-transaction (if there are any persistent modifications) */
commit_low(mtr_t * mtr)1446 void trx_t::commit_low(mtr_t *mtr)
1447 {
1448   ut_ad(!mtr || mtr->is_active());
1449   ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK);
1450   ut_ad(!mtr == (aborted || !has_logged()));
1451   ut_ad(!mtr || !aborted);
1452 
1453   /* undo_no is non-zero if we're doing the final commit. */
1454   if (fts_trx && undo_no)
1455   {
1456     ut_a(!is_autocommit_non_locking());
1457     /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of
1458     dying. This is a possible scenario if there is a crash between
1459     insert to DELETED table committing and transaction committing. The
1460     fix would be able to return error from this function */
1461     if (dberr_t error= fts_commit(this))
1462       ut_a(error == DB_DUPLICATE_KEY);
1463   }
1464 
1465 #ifndef DBUG_OFF
1466   const bool debug_sync= mysql_thd && has_logged_persistent();
1467 #endif
1468 
1469   if (mtr)
1470   {
1471     trx_write_serialisation_history(this, mtr);
1472 
1473     /* The following call commits the mini-transaction, making the
1474     whole transaction committed in the file-based world, at this log
1475     sequence number. The transaction becomes 'durable' when we write
1476     the log to disk, but in the logical sense the commit in the
1477     file-based data structures (undo logs etc.) happens here.
1478 
1479     NOTE that transaction numbers, which are assigned only to
1480     transactions with an update undo log, do not necessarily come in
1481     exactly the same order as commit lsn's, if the transactions have
1482     different rollback segments. To get exactly the same order we
1483     should hold the kernel mutex up to this point, adding to the
1484     contention of the kernel mutex. However, if a transaction T2 is
1485     able to see modifications made by a transaction T1, T2 will always
1486     get a bigger transaction number and a bigger commit lsn than T1. */
1487 
1488     mtr->commit();
1489   }
1490 #ifndef DBUG_OFF
1491   if (debug_sync)
1492     DEBUG_SYNC_C("before_trx_state_committed_in_memory");
1493 #endif
1494 
1495   commit_in_memory(mtr);
1496 }
1497 
1498 
commit()1499 void trx_t::commit()
1500 {
1501   mtr_t *mtr= nullptr;
1502   mtr_t local_mtr;
1503 
1504   if (has_logged())
1505   {
1506     mtr= &local_mtr;
1507     local_mtr.start();
1508   }
1509   commit_low(mtr);
1510 }
1511 
1512 /****************************************************************//**
1513 Prepares a transaction for commit/rollback. */
1514 void
trx_commit_or_rollback_prepare(trx_t * trx)1515 trx_commit_or_rollback_prepare(
1516 /*===========================*/
1517 	trx_t*	trx)		/*!< in/out: transaction */
1518 {
1519 	/* We are reading trx->state without holding trx->mutex
1520 	here, because the commit or rollback should be invoked for a
1521 	running (or recovered prepared) transaction that is associated
1522 	with the current thread. */
1523 
1524 	switch (trx->state) {
1525 	case TRX_STATE_NOT_STARTED:
1526 		trx_start_low(trx, true);
1527 		/* fall through */
1528 
1529 	case TRX_STATE_ACTIVE:
1530 	case TRX_STATE_PREPARED:
1531 	case TRX_STATE_PREPARED_RECOVERED:
1532 		/* If the trx is in a lock wait state, moves the waiting
1533 		query thread to the suspended state */
1534 
1535 		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
1536 
1537 			ut_a(trx->lock.wait_thr != NULL);
1538 			trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
1539 			trx->lock.wait_thr = NULL;
1540 
1541 			trx->lock.que_state = TRX_QUE_RUNNING;
1542 		}
1543 
1544 		ut_ad(trx->lock.n_active_thrs == 1);
1545 		return;
1546 
1547 	case TRX_STATE_COMMITTED_IN_MEMORY:
1548 		break;
1549 	}
1550 
1551 	ut_error;
1552 }
1553 
1554 /*********************************************************************//**
1555 Creates a commit command node struct.
1556 @return own: commit node struct */
1557 commit_node_t*
trx_commit_node_create(mem_heap_t * heap)1558 trx_commit_node_create(
1559 /*===================*/
1560 	mem_heap_t*	heap)	/*!< in: mem heap where created */
1561 {
1562 	commit_node_t*	node;
1563 
1564 	node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
1565 	node->common.type  = QUE_NODE_COMMIT;
1566 	node->state = COMMIT_NODE_SEND;
1567 
1568 	return(node);
1569 }
1570 
1571 /***********************************************************//**
1572 Performs an execution step for a commit type node in a query graph.
1573 @return query thread to run next, or NULL */
1574 que_thr_t*
trx_commit_step(que_thr_t * thr)1575 trx_commit_step(
1576 /*============*/
1577 	que_thr_t*	thr)	/*!< in: query thread */
1578 {
1579 	commit_node_t*	node;
1580 
1581 	node = static_cast<commit_node_t*>(thr->run_node);
1582 
1583 	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
1584 
1585 	if (thr->prev_node == que_node_get_parent(node)) {
1586 		node->state = COMMIT_NODE_SEND;
1587 	}
1588 
1589 	if (node->state == COMMIT_NODE_SEND) {
1590 		trx_t*	trx;
1591 
1592 		node->state = COMMIT_NODE_WAIT;
1593 
1594 		trx = thr_get_trx(thr);
1595 
1596 		ut_a(trx->lock.wait_thr == NULL);
1597 		ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
1598 
1599 		trx_commit_or_rollback_prepare(trx);
1600 
1601 		trx->lock.que_state = TRX_QUE_COMMITTING;
1602 		trx->commit();
1603 		ut_ad(trx->lock.wait_thr == NULL);
1604 		trx->lock.que_state = TRX_QUE_RUNNING;
1605 
1606 		thr = NULL;
1607 	} else {
1608 		ut_ad(node->state == COMMIT_NODE_WAIT);
1609 
1610 		node->state = COMMIT_NODE_SEND;
1611 
1612 		thr->run_node = que_node_get_parent(node);
1613 	}
1614 
1615 	return(thr);
1616 }
1617 
1618 /**********************************************************************//**
1619 Does the transaction commit for MySQL.
1620 @return DB_SUCCESS or error number */
1621 dberr_t
trx_commit_for_mysql(trx_t * trx)1622 trx_commit_for_mysql(
1623 /*=================*/
1624 	trx_t*	trx)	/*!< in/out: transaction */
1625 {
1626 	/* Because we do not do the commit by sending an Innobase
1627 	sig to the transaction, we must here make sure that trx has been
1628 	started. */
1629 
1630 	switch (trx->state) {
1631 	case TRX_STATE_NOT_STARTED:
1632 		ut_d(trx->start_file = __FILE__);
1633 		ut_d(trx->start_line = __LINE__);
1634 
1635 		trx_start_low(trx, true);
1636 		/* fall through */
1637 	case TRX_STATE_ACTIVE:
1638 	case TRX_STATE_PREPARED:
1639 	case TRX_STATE_PREPARED_RECOVERED:
1640 		trx->op_info = "committing";
1641 		trx->commit();
1642 		MONITOR_DEC(MONITOR_TRX_ACTIVE);
1643 		trx->op_info = "";
1644 		return(DB_SUCCESS);
1645 	case TRX_STATE_COMMITTED_IN_MEMORY:
1646 		break;
1647 	}
1648 	ut_error;
1649 	return(DB_CORRUPTION);
1650 }
1651 
1652 /**********************************************************************//**
1653 If required, flushes the log to disk if we called trx_commit_for_mysql()
1654 with trx->flush_log_later == TRUE. */
1655 void
trx_commit_complete_for_mysql(trx_t * trx)1656 trx_commit_complete_for_mysql(
1657 /*==========================*/
1658 	trx_t*	trx)	/*!< in/out: transaction */
1659 {
1660 	if (trx->id != 0
1661 	    || !trx->must_flush_log_later
1662 	    || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) {
1663 
1664 		return;
1665 	}
1666 
1667 	trx_flush_log_if_needed(trx->commit_lsn, trx);
1668 
1669 	trx->must_flush_log_later = false;
1670 }
1671 
1672 /**********************************************************************//**
1673 Marks the latest SQL statement ended. */
1674 void
trx_mark_sql_stat_end(trx_t * trx)1675 trx_mark_sql_stat_end(
1676 /*==================*/
1677 	trx_t*	trx)	/*!< in: trx handle */
1678 {
1679 	ut_a(trx);
1680 
1681 	switch (trx->state) {
1682 	case TRX_STATE_PREPARED:
1683 	case TRX_STATE_PREPARED_RECOVERED:
1684 	case TRX_STATE_COMMITTED_IN_MEMORY:
1685 		break;
1686 	case TRX_STATE_NOT_STARTED:
1687 		trx->undo_no = 0;
1688 		/* fall through */
1689 	case TRX_STATE_ACTIVE:
1690 		trx->last_sql_stat_start.least_undo_no = trx->undo_no;
1691 
1692 		if (trx->fts_trx != NULL) {
1693 			fts_savepoint_laststmt_refresh(trx);
1694 		}
1695 
1696 		return;
1697 	}
1698 
1699 	ut_error;
1700 }
1701 
1702 /**********************************************************************//**
1703 Prints info about a transaction. */
1704 void
trx_print_low(FILE * f,const trx_t * trx,ulint max_query_len,ulint n_rec_locks,ulint n_trx_locks,ulint heap_size)1705 trx_print_low(
1706 /*==========*/
1707 	FILE*		f,
1708 			/*!< in: output stream */
1709 	const trx_t*	trx,
1710 			/*!< in: transaction */
1711 	ulint		max_query_len,
1712 			/*!< in: max query length to print,
1713 			or 0 to use the default max length */
1714 	ulint		n_rec_locks,
1715 			/*!< in: lock_number_of_rows_locked(&trx->lock) */
1716 	ulint		n_trx_locks,
1717 			/*!< in: length of trx->lock.trx_locks */
1718 	ulint		heap_size)
1719 			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
1720 {
1721 	ibool		newline;
1722 
1723 	fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx));
1724 
1725 	switch (trx->state) {
1726 	case TRX_STATE_NOT_STARTED:
1727 		fputs(", not started", f);
1728 		goto state_ok;
1729 	case TRX_STATE_ACTIVE:
1730 		fprintf(f, ", ACTIVE %lu sec",
1731 			(ulong) difftime(time(NULL), trx->start_time));
1732 		goto state_ok;
1733 	case TRX_STATE_PREPARED:
1734 	case TRX_STATE_PREPARED_RECOVERED:
1735 		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
1736 			(ulong) difftime(time(NULL), trx->start_time));
1737 		goto state_ok;
1738 	case TRX_STATE_COMMITTED_IN_MEMORY:
1739 		fputs(", COMMITTED IN MEMORY", f);
1740 		goto state_ok;
1741 	}
1742 	fprintf(f, ", state %lu", (ulong) trx->state);
1743 	ut_ad(0);
1744 state_ok:
1745 	const char* op_info = trx->op_info;
1746 
1747 	if (*op_info) {
1748 		putc(' ', f);
1749 		fputs(op_info, f);
1750 	}
1751 
1752 	if (trx->is_recovered) {
1753 		fputs(" recovered trx", f);
1754 	}
1755 
1756 	putc('\n', f);
1757 
1758 	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
1759 		fprintf(f, "mysql tables in use %lu, locked %lu\n",
1760 			(ulong) trx->n_mysql_tables_in_use,
1761 			(ulong) trx->mysql_n_tables_locked);
1762 	}
1763 
1764 	newline = TRUE;
1765 
1766 	/* trx->lock.que_state of an ACTIVE transaction may change
1767 	while we are not holding trx->mutex. We perform a dirty read
1768 	for performance reasons. */
1769 
1770 	switch (trx->lock.que_state) {
1771 	case TRX_QUE_RUNNING:
1772 		newline = FALSE; break;
1773 	case TRX_QUE_LOCK_WAIT:
1774 		fputs("LOCK WAIT ", f); break;
1775 	case TRX_QUE_ROLLING_BACK:
1776 		fputs("ROLLING BACK ", f); break;
1777 	case TRX_QUE_COMMITTING:
1778 		fputs("COMMITTING ", f); break;
1779 	default:
1780 		fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
1781 	}
1782 
1783 	if (n_trx_locks > 0 || heap_size > 400) {
1784 		newline = TRUE;
1785 
1786 		fprintf(f, "%lu lock struct(s), heap size %lu,"
1787 			" %lu row lock(s)",
1788 			(ulong) n_trx_locks,
1789 			(ulong) heap_size,
1790 			(ulong) n_rec_locks);
1791 	}
1792 
1793 	if (trx->undo_no != 0) {
1794 		newline = TRUE;
1795 		fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
1796 	}
1797 
1798 	if (newline) {
1799 		putc('\n', f);
1800 	}
1801 
1802 	if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) {
1803 		innobase_mysql_print_thd(
1804 			f, trx->mysql_thd, static_cast<uint>(max_query_len));
1805 	}
1806 }
1807 
1808 /**********************************************************************//**
1809 Prints info about a transaction.
1810 The caller must hold lock_sys.mutex.
1811 When possible, use trx_print() instead. */
1812 void
trx_print_latched(FILE * f,const trx_t * trx,ulint max_query_len)1813 trx_print_latched(
1814 /*==============*/
1815 	FILE*		f,		/*!< in: output stream */
1816 	const trx_t*	trx,		/*!< in: transaction */
1817 	ulint		max_query_len)	/*!< in: max query length to print,
1818 					or 0 to use the default max length */
1819 {
1820 	ut_ad(lock_mutex_own());
1821 
1822 	trx_print_low(f, trx, max_query_len,
1823 		      lock_number_of_rows_locked(&trx->lock),
1824 		      UT_LIST_GET_LEN(trx->lock.trx_locks),
1825 		      mem_heap_get_size(trx->lock.lock_heap));
1826 }
1827 
1828 /**********************************************************************//**
1829 Prints info about a transaction.
1830 Acquires and releases lock_sys.mutex. */
1831 void
trx_print(FILE * f,const trx_t * trx,ulint max_query_len)1832 trx_print(
1833 /*======*/
1834 	FILE*		f,		/*!< in: output stream */
1835 	const trx_t*	trx,		/*!< in: transaction */
1836 	ulint		max_query_len)	/*!< in: max query length to print,
1837 					or 0 to use the default max length */
1838 {
1839 	ulint	n_rec_locks;
1840 	ulint	n_trx_locks;
1841 	ulint	heap_size;
1842 
1843 	lock_mutex_enter();
1844 	n_rec_locks = lock_number_of_rows_locked(&trx->lock);
1845 	n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
1846 	heap_size = mem_heap_get_size(trx->lock.lock_heap);
1847 	lock_mutex_exit();
1848 
1849 	trx_print_low(f, trx, max_query_len,
1850 		      n_rec_locks, n_trx_locks, heap_size);
1851 }
1852 
1853 /*******************************************************************//**
1854 Compares the "weight" (or size) of two transactions. Transactions that
1855 have edited non-transactional tables are considered heavier than ones
1856 that have not.
1857 @return TRUE if weight(a) >= weight(b) */
1858 bool
trx_weight_ge(const trx_t * a,const trx_t * b)1859 trx_weight_ge(
1860 /*==========*/
1861 	const trx_t*	a,	/*!< in: transaction to be compared */
1862 	const trx_t*	b)	/*!< in: transaction to be compared */
1863 {
1864 	ibool	a_notrans_edit;
1865 	ibool	b_notrans_edit;
1866 
1867 	/* If mysql_thd is NULL for a transaction we assume that it has
1868 	not edited non-transactional tables. */
1869 
1870 	a_notrans_edit = a->mysql_thd != NULL
1871 		&& thd_has_edited_nontrans_tables(a->mysql_thd);
1872 
1873 	b_notrans_edit = b->mysql_thd != NULL
1874 		&& thd_has_edited_nontrans_tables(b->mysql_thd);
1875 
1876 	if (a_notrans_edit != b_notrans_edit) {
1877 
1878 		return(a_notrans_edit);
1879 	}
1880 
1881 	/* Either both had edited non-transactional tables or both had
1882 	not, we fall back to comparing the number of altered/locked
1883 	rows. */
1884 
1885 	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
1886 }
1887 
1888 /** Prepare a transaction.
1889 @return	log sequence number that makes the XA PREPARE durable
1890 @retval	0	if no changes needed to be made durable */
trx_prepare_low(trx_t * trx)1891 static lsn_t trx_prepare_low(trx_t *trx)
1892 {
1893 	ut_ad(!trx->is_recovered);
1894 
1895 	mtr_t	mtr;
1896 
1897 	if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
1898 		ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
1899 
1900 		mtr.start();
1901 		mtr.set_log_mode(MTR_LOG_NO_REDO);
1902 
1903 		mutex_enter(&undo->rseg->mutex);
1904 		trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
1905 		mutex_exit(&undo->rseg->mutex);
1906 
1907 		mtr.commit();
1908 	}
1909 
1910 	trx_undo_t* undo = trx->rsegs.m_redo.undo;
1911 
1912 	if (!undo) {
1913 		/* There were no changes to persistent tables. */
1914 		return(0);
1915 	}
1916 
1917 	trx_rseg_t*	rseg = trx->rsegs.m_redo.rseg;
1918 	ut_ad(undo->rseg == rseg);
1919 
1920 	mtr.start();
1921 
1922 	/* Change the undo log segment states from TRX_UNDO_ACTIVE to
1923 	TRX_UNDO_PREPARED: these modifications to the file data
1924 	structure define the transaction as prepared in the file-based
1925 	world, at the serialization point of lsn. */
1926 
1927 	mutex_enter(&rseg->mutex);
1928 	trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
1929 	mutex_exit(&rseg->mutex);
1930 
1931 	/* Make the XA PREPARE durable. */
1932 	mtr.commit();
1933 	ut_ad(mtr.commit_lsn() > 0);
1934 	return(mtr.commit_lsn());
1935 }
1936 
1937 /****************************************************************//**
1938 Prepares a transaction. */
1939 static
1940 void
trx_prepare(trx_t * trx)1941 trx_prepare(
1942 /*========*/
1943 	trx_t*	trx)	/*!< in/out: transaction */
1944 {
1945 	/* Only fresh user transactions can be prepared.
1946 	Recovered transactions cannot. */
1947 	ut_a(!trx->is_recovered);
1948 
1949 	lsn_t	lsn = trx_prepare_low(trx);
1950 
1951 	DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
1952 
1953 	ut_a(trx->state == TRX_STATE_ACTIVE);
1954 	trx_mutex_enter(trx);
1955 	trx->state = TRX_STATE_PREPARED;
1956 	trx_mutex_exit(trx);
1957 
1958 	if (lsn) {
1959 		/* Depending on the my.cnf options, we may now write the log
1960 		buffer to the log files, making the prepared state of the
1961 		transaction durable if the OS does not crash. We may also
1962 		flush the log files to disk, making the prepared state of the
1963 		transaction durable also at an OS crash or a power outage.
1964 
1965 		The idea in InnoDB's group prepare is that a group of
1966 		transactions gather behind a trx doing a physical disk write
1967 		to log files, and when that physical write has been completed,
1968 		one of those transactions does a write which prepares the whole
1969 		group. Note that this group prepare will only bring benefit if
1970 		there are > 2 users in the database. Then at least 2 users can
1971 		gather behind one doing the physical log write to disk.
1972 
1973 		We must not be holding any mutexes or latches here. */
1974 
1975 		trx_flush_log_if_needed(lsn, trx);
1976 
1977 		if (!UT_LIST_GET_LEN(trx->lock.trx_locks)
1978 		    || trx->isolation_level == TRX_ISO_SERIALIZABLE) {
1979 			/* Do not release any locks at the
1980 			SERIALIZABLE isolation level. */
1981 		} else if (!trx->mysql_thd
1982 			   || thd_sql_command(trx->mysql_thd)
1983 			   != SQLCOM_XA_PREPARE) {
1984 			/* Do not release locks for XA COMMIT ONE PHASE
1985 			or for internal distributed transactions
1986 			(XID::get_my_xid() would be nonzero). */
1987 		} else {
1988 			lock_release_on_prepare(trx);
1989 		}
1990 	}
1991 }
1992 
1993 /** XA PREPARE a transaction.
1994 @param[in,out]	trx	transaction to prepare */
trx_prepare_for_mysql(trx_t * trx)1995 void trx_prepare_for_mysql(trx_t* trx)
1996 {
1997 	trx_start_if_not_started_xa(trx, false);
1998 
1999 	trx->op_info = "preparing";
2000 
2001 	trx_prepare(trx);
2002 
2003 	trx->op_info = "";
2004 }
2005 
2006 
/** State shared between trx_recover_for_mysql() and
trx_recover_for_mysql_callback() while the XIDs of prepared
transactions are being collected. The member order matters because
the struct is aggregate-initialized. */
struct trx_recover_for_mysql_callback_arg
{
  /** array that receives the XIDs of the prepared transactions */
  XID *xid_list;
  /** number of slots available in xid_list */
  uint len;
  /** number of prepared transactions seen so far; may exceed len,
  because the callback keeps counting after the array is full */
  uint count;
};
2013 
2014 
trx_recover_for_mysql_callback(rw_trx_hash_element_t * element,trx_recover_for_mysql_callback_arg * arg)2015 static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
2016   trx_recover_for_mysql_callback_arg *arg)
2017 {
2018   DBUG_ASSERT(arg->len > 0);
2019   mutex_enter(&element->mutex);
2020   if (trx_t *trx= element->trx)
2021   {
2022     /*
2023       The state of a read-write transaction can only change from ACTIVE to
2024       PREPARED while we are holding the element->mutex. But since it is
2025       executed at startup no state change should occur.
2026     */
2027     if (trx_state_eq(trx, TRX_STATE_PREPARED))
2028     {
2029       ut_ad(trx->is_recovered);
2030       ut_ad(trx->id);
2031       if (arg->count == 0)
2032         ib::info() << "Starting recovery for XA transactions...";
2033       XID& xid= arg->xid_list[arg->count];
2034       if (arg->count++ < arg->len)
2035       {
2036         trx->state= TRX_STATE_PREPARED_RECOVERED;
2037         ib::info() << "Transaction " << trx->id
2038                    << " in prepared state after recovery";
2039         ib::info() << "Transaction contains changes to " << trx->undo_no
2040                    << " rows";
2041         xid= *trx->xid;
2042       }
2043     }
2044   }
2045   mutex_exit(&element->mutex);
2046   /* Do not terminate upon reaching arg->len; count all transactions */
2047   return false;
2048 }
2049 
2050 
trx_recover_reset_callback(rw_trx_hash_element_t * element,void *)2051 static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
2052   void*)
2053 {
2054   mutex_enter(&element->mutex);
2055   if (trx_t *trx= element->trx)
2056   {
2057     if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
2058       trx->state= TRX_STATE_PREPARED;
2059   }
2060   mutex_exit(&element->mutex);
2061   return false;
2062 }
2063 
2064 
2065 /**
2066   Find prepared transaction objects for recovery.
2067 
2068   @param[out]  xid_list  prepared transactions
2069   @param[in]   len       number of slots in xid_list
2070 
2071   @return number of prepared transactions stored in xid_list
2072 */
2073 
trx_recover_for_mysql(XID * xid_list,uint len)2074 int trx_recover_for_mysql(XID *xid_list, uint len)
2075 {
2076   trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 };
2077 
2078   ut_ad(xid_list);
2079   ut_ad(len);
2080 
2081   /* Fill xid_list with PREPARED transactions. */
2082   trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
2083   if (arg.count)
2084   {
2085     ib::info() << arg.count
2086         << " transactions in prepared state after recovery";
2087     /* After returning the full list, reset the state, because
2088     init_server_components() wants to recover the collection of
2089     transactions twice, by first calling tc_log->open() and then
2090     ha_recover() directly. */
2091     if (arg.count <= len)
2092       trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
2093   }
2094   return int(std::min(arg.count, len));
2095 }
2096 
2097 
/** Search key and result slot shared between trx_get_trx_by_xid() and
trx_get_trx_by_xid_callback(). The member order matters because the
struct is aggregate-initialized. */
struct trx_get_trx_by_xid_callback_arg
{
  /** the XID to search for */
  const XID *xid;
  /** the matching transaction; assigned by the callback on a match */
  trx_t *trx;
};
2103 
2104 
trx_get_trx_by_xid_callback(rw_trx_hash_element_t * element,trx_get_trx_by_xid_callback_arg * arg)2105 static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
2106   trx_get_trx_by_xid_callback_arg *arg)
2107 {
2108   my_bool found= 0;
2109   mutex_enter(&element->mutex);
2110   if (trx_t *trx= element->trx)
2111   {
2112     trx_mutex_enter(trx);
2113     if (trx->is_recovered &&
2114 	(trx_state_eq(trx, TRX_STATE_PREPARED) ||
2115 	 trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
2116         arg->xid->eq(reinterpret_cast<XID*>(trx->xid)))
2117     {
2118 #ifdef WITH_WSREP
2119       /* The commit of a prepared recovered Galera
2120       transaction needs a valid trx->xid for
2121       invoking trx_sys_update_wsrep_checkpoint(). */
2122       if (!wsrep_is_wsrep_xid(trx->xid))
2123 #endif /* WITH_WSREP */
2124       /* Invalidate the XID, so that subsequent calls will not find it. */
2125       trx->xid->null();
2126       arg->trx= trx;
2127       found= 1;
2128     }
2129     trx_mutex_exit(trx);
2130   }
2131   mutex_exit(&element->mutex);
2132   return found;
2133 }
2134 
2135 /** Look up an X/Open distributed transaction in XA PREPARE state.
2136 @param[in]	xid	X/Open XA transaction identifier
2137 @return	transaction on match (the trx_t::xid will be invalidated);
2138 note that the trx may have been committed before the caller acquires
2139 trx_t::mutex
2140 @retval	NULL if no match */
trx_get_trx_by_xid(const XID * xid)2141 trx_t* trx_get_trx_by_xid(const XID* xid)
2142 {
2143   trx_get_trx_by_xid_callback_arg arg= { xid, 0 };
2144 
2145   if (xid)
2146     trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg);
2147   return arg.trx;
2148 }
2149 
2150 
2151 /*************************************************************//**
2152 Starts the transaction if it is not yet started. */
2153 void
trx_start_if_not_started_xa_low(trx_t * trx,bool read_write)2154 trx_start_if_not_started_xa_low(
2155 /*============================*/
2156 	trx_t*	trx,		/*!< in/out: transaction */
2157 	bool	read_write)	/*!< in: true if read write transaction */
2158 {
2159 	switch (trx->state) {
2160 	case TRX_STATE_NOT_STARTED:
2161 		trx_start_low(trx, read_write);
2162 		return;
2163 
2164 	case TRX_STATE_ACTIVE:
2165 		if (trx->id == 0 && read_write) {
2166 			/* If the transaction is tagged as read-only then
2167 			it can only write to temp tables and for such
2168 			transactions we don't want to move them to the
2169 			trx_sys_t::rw_trx_hash. */
2170 			if (!trx->read_only) {
2171 				trx_set_rw_mode(trx);
2172 			}
2173 		}
2174 		return;
2175 	case TRX_STATE_PREPARED:
2176 	case TRX_STATE_PREPARED_RECOVERED:
2177 	case TRX_STATE_COMMITTED_IN_MEMORY:
2178 		break;
2179 	}
2180 
2181 	ut_error;
2182 }
2183 
2184 /*************************************************************//**
2185 Starts the transaction if it is not yet started. */
2186 void
trx_start_if_not_started_low(trx_t * trx,bool read_write)2187 trx_start_if_not_started_low(
2188 /*==========================*/
2189 	trx_t*	trx,		/*!< in: transaction */
2190 	bool	read_write)	/*!< in: true if read write transaction */
2191 {
2192 	switch (trx->state) {
2193 	case TRX_STATE_NOT_STARTED:
2194 		trx_start_low(trx, read_write);
2195 		return;
2196 
2197 	case TRX_STATE_ACTIVE:
2198 		if (read_write && trx->id == 0 && !trx->read_only) {
2199 			trx_set_rw_mode(trx);
2200 		}
2201 		return;
2202 
2203 	case TRX_STATE_PREPARED:
2204 	case TRX_STATE_PREPARED_RECOVERED:
2205 	case TRX_STATE_COMMITTED_IN_MEMORY:
2206 		break;
2207 	}
2208 
2209 	ut_error;
2210 }
2211 
2212 /*************************************************************//**
2213 Starts a transaction for internal processing. */
2214 void
trx_start_internal_low(trx_t * trx)2215 trx_start_internal_low(
2216 /*===================*/
2217 	trx_t*	trx)		/*!< in/out: transaction */
2218 {
2219 	/* Ensure it is not flagged as an auto-commit-non-locking
2220 	transaction. */
2221 
2222 	trx->will_lock = true;
2223 
2224 	trx->internal = true;
2225 
2226 	trx_start_low(trx, true);
2227 }
2228 
2229 /** Starts a read-only transaction for internal processing.
2230 @param[in,out] trx	transaction to be started */
2231 void
trx_start_internal_read_only_low(trx_t * trx)2232 trx_start_internal_read_only_low(
2233 	trx_t*	trx)
2234 {
2235 	/* Ensure it is not flagged as an auto-commit-non-locking
2236 	transaction. */
2237 
2238 	trx->will_lock = true;
2239 
2240 	trx->internal = true;
2241 
2242 	trx_start_low(trx, false);
2243 }
2244 
2245 /*************************************************************//**
2246 Starts the transaction for a DDL operation. */
2247 void
trx_start_for_ddl_low(trx_t * trx,trx_dict_op_t op)2248 trx_start_for_ddl_low(
2249 /*==================*/
2250 	trx_t*		trx,	/*!< in/out: transaction */
2251 	trx_dict_op_t	op)	/*!< in: dictionary operation type */
2252 {
2253 	switch (trx->state) {
2254 	case TRX_STATE_NOT_STARTED:
2255 		/* Flag this transaction as a dictionary operation, so that
2256 		the data dictionary will be locked in crash recovery. */
2257 
2258 		trx_set_dict_operation(trx, op);
2259 		trx->ddl= true;
2260 		trx_start_internal_low(trx);
2261 		return;
2262 
2263 	case TRX_STATE_ACTIVE:
2264 	case TRX_STATE_PREPARED:
2265 	case TRX_STATE_PREPARED_RECOVERED:
2266 	case TRX_STATE_COMMITTED_IN_MEMORY:
2267 		break;
2268 	}
2269 
2270 	ut_error;
2271 }
2272 
2273 /*************************************************************//**
2274 Set the transaction as a read-write transaction if it is not already
2275 tagged as such. Read-only transactions that are writing to temporary
2276 tables are assigned an ID and a rollback segment but are not added
2277 to the trx read-write list because their updates should not be visible
2278 to other transactions and therefore their changes can be ignored by
2279 by MVCC. */
2280 void
trx_set_rw_mode(trx_t * trx)2281 trx_set_rw_mode(
2282 /*============*/
2283 	trx_t*		trx)		/*!< in/out: transaction that is RW */
2284 {
2285 	ut_ad(trx->rsegs.m_redo.rseg == 0);
2286 	ut_ad(!trx->is_autocommit_non_locking());
2287 	ut_ad(!trx->read_only);
2288 	ut_ad(trx->id == 0);
2289 
2290 	if (high_level_read_only) {
2291 		return;
2292 	}
2293 
2294 	trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
2295 	ut_ad(trx->rsegs.m_redo.rseg != 0);
2296 
2297 	trx_sys.register_rw(trx);
2298 
2299 	/* So that we can see our own changes. */
2300 	if (trx->read_view.is_open()) {
2301 		trx->read_view.set_creator_trx_id(trx->id);
2302 	}
2303 }
2304 
has_stats_table_lock() const2305 bool trx_t::has_stats_table_lock() const
2306 {
2307   for (lock_list::const_iterator it= lock.table_locks.begin(),
2308        end= lock.table_locks.end(); it != end; ++it)
2309   {
2310      const lock_t *lock= *it;
2311      if (lock && lock->un_member.tab_lock.table->is_stats_table())
2312        return true;
2313   }
2314 
2315   return false;
2316 }
2317