1 /*****************************************************************************
2 
3 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file trx/trx0trx.cc
29 The transaction
30 
31 Created 3/26/1996 Heikki Tuuri
32 *******************************************************/
33 
34 #include "btr0types.h"
35 #include "trx0trx.h"
36 
37 #ifdef UNIV_NONINL
38 #include "trx0trx.ic"
39 #endif
40 
41 #include "trx0undo.h"
42 #include "trx0rseg.h"
43 #include "log0log.h"
44 #include "que0que.h"
45 #include "lock0lock.h"
46 #include "trx0roll.h"
47 #include "usr0sess.h"
48 #include "read0read.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "btr0sea.h"
52 #include "os0proc.h"
53 #include "trx0xa.h"
54 #include "trx0rec.h"
55 #include "trx0purge.h"
56 #include "ha_prototypes.h"
57 #include "srv0mon.h"
58 #include "ut0vec.h"
59 
60 #include<set>
61 
62 /** Set of table_id */
63 typedef std::set<table_id_t>	table_id_set;
64 
65 /** Dummy session used currently in MySQL interface */
66 UNIV_INTERN sess_t*		trx_dummy_sess = NULL;
67 
68 #ifdef UNIV_PFS_MUTEX
69 /* Key to register the mutex with performance schema */
70 UNIV_INTERN mysql_pfs_key_t	trx_mutex_key;
71 /* Key to register the mutex with performance schema */
72 UNIV_INTERN mysql_pfs_key_t	trx_undo_mutex_key;
73 #endif /* UNIV_PFS_MUTEX */
74 
75 /*************************************************************//**
76 Set detailed error message for the transaction. */
77 UNIV_INTERN
78 void
trx_set_detailed_error(trx_t * trx,const char * msg)79 trx_set_detailed_error(
80 /*===================*/
81 	trx_t*		trx,	/*!< in: transaction struct */
82 	const char*	msg)	/*!< in: detailed error message */
83 {
84 	ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
85 }
86 
87 /*************************************************************//**
88 Set detailed error message for the transaction from a file. Note that the
89 file is rewinded before reading from it. */
90 UNIV_INTERN
91 void
trx_set_detailed_error_from_file(trx_t * trx,FILE * file)92 trx_set_detailed_error_from_file(
93 /*=============================*/
94 	trx_t*	trx,	/*!< in: transaction struct */
95 	FILE*	file)	/*!< in: file to read message from */
96 {
97 	os_file_read_string(file, trx->detailed_error,
98 			    sizeof(trx->detailed_error));
99 }
100 
101 /*************************************************************//**
102 Callback function for trx_find_descriptor() to compare trx IDs. */
103 UNIV_INTERN
104 int
trx_descr_cmp(const void * a,const void * b)105 trx_descr_cmp(
106 /*==========*/
107 	const void *a,	/*!< in: pointer to first comparison argument */
108 	const void *b)	/*!< in: pointer to second comparison argument */
109 {
110 	const trx_id_t*	da = (const trx_id_t*) a;
111 	const trx_id_t*	db = (const trx_id_t*) b;
112 
113 	if (*da < *db) {
114 		return -1;
115 	} else if (*da > *db) {
116 		return 1;
117 	}
118 
119 	return 0;
120 }
121 
122 /*************************************************************//**
123 Reserve a slot for a given trx in the global descriptors array. */
124 UNIV_INLINE
125 void
trx_reserve_descriptor(const trx_t * trx)126 trx_reserve_descriptor(
127 /*===================*/
128 	const trx_t* trx)	/*!< in: trx pointer */
129 {
130 	ulint		n_used;
131 	ulint		n_max;
132 	trx_id_t*	descr;
133 
134 	ut_ad(mutex_own(&trx_sys->mutex) || srv_is_being_started);
135 	ut_ad(srv_is_being_started ||
136 	      !trx_find_descriptor(trx_sys->descriptors,
137 				   trx_sys->descr_n_used,
138 				   trx->id));
139 
140 	n_used = trx_sys->descr_n_used + 1;
141 	n_max = trx_sys->descr_n_max;
142 
143 	if (UNIV_UNLIKELY(n_used > n_max)) {
144 
145 		n_max = n_max * 2;
146 
147 		trx_sys->descriptors = static_cast<trx_id_t*>(
148 			ut_realloc(trx_sys->descriptors,
149 				   n_max * sizeof(trx_id_t)));
150 
151 		trx_sys->descr_n_max = n_max;
152 		srv_descriptors_memory = n_max * sizeof(trx_id_t);
153 	}
154 
155 	descr = trx_sys->descriptors + n_used - 1;
156 
157 	if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) {
158 
159 		/* Find the slot where it should be inserted. We could use a
160 		binary search, but in reality linear search should be faster,
161 		because the slot we are looking for is near the array end. */
162 
163 		trx_id_t*	tdescr;
164 
165 		for (tdescr = descr - 1;
166 		     tdescr >= trx_sys->descriptors && *tdescr > trx->id;
167 		     tdescr--) {
168 		}
169 
170 		tdescr++;
171 
172 		ut_memmove(tdescr + 1, tdescr, (descr - tdescr) *
173 			   sizeof(trx_id_t));
174 
175 		descr = tdescr;
176 	}
177 
178 	*descr = trx->id;
179 
180 	trx_sys->descr_n_used = n_used;
181 }
182 
183 /*************************************************************//**
184 Release a slot for a given trx in the global descriptors array. */
185 UNIV_INTERN
186 void
trx_release_descriptor(trx_t * trx)187 trx_release_descriptor(
188 /*===================*/
189 	trx_t* trx)	/*!< in: trx pointer */
190 {
191 	ulint		size;
192 	trx_id_t*	descr;
193 
194 	ut_ad(mutex_own(&trx_sys->mutex));
195 
196 	if (UNIV_LIKELY(trx->in_trx_serial_list)) {
197 
198 		UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list,
199 			       trx);
200 		trx->in_trx_serial_list = false;
201 	}
202 
203 	descr = trx_find_descriptor(trx_sys->descriptors,
204 				    trx_sys->descr_n_used,
205 				    trx->id);
206 
207 	if (UNIV_UNLIKELY(descr == NULL)) {
208 
209 		return;
210 	}
211 
212 	size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) *
213 		sizeof(trx_id_t);
214 
215 	if (UNIV_LIKELY(size > 0)) {
216 
217 		ut_memmove(descr, descr + 1, size);
218 	}
219 
220 	trx_sys->descr_n_used--;
221 }
222 
223 /****************************************************************//**
224 Creates and initializes a transaction object. It must be explicitly
225 started with trx_start_if_not_started() before using it. The default
226 isolation level is TRX_ISO_REPEATABLE_READ.
227 @return transaction instance, should never be NULL */
228 static
229 trx_t*
trx_create(void)230 trx_create(void)
231 /*============*/
232 {
233 	trx_t*		trx;
234 	mem_heap_t*	heap;
235 	ib_alloc_t*	heap_alloc;
236 
237 	trx = static_cast<trx_t*>(mem_zalloc(sizeof(*trx)));
238 
239 	mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX);
240 
241 	trx->magic_n = TRX_MAGIC_N;
242 
243 	trx->state = TRX_STATE_NOT_STARTED;
244 
245 	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
246 
247 	trx->no = TRX_ID_MAX;
248 	trx->in_trx_serial_list = false;
249 
250 	trx->support_xa = TRUE;
251 
252 	trx->fake_changes = FALSE;
253 
254 	trx->check_foreigns = TRUE;
255 	trx->check_unique_secondary = TRUE;
256 
257 	trx->dict_operation = TRX_DICT_OP_NONE;
258 
259 	trx->idle_start = 0;
260 	trx->last_stmt_start = 0;
261 
262 	mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
263 
264 	trx->error_state = DB_SUCCESS;
265 
266 	trx->lock.que_state = TRX_QUE_RUNNING;
267 
268 	trx->lock.lock_heap = mem_heap_create_typed(
269 		256, MEM_HEAP_FOR_LOCK_HEAP);
270 
271 	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
272 
273 	trx->io_reads = 0;
274 	trx->io_read = 0;
275 	trx->io_reads_wait_timer = 0;
276 	trx->lock_que_wait_timer = 0;
277 	trx->innodb_que_wait_timer = 0;
278 	trx->distinct_page_access = 0;
279 	trx->distinct_page_access_hash = NULL;
280 	trx->take_stats = FALSE;
281 
282 	trx->xid.formatID = -1;
283 
284 	trx->op_info = "";
285 
286 	trx->api_trx = false;
287 
288 	trx->api_auto_commit = false;
289 
290 	trx->read_write = true;
291 
292 	heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
293 	heap_alloc = ib_heap_allocator_create(heap);
294 
295 	/* Remember to free the vector explicitly in trx_free(). */
296 	trx->autoinc_locks = ib_vector_create(heap_alloc, sizeof(void**), 4);
297 
298 	/* Remember to free the vector explicitly in trx_free(). */
299 	heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 128);
300 	heap_alloc = ib_heap_allocator_create(heap);
301 
302 	trx->lock.table_locks = ib_vector_create(
303 		heap_alloc, sizeof(void**), 32);
304 
305 	return(trx);
306 }
307 
308 /********************************************************************//**
309 Creates a transaction object for background operations by the master thread.
310 @return	own: transaction object */
311 UNIV_INTERN
312 trx_t*
trx_allocate_for_background(void)313 trx_allocate_for_background(void)
314 /*=============================*/
315 {
316 	trx_t*	trx;
317 
318 	trx = trx_create();
319 
320 	trx->sess = trx_dummy_sess;
321 
322 	return(trx);
323 }
324 
325 /********************************************************************//**
326 Creates a transaction object for MySQL.
327 @return	own: transaction object */
328 UNIV_INTERN
329 trx_t*
trx_allocate_for_mysql(void)330 trx_allocate_for_mysql(void)
331 /*========================*/
332 {
333 	trx_t*	trx;
334 
335 	trx = trx_allocate_for_background();
336 
337 	mutex_enter(&trx_sys->mutex);
338 
339 	ut_d(trx->in_mysql_trx_list = TRUE);
340 	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
341 
342 	mutex_exit(&trx_sys->mutex);
343 
344 	if (UNIV_UNLIKELY(trx->take_stats)) {
345 		trx->distinct_page_access_hash
346 			= static_cast<byte *>(mem_alloc(DPAH_SIZE));
347 		memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
348 	}
349 
350 	return(trx);
351 }
352 
353 /********************************************************************//**
354 Frees a transaction object without releasing the corresponding descriptor.
355 Should be used by callers that already own trx_sys->mutex. */
356 static
357 void
trx_free_low(trx_t * trx)358 trx_free_low(
359 /*=========*/
360 	trx_t*	trx)	/*!< in, own: trx object */
361 {
362 	ut_a(trx->magic_n == TRX_MAGIC_N);
363 	ut_ad(!trx->in_ro_trx_list);
364 	ut_ad(!trx->in_rw_trx_list);
365 	ut_ad(!trx->in_mysql_trx_list);
366 
367 	mutex_free(&trx->undo_mutex);
368 
369 	if (trx->undo_no_arr != NULL) {
370 		trx_undo_arr_free(trx->undo_no_arr);
371 	}
372 
373 	ut_a(trx->lock.wait_lock == NULL);
374 	ut_a(trx->lock.wait_thr == NULL);
375 
376 	ut_a(!trx->has_search_latch);
377 #ifdef UNIV_SYNC_DEBUG
378 	ut_ad(!btr_search_own_any());
379 #endif
380 
381 	ut_a(trx->dict_operation_lock_mode == 0);
382 
383 	if (trx->lock.lock_heap) {
384 		mem_heap_free(trx->lock.lock_heap);
385 	}
386 
387 	ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
388 
389 	ut_a(ib_vector_is_empty(trx->autoinc_locks));
390 	/* We allocated a dedicated heap for the vector. */
391 	ib_vector_free(trx->autoinc_locks);
392 
393 	if (trx->lock.table_locks != NULL) {
394 		/* We allocated a dedicated heap for the vector. */
395 		ib_vector_free(trx->lock.table_locks);
396 	}
397 
398 	mutex_free(&trx->mutex);
399 
400 	read_view_free(trx->prebuilt_view);
401 
402 	mem_free(trx);
403 }
404 
405 /********************************************************************//**
406 Frees a transaction object. */
407 static
408 void
trx_free(trx_t * trx)409 trx_free(
410 /*=========*/
411 	trx_t*	trx)	/*!< in, own: trx object */
412 {
413 	mutex_enter(&trx_sys->mutex);
414 	trx_release_descriptor(trx);
415 	mutex_exit(&trx_sys->mutex);
416 
417 	trx_free_low(trx);
418 }
419 
420 /********************************************************************//**
421 Frees a transaction object of a background operation of the master thread. */
422 UNIV_INTERN
423 void
trx_free_for_background(trx_t * trx)424 trx_free_for_background(
425 /*====================*/
426 	trx_t*	trx)	/*!< in, own: trx object */
427 {
428 
429 	if (trx->distinct_page_access_hash)
430 	{
431 		mem_free(trx->distinct_page_access_hash);
432 		trx->distinct_page_access_hash= NULL;
433 	}
434 
435 	if (trx->declared_to_be_inside_innodb) {
436 
437 		ib_logf(IB_LOG_LEVEL_ERROR,
438 			"Freeing a trx (%p, " TRX_ID_FMT ") which is declared "
439 			"to be processing inside InnoDB", trx, trx->id);
440 
441 		trx_print(stderr, trx, 600);
442 		putc('\n', stderr);
443 
444 		/* This is an error but not a fatal error. We must keep
445 		the counters like srv_conc_n_threads accurate. */
446 		srv_conc_force_exit_innodb(trx);
447 	}
448 
449 	if (trx->n_mysql_tables_in_use != 0
450 	    || trx->mysql_n_tables_locked != 0) {
451 
452 		ib_logf(IB_LOG_LEVEL_ERROR,
453 			"MySQL is freeing a thd though "
454 			"trx->n_mysql_tables_in_use is %lu and "
455 			"trx->mysql_n_tables_locked is %lu.",
456 			(ulong) trx->n_mysql_tables_in_use,
457 			(ulong) trx->mysql_n_tables_locked);
458 
459 		trx_print(stderr, trx, 600);
460 		ut_print_buf(stderr, trx, sizeof(trx_t));
461 		putc('\n', stderr);
462 	}
463 
464 	ut_a(trx->state == TRX_STATE_NOT_STARTED);
465 	ut_a(trx->insert_undo == NULL);
466 	ut_a(trx->update_undo == NULL);
467 	ut_a(trx->read_view == NULL);
468 
469 	trx_free(trx);
470 }
471 
472 /********************************************************************//**
473 At shutdown, frees a transaction object that is in the PREPARED state. */
474 UNIV_INTERN
475 void
trx_free_prepared(trx_t * trx)476 trx_free_prepared(
477 /*==============*/
478 	trx_t*	trx)	/*!< in, own: trx object */
479 {
480 	ut_a(trx_state_eq(trx, TRX_STATE_PREPARED));
481 	ut_a(trx->magic_n == TRX_MAGIC_N);
482 
483 	lock_trx_release_locks(trx);
484 	trx_undo_free_prepared(trx);
485 
486 	assert_trx_in_rw_list(trx);
487 
488 	ut_a(!trx->read_only);
489 
490 	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
491 	ut_d(trx->in_rw_trx_list = FALSE);
492 
493 	mutex_enter(&trx_sys->mutex);
494 	trx_release_descriptor(trx);
495 	mutex_exit(&trx_sys->mutex);
496 
497 	/* Undo trx_resurrect_table_locks(). */
498 	UT_LIST_INIT(trx->lock.trx_locks);
499 
500 	trx_free_low(trx);
501 
502 	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
503 }
504 
505 /********************************************************************//**
506 Frees a transaction object for MySQL. */
507 UNIV_INTERN
508 void
trx_free_for_mysql(trx_t * trx)509 trx_free_for_mysql(
510 /*===============*/
511 	trx_t*	trx)	/*!< in, own: trx object */
512 {
513 	if (trx->distinct_page_access_hash)
514 	{
515 		mem_free(trx->distinct_page_access_hash);
516 		trx->distinct_page_access_hash= NULL;
517 	}
518 
519 	mutex_enter(&trx_sys->mutex);
520 
521 	ut_ad(trx->in_mysql_trx_list);
522 	ut_d(trx->in_mysql_trx_list = FALSE);
523 	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
524 
525 	ut_ad(trx_sys_validate_trx_list());
526 
527 	mutex_exit(&trx_sys->mutex);
528 
529 	trx_free_for_background(trx);
530 }
531 
532 /****************************************************************//**
533 Inserts the trx handle in the trx system trx list in the right position.
534 The list is sorted on the trx id so that the biggest id is at the list
535 start. This function is used at the database startup to insert incomplete
536 transactions to the list. */
537 static
538 void
trx_list_rw_insert_ordered(trx_t * trx)539 trx_list_rw_insert_ordered(
540 /*=======================*/
541 	trx_t*	trx)	/*!< in: trx handle */
542 {
543 	trx_t*	trx2;
544 
545 	ut_ad(!trx->read_only);
546 
547 	ut_d(trx->start_file = __FILE__);
548 	ut_d(trx->start_line = __LINE__);
549 
550 	ut_a(srv_is_being_started);
551 	ut_ad(!trx->in_ro_trx_list);
552 	ut_ad(!trx->in_rw_trx_list);
553 	ut_ad(trx->state != TRX_STATE_NOT_STARTED);
554 	ut_ad(trx->is_recovered);
555 
556 	for (trx2 = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
557 	     trx2 != NULL;
558 	     trx2 = UT_LIST_GET_NEXT(trx_list, trx2)) {
559 
560 		assert_trx_in_rw_list(trx2);
561 
562 		if (trx->id >= trx2->id) {
563 
564 			ut_ad(trx->id > trx2->id);
565 			break;
566 		}
567 	}
568 
569 	if (trx2 != NULL) {
570 		trx2 = UT_LIST_GET_PREV(trx_list, trx2);
571 
572 		if (trx2 == NULL) {
573 			UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
574 		} else {
575 			UT_LIST_INSERT_AFTER(
576 				trx_list, trx_sys->rw_trx_list, trx2, trx);
577 		}
578 	} else {
579 		UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx);
580 	}
581 
582 #ifdef UNIV_DEBUG
583 	if (trx->id > trx_sys->rw_max_trx_id) {
584 		trx_sys->rw_max_trx_id = trx->id;
585 	}
586 #endif /* UNIV_DEBUG */
587 
588 	ut_ad(!trx->in_rw_trx_list);
589 	ut_d(trx->in_rw_trx_list = TRUE);
590 }
591 
592 /****************************************************************//**
593 Resurrect the table locks for a resurrected transaction. */
594 static
595 void
trx_resurrect_table_locks(trx_t * trx,const trx_undo_t * undo)596 trx_resurrect_table_locks(
597 /*======================*/
598 	trx_t*			trx,	/*!< in/out: transaction */
599 	const trx_undo_t*	undo)	/*!< in: undo log */
600 {
601 	mtr_t			mtr;
602 	page_t*			undo_page;
603 	trx_undo_rec_t*		undo_rec;
604 	table_id_set		tables;
605 
606 	ut_ad(undo == trx->insert_undo || undo == trx->update_undo);
607 
608 	if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)
609 	    || undo->empty) {
610 		return;
611 	}
612 
613 	mtr_start(&mtr);
614 	/* trx_rseg_mem_create() may have acquired an X-latch on this
615 	page, so we cannot acquire an S-latch. */
616 	undo_page = trx_undo_page_get(
617 		undo->space, undo->zip_size, undo->top_page_no, &mtr);
618 	undo_rec = undo_page + undo->top_offset;
619 
620 	do {
621 		ulint		type;
622 		ulint		cmpl_info;
623 		bool		updated_extern;
624 		undo_no_t	undo_no;
625 		table_id_t	table_id;
626 
627 		page_t*		undo_rec_page = page_align(undo_rec);
628 
629 		if (undo_rec_page != undo_page) {
630 			if (!mtr_memo_release(&mtr,
631 					      buf_block_align(undo_page),
632 					      MTR_MEMO_PAGE_X_FIX)) {
633 				/* The page of the previous undo_rec
634 				should have been latched by
635 				trx_undo_page_get() or
636 				trx_undo_get_prev_rec(). */
637 				ut_ad(0);
638 			}
639 
640 			undo_page = undo_rec_page;
641 		}
642 
643 		trx_undo_rec_get_pars(
644 			undo_rec, &type, &cmpl_info,
645 			&updated_extern, &undo_no, &table_id);
646 		tables.insert(table_id);
647 
648 		undo_rec = trx_undo_get_prev_rec(
649 			undo_rec, undo->hdr_page_no,
650 			undo->hdr_offset, false, &mtr);
651 	} while (undo_rec);
652 
653 	mtr_commit(&mtr);
654 
655 	for (table_id_set::const_iterator i = tables.begin();
656 	     i != tables.end(); i++) {
657 		if (dict_table_t* table = dict_table_open_on_id(
658 			    *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
659 			if (table->ibd_file_missing
660 			    || dict_table_is_temporary(table)) {
661 				mutex_enter(&dict_sys->mutex);
662 				dict_table_close(table, TRUE, FALSE);
663 				dict_table_remove_from_cache(table);
664 				mutex_exit(&dict_sys->mutex);
665 				continue;
666 			}
667 
668 			lock_table_ix_resurrect(table, trx);
669 
670 			DBUG_PRINT("ib_trx",
671 				   ("resurrect" TRX_ID_FMT
672 				    "  table '%s' IX lock from %s undo",
673 				    trx->id, table->name,
674 				    undo == trx->insert_undo
675 				    ? "insert" : "update"));
676 
677 			dict_table_close(table, FALSE, FALSE);
678 		}
679 	}
680 }
681 
682 /****************************************************************//**
683 Resurrect the transactions that were doing inserts the time of the
684 crash, they need to be undone.
685 @return trx_t instance  */
686 static
687 trx_t*
trx_resurrect_insert(trx_undo_t * undo,trx_rseg_t * rseg)688 trx_resurrect_insert(
689 /*=================*/
690 	trx_undo_t*	undo,		/*!< in: entry to UNDO */
691 	trx_rseg_t*	rseg)		/*!< in: rollback segment */
692 {
693 	trx_t*		trx;
694 
695 	trx = trx_allocate_for_background();
696 
697 	trx->rseg = rseg;
698 	trx->xid = undo->xid;
699 	trx->id = undo->trx_id;
700 	trx->insert_undo = undo;
701 	trx->is_recovered = TRUE;
702 
703 	/* This is single-threaded startup code, we do not need the
704 	protection of trx->mutex or trx_sys->mutex here. */
705 
706 	if (undo->state != TRX_UNDO_ACTIVE) {
707 
708 		/* Prepared transactions are left in the prepared state
709 		waiting for a commit or abort decision from MySQL */
710 
711 		if (undo->state == TRX_UNDO_PREPARED) {
712 
713 			fprintf(stderr,
714 				"InnoDB: Transaction " TRX_ID_FMT " was in the"
715 				" XA prepared state.\n", trx->id);
716 
717 			if (srv_force_recovery == 0) {
718 
719 				trx->state = TRX_STATE_PREPARED;
720 				trx_sys->n_prepared_trx++;
721 				trx_sys->n_prepared_recovered_trx++;
722 			} else {
723 				fprintf(stderr,
724 					"InnoDB: Since innodb_force_recovery"
725 					" > 0, we will rollback it anyway.\n");
726 
727 				trx->state = TRX_STATE_ACTIVE;
728 			}
729 		} else {
730 			trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
731 		}
732 
733 		/* We give a dummy value for the trx no; this should have no
734 		relevance since purge is not interested in committed
735 		transaction numbers, unless they are in the history
736 		list, in which case it looks the number from the disk based
737 		undo log structure */
738 
739 		trx->no = trx->id;
740 	} else {
741 		trx->state = TRX_STATE_ACTIVE;
742 
743 		/* A running transaction always has the number
744 		field inited to TRX_ID_MAX */
745 
746 		trx->no = TRX_ID_MAX;
747 	}
748 
749 	/* trx_start_low() is not called with resurrect, so need to initialize
750 	start time here.*/
751 	if (trx->state == TRX_STATE_ACTIVE
752 	    || trx->state == TRX_STATE_PREPARED) {
753 		trx->start_time = ut_time();
754 	}
755 
756 	if (undo->dict_operation) {
757 		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
758 		trx->table_id = undo->table_id;
759 	}
760 
761 	if (!undo->empty) {
762 		trx->undo_no = undo->top_undo_no + 1;
763 	}
764 
765 	return(trx);
766 }
767 
768 /****************************************************************//**
769 Prepared transactions are left in the prepared state waiting for a
770 commit or abort decision from MySQL */
771 static
772 void
trx_resurrect_update_in_prepared_state(trx_t * trx,const trx_undo_t * undo)773 trx_resurrect_update_in_prepared_state(
774 /*===================================*/
775 	trx_t*			trx,	/*!< in,out: transaction */
776 	const trx_undo_t*	undo)	/*!< in: update UNDO record */
777 {
778 	/* This is single-threaded startup code, we do not need the
779 	protection of trx->mutex or trx_sys->mutex here. */
780 
781 	if (undo->state == TRX_UNDO_PREPARED) {
782 		fprintf(stderr,
783 			"InnoDB: Transaction " TRX_ID_FMT
784 			" was in the XA prepared state.\n", trx->id);
785 
786 		if (srv_force_recovery == 0) {
787 			if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
788 				trx_sys->n_prepared_trx++;
789 				trx_sys->n_prepared_recovered_trx++;
790 			} else {
791 				ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
792 			}
793 
794 			trx->state = TRX_STATE_PREPARED;
795 		} else {
796 			fprintf(stderr,
797 				"InnoDB: Since innodb_force_recovery"
798 				" > 0, we will rollback it anyway.\n");
799 
800 			trx->state = TRX_STATE_ACTIVE;
801 		}
802 	} else {
803 		trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
804 	}
805 }
806 
807 /****************************************************************//**
808 Resurrect the transactions that were doing updates the time of the
809 crash, they need to be undone. */
810 static
811 void
trx_resurrect_update(trx_t * trx,trx_undo_t * undo,trx_rseg_t * rseg)812 trx_resurrect_update(
813 /*=================*/
814 	trx_t*		trx,	/*!< in/out: transaction */
815 	trx_undo_t*	undo,	/*!< in/out: update UNDO record */
816 	trx_rseg_t*	rseg)	/*!< in/out: rollback segment */
817 {
818 	trx->rseg = rseg;
819 	trx->xid = undo->xid;
820 	trx->id = undo->trx_id;
821 	trx->update_undo = undo;
822 	trx->is_recovered = TRUE;
823 
824 	/* This is single-threaded startup code, we do not need the
825 	protection of trx->mutex or trx_sys->mutex here. */
826 
827 	if (undo->state != TRX_UNDO_ACTIVE) {
828 		trx_resurrect_update_in_prepared_state(trx, undo);
829 
830 		/* We give a dummy value for the trx number */
831 
832 		trx->no = trx->id;
833 
834 	} else {
835 		trx->state = TRX_STATE_ACTIVE;
836 
837 		/* A running transaction always has the number field inited to
838 		TRX_ID_MAX */
839 
840 		trx->no = TRX_ID_MAX;
841 	}
842 
843 	/* trx_start_low() is not called with resurrect, so need to initialize
844 	start time here.*/
845 	if (trx->state == TRX_STATE_ACTIVE
846 	    || trx->state == TRX_STATE_PREPARED) {
847 		trx->start_time = ut_time();
848 	}
849 
850 	if (undo->dict_operation) {
851 		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
852 		trx->table_id = undo->table_id;
853 	}
854 
855 	if (!undo->empty && undo->top_undo_no >= trx->undo_no) {
856 
857 		trx->undo_no = undo->top_undo_no + 1;
858 	}
859 }
860 
861 /****************************************************************//**
862 Creates trx objects for transactions and initializes the trx list of
863 trx_sys at database start. Rollback segment and undo log lists must
864 already exist when this function is called, because the lists of
865 transactions to be rolled back or cleaned up are built based on the
866 undo log lists. */
867 UNIV_INTERN
868 void
trx_lists_init_at_db_start(void)869 trx_lists_init_at_db_start(void)
870 /*============================*/
871 {
872 	ulint		i;
873 
874 	ut_a(srv_is_being_started);
875 
876 	UT_LIST_INIT(trx_sys->ro_trx_list);
877 	UT_LIST_INIT(trx_sys->rw_trx_list);
878 	UT_LIST_INIT(trx_sys->trx_serial_list);
879 
880 	/* Look from the rollback segments if there exist undo logs for
881 	transactions */
882 
883 	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
884 		trx_undo_t*	undo;
885 		trx_rseg_t*	rseg;
886 
887 		rseg = trx_sys->rseg_array[i];
888 
889 		if (rseg == NULL) {
890 			continue;
891 		}
892 
893 		/* Resurrect transactions that were doing inserts. */
894 		for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
895 		     undo != NULL;
896 		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
897 			trx_t*	trx;
898 
899 			trx = trx_resurrect_insert(undo, rseg);
900 
901 			if (trx->state == TRX_STATE_ACTIVE ||
902 			    trx->state == TRX_STATE_PREPARED) {
903 
904 				trx_reserve_descriptor(trx);
905 			}
906 			trx_list_rw_insert_ordered(trx);
907 
908 			trx_resurrect_table_locks(trx, undo);
909 		}
910 
911 		/* Ressurrect transactions that were doing updates. */
912 		for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
913 		     undo != NULL;
914 		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
915 			trx_t*	trx;
916 			ibool	trx_created;
917 
918 			/* Check the trx_sys->rw_trx_list first. */
919 			mutex_enter(&trx_sys->mutex);
920 			trx = trx_get_rw_trx_by_id(undo->trx_id);
921 			mutex_exit(&trx_sys->mutex);
922 
923 			if (trx == NULL) {
924 				trx = trx_allocate_for_background();
925 				trx_created = TRUE;
926 			} else {
927 				trx_created = FALSE;
928 			}
929 
930 			trx_resurrect_update(trx, undo, rseg);
931 
932 			if (trx_created) {
933 				if (trx->state == TRX_STATE_ACTIVE ||
934 				    trx->state == TRX_STATE_PREPARED) {
935 
936 					trx_reserve_descriptor(trx);
937 				}
938 				trx_list_rw_insert_ordered(trx);
939 			}
940 
941 			trx_resurrect_table_locks(trx, undo);
942 		}
943 	}
944 }
945 
946 /******************************************************************//**
947 Assigns a rollback segment to a transaction in a round-robin fashion.
948 @return	assigned rollback segment instance */
949 static
950 trx_rseg_t*
trx_assign_rseg_low(ulong max_undo_logs,ulint n_tablespaces)951 trx_assign_rseg_low(
952 /*================*/
953 	ulong	max_undo_logs,	/*!< in: maximum number of UNDO logs to use */
954 	ulint	n_tablespaces)	/*!< in: number of rollback tablespaces */
955 {
956 	ulint		i;
957 	trx_rseg_t*	rseg;
958 	static ulint	latest_rseg = 0;
959 
960 	if (srv_read_only_mode) {
961 		ut_a(max_undo_logs == ULONG_UNDEFINED);
962 		return(NULL);
963 	}
964 
965 	/* This breaks true round robin but that should be OK. */
966 
967 	ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS);
968 
969 	i = latest_rseg++;
970         i %= max_undo_logs;
971 
972 	/* Note: The assumption here is that there can't be any gaps in
973 	the array. Once we implement more flexible rollback segment
974 	management this may not hold. The assertion checks for that case. */
975 
976 	if (trx_sys->rseg_array[0] == NULL) {
977 		return(NULL);
978 	}
979 
980 	/* Skip the system tablespace if we have more than one tablespace
981 	defined for rollback segments. We want all UNDO records to be in
982 	the non-system tablespaces. */
983 
984 	do {
985 		rseg = trx_sys->rseg_array[i];
986 		ut_a(rseg == NULL || i == rseg->id);
987 
988 		i = (rseg == NULL) ? 0 : i + 1;
989 
990 	} while (rseg == NULL
991 		 || (rseg->space == 0
992 		     && n_tablespaces > 0
993 		     && trx_sys->rseg_array[1] != NULL));
994 
995 	return(rseg);
996 }
997 
998 /****************************************************************//**
999 Assign a read-only transaction a rollback-segment, if it is attempting
1000 to write to a TEMPORARY table. */
1001 UNIV_INTERN
1002 void
trx_assign_rseg(trx_t * trx)1003 trx_assign_rseg(
1004 /*============*/
1005 	trx_t*		trx)		/*!< A read-only transaction that
1006 					needs to be assigned a RBS. */
1007 {
1008 	ut_a(trx->rseg == 0);
1009 	ut_a(trx->read_only);
1010 	ut_a(!srv_read_only_mode);
1011 	ut_a(!trx_is_autocommit_non_locking(trx));
1012 
1013 	trx->rseg = trx_assign_rseg_low(srv_undo_logs, srv_undo_tablespaces);
1014 }
1015 
1016 /****************************************************************//**
1017 Starts a transaction. */
1018 static
1019 void
trx_start_low(trx_t * trx)1020 trx_start_low(
1021 /*==========*/
1022 	trx_t*	trx)		/*!< in: transaction */
1023 {
1024 	ut_ad(trx->rseg == NULL);
1025 
1026 	ut_ad(trx->start_file != 0);
1027 	ut_ad(trx->start_line != 0);
1028 	ut_ad(!trx->is_recovered);
1029 	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
1030 	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
1031 
1032 	/* Check whether it is an AUTOCOMMIT SELECT */
1033 	trx->auto_commit = (trx->api_trx && trx->api_auto_commit)
1034 			   || thd_trx_is_auto_commit(trx->mysql_thd);
1035 
1036 	trx->read_only =
1037 		(trx->api_trx && !trx->read_write)
1038 		|| (!trx->ddl && thd_trx_is_read_only(trx->mysql_thd))
1039 		|| srv_read_only_mode;
1040 
1041 	if (!trx->auto_commit) {
1042 		++trx->will_lock;
1043 	} else if (trx->will_lock == 0) {
1044 		trx->read_only = TRUE;
1045 	}
1046 
1047 	if (!trx->read_only) {
1048 		trx->rseg = trx_assign_rseg_low(
1049 			srv_undo_logs, srv_undo_tablespaces);
1050 	}
1051 
1052 	/* The initial value for trx->no: TRX_ID_MAX is used in
1053 	read_view_open_now: */
1054 
1055 	trx->no = TRX_ID_MAX;
1056 
1057 	ut_a(ib_vector_is_empty(trx->autoinc_locks));
1058 	ut_a(ib_vector_is_empty(trx->lock.table_locks));
1059 
1060 	mutex_enter(&trx_sys->mutex);
1061 
1062 	/* If this transaction came from trx_allocate_for_mysql(),
1063 	trx->in_mysql_trx_list would hold. In that case, the trx->state
1064 	change must be protected by the trx_sys->mutex, so that
1065 	lock_print_info_all_transactions() will have a consistent view. */
1066 
1067 	trx->state = TRX_STATE_ACTIVE;
1068 
1069 	trx->id = trx_sys_get_new_trx_id();
1070 
1071 	/* Cache the state of fake_changes that transaction will use for
1072 	lifetime. Any change in session/global fake_changes configuration during
1073 	lifetime of transaction will not be honored by already started
1074 	transaction. */
1075 	trx->fake_changes = thd_fake_changes(trx->mysql_thd);
1076 
1077 	ut_ad(!trx->in_rw_trx_list);
1078 	ut_ad(!trx->in_ro_trx_list);
1079 
1080 	if (trx->read_only) {
1081 
1082 		/* Note: The trx_sys_t::ro_trx_list doesn't really need to
1083 		be ordered, we should exploit this using a list type that
1084 		doesn't need a list wide lock to increase concurrency. */
1085 
1086 		if (!trx_is_autocommit_non_locking(trx)) {
1087 			UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx);
1088 			ut_d(trx->in_ro_trx_list = TRUE);
1089 		}
1090 	} else {
1091 
1092 		ut_ad(trx->rseg != NULL
1093 		      || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
1094 
1095 		ut_ad(!trx_is_autocommit_non_locking(trx));
1096 		UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
1097 		ut_d(trx->in_rw_trx_list = TRUE);
1098 
1099 #ifdef UNIV_DEBUG
1100 		if (trx->id > trx_sys->rw_max_trx_id) {
1101 			trx_sys->rw_max_trx_id = trx->id;
1102 		}
1103 #endif /* UNIV_DEBUG */
1104 
1105 		trx_reserve_descriptor(trx);
1106 	}
1107 
1108 	ut_ad(trx_sys_validate_trx_list());
1109 
1110 	mutex_exit(&trx_sys->mutex);
1111 
1112 	trx->start_time = ut_time();
1113 
1114 	MONITOR_INC(MONITOR_TRX_ACTIVE);
1115 }
1116 
1117 /****************************************************************//**
1118 Set the transaction serialisation number. */
1119 static
1120 void
trx_serialisation_number_get(trx_t * trx)1121 trx_serialisation_number_get(
1122 /*=========================*/
1123 	trx_t*		trx)	/*!< in: transaction */
1124 {
1125 	trx_rseg_t*	rseg;
1126 
1127 	rseg = trx->rseg;
1128 
1129 	ut_ad(mutex_own(&rseg->mutex));
1130 
1131 	mutex_enter(&trx_sys->mutex);
1132 
1133 	trx->no = trx_sys_get_new_trx_id();
1134 
1135 	if (UNIV_LIKELY(!trx->in_trx_serial_list)) {
1136 
1137 		UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list,
1138 				 trx);
1139 
1140 		trx->in_trx_serial_list = true;
1141 	}
1142 
1143 	/* If the rollack segment is not empty then the
1144 	new trx_t::no can't be less than any trx_t::no
1145 	already in the rollback segment. User threads only
1146 	produce events when a rollback segment is empty. */
1147 
1148 	if (rseg->last_page_no == FIL_NULL) {
1149 		void*		ptr;
1150 		rseg_queue_t	rseg_queue;
1151 
1152 		rseg_queue.rseg = rseg;
1153 		rseg_queue.trx_no = trx->no;
1154 
1155 		mutex_enter(&purge_sys->bh_mutex);
1156 
1157 		/* This is to reduce the pressure on the trx_sys_t::mutex
1158 		though in reality it should make very little (read no)
1159 		difference because this code path is only taken when the
1160 		rbs is empty. */
1161 
1162 		mutex_exit(&trx_sys->mutex);
1163 
1164 		ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
1165 		ut_a(ptr);
1166 
1167 		mutex_exit(&purge_sys->bh_mutex);
1168 	} else {
1169 		mutex_exit(&trx_sys->mutex);
1170 	}
1171 }
1172 
1173 /****************************************************************//**
1174 Assign the transaction its history serialisation number and write the
1175 update UNDO log record to the assigned rollback segment. */
1176 static MY_ATTRIBUTE((nonnull))
1177 void
trx_write_serialisation_history(trx_t * trx,mtr_t * mtr)1178 trx_write_serialisation_history(
1179 /*============================*/
1180 	trx_t*		trx,	/*!< in/out: transaction */
1181 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
1182 {
1183 	trx_rseg_t*	rseg;
1184 
1185 	rseg = trx->rseg;
1186 
1187 	/* Change the undo log segment states from TRX_UNDO_ACTIVE
1188 	to some other state: these modifications to the file data
1189 	structure define the transaction as committed in the file
1190 	based domain, at the serialization point of the log sequence
1191 	number lsn obtained below. */
1192 
1193 	if (trx->update_undo != NULL) {
1194 		page_t*		undo_hdr_page;
1195 		trx_undo_t*	undo = trx->update_undo;
1196 
1197 		/* We have to hold the rseg mutex because update
1198 		log headers have to be put to the history list in the
1199 		(serialisation) order of the UNDO trx number. This is
1200 		required for the purge in-memory data structures too. */
1201 
1202 		mutex_enter(&rseg->mutex);
1203 
1204 		/* Assign the transaction serialisation number and also
1205 		update the purge min binary heap if this is the first
1206 		UNDO log being written to the assigned rollback segment. */
1207 
1208 		trx_serialisation_number_get(trx);
1209 
1210 		/* It is not necessary to obtain trx->undo_mutex here
1211 		because only a single OS thread is allowed to do the
1212 		transaction commit for this transaction. */
1213 
1214 		undo_hdr_page = trx_undo_set_state_at_finish(undo, mtr);
1215 
1216 		trx_undo_update_cleanup(trx, undo_hdr_page, mtr);
1217 	} else {
1218 		mutex_enter(&rseg->mutex);
1219 	}
1220 
1221 	if (trx->insert_undo != NULL) {
1222 		trx_undo_set_state_at_finish(trx->insert_undo, mtr);
1223 	}
1224 
1225 	mutex_exit(&rseg->mutex);
1226 
1227 	MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
1228 
1229 	/* Update the latest MySQL binlog name and offset info
1230 	in trx sys header if MySQL binlogging is on or the database
1231 	server is a MySQL replication slave */
1232 
1233 	if (trx->mysql_log_file_name
1234 	    && trx->mysql_log_file_name[0] != '\0') {
1235 
1236 		trx_sys_update_mysql_binlog_offset(
1237 			trx->mysql_log_file_name,
1238 			trx->mysql_log_offset,
1239 			TRX_SYS_MYSQL_LOG_INFO, mtr);
1240 
1241 		trx->mysql_log_file_name = NULL;
1242 	}
1243 }
1244 
1245 /********************************************************************
1246 Finalize a transaction containing updates for a FTS table. */
1247 static MY_ATTRIBUTE((nonnull))
1248 void
trx_finalize_for_fts_table(fts_trx_table_t * ftt)1249 trx_finalize_for_fts_table(
1250 /*=======================*/
1251         fts_trx_table_t*        ftt)            /* in: FTS trx table */
1252 {
1253 	fts_t*                  fts = ftt->table->fts;
1254 	fts_doc_ids_t*          doc_ids = ftt->added_doc_ids;
1255 
1256 	mutex_enter(&fts->bg_threads_mutex);
1257 
1258 	if (fts->fts_status & BG_THREAD_STOP) {
1259 		/* The table is about to be dropped, no use
1260 		adding anything to its work queue. */
1261 
1262 		mutex_exit(&fts->bg_threads_mutex);
1263 	} else {
1264 		mem_heap_t*     heap;
1265 		mutex_exit(&fts->bg_threads_mutex);
1266 
1267 		ut_a(fts->add_wq);
1268 
1269 		heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
1270 
1271 		ib_wqueue_add(fts->add_wq, doc_ids, heap);
1272 
1273 		/* fts_trx_table_t no longer owns the list. */
1274 		ftt->added_doc_ids = NULL;
1275 	}
1276 }
1277 
1278 /******************************************************************//**
1279 Finalize a transaction containing updates to FTS tables. */
1280 static MY_ATTRIBUTE((nonnull))
1281 void
trx_finalize_for_fts(trx_t * trx,bool is_commit)1282 trx_finalize_for_fts(
1283 /*=================*/
1284 	trx_t*	trx,		/*!< in/out: transaction */
1285 	bool	is_commit)	/*!< in: true if the transaction was
1286 				committed, false if it was rolled back. */
1287 {
1288 	if (is_commit) {
1289 		const ib_rbt_node_t*	node;
1290 		ib_rbt_t*		tables;
1291 		fts_savepoint_t*	savepoint;
1292 
1293 		savepoint = static_cast<fts_savepoint_t*>(
1294 			ib_vector_last(trx->fts_trx->savepoints));
1295 
1296 		tables = savepoint->tables;
1297 
1298 		for (node = rbt_first(tables);
1299 		     node;
1300 		     node = rbt_next(tables, node)) {
1301 			fts_trx_table_t**	ftt;
1302 
1303 			ftt = rbt_value(fts_trx_table_t*, node);
1304 
1305 			if ((*ftt)->added_doc_ids) {
1306 				trx_finalize_for_fts_table(*ftt);
1307 			}
1308 		}
1309 	}
1310 
1311 	fts_trx_free(trx->fts_trx);
1312 	trx->fts_trx = NULL;
1313 }
1314 
1315 /**********************************************************************//**
1316 If required, flushes the log to disk based on the value of
1317 innodb_flush_log_at_trx_commit. */
1318 static
1319 void
trx_flush_log_if_needed_low(lsn_t lsn,trx_t * trx)1320 trx_flush_log_if_needed_low(
1321 /*========================*/
1322 	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
1323 			flushed. */
1324 	trx_t*	trx)	/*!< in: transaction */
1325 {
1326 	ulint	flush_log_at_trx_commit;
1327 
1328 	flush_log_at_trx_commit = srv_use_global_flush_log_at_trx_commit
1329 		? thd_flush_log_at_trx_commit(NULL)
1330 		: thd_flush_log_at_trx_commit(trx->mysql_thd);
1331 
1332 	switch (flush_log_at_trx_commit) {
1333 	case 0:
1334 		/* Do nothing */
1335 		break;
1336 	case 1:
1337 		/* Write the log and optionally flush it to disk */
1338 		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
1339 				srv_unix_file_flush_method != SRV_UNIX_NOSYNC);
1340 		break;
1341 	case 2:
1342 		/* Write the log but do not flush it to disk */
1343 		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1344 
1345 		break;
1346 	default:
1347 		ut_error;
1348 	}
1349 }
1350 
1351 /**********************************************************************//**
1352 If required, flushes the log to disk based on the value of
1353 innodb_flush_log_at_trx_commit. */
1354 static MY_ATTRIBUTE((nonnull))
1355 void
trx_flush_log_if_needed(lsn_t lsn,trx_t * trx)1356 trx_flush_log_if_needed(
1357 /*====================*/
1358 	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
1359 			flushed. */
1360 	trx_t*	trx)	/*!< in/out: transaction */
1361 {
1362 	trx->op_info = "flushing log";
1363 	trx_flush_log_if_needed_low(lsn, trx);
1364 	trx->op_info = "";
1365 }
1366 
1367 /****************************************************************//**
1368 Commits a transaction in memory. */
1369 static MY_ATTRIBUTE((nonnull))
1370 void
trx_commit_in_memory(trx_t * trx,lsn_t lsn)1371 trx_commit_in_memory(
1372 /*=================*/
1373 	trx_t*	trx,	/*!< in/out: transaction */
1374 	lsn_t	lsn)	/*!< in: log sequence number of the mini-transaction
1375 			commit of trx_write_serialisation_history(), or 0
1376 			if the transaction did not modify anything */
1377 {
1378 	trx->must_flush_log_later = FALSE;
1379 
1380 	if (trx_is_autocommit_non_locking(trx)) {
1381 		ut_ad(trx->read_only);
1382 		ut_a(!trx->is_recovered);
1383 		ut_ad(trx->rseg == NULL);
1384 		ut_ad(!trx->in_ro_trx_list);
1385 		ut_ad(!trx->in_rw_trx_list);
1386 
1387 		/* Note: We are asserting without holding the lock mutex. But
1388 		that is OK because this transaction is not waiting and cannot
1389 		be rolled back and no new locks can (or should not) be added
1390 		becuase it is flagged as a non-locking read-only transaction. */
1391 
1392 		ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
1393 
1394 		/* This state change is not protected by any mutex, therefore
1395 		there is an inherent race here around state transition during
1396 		printouts. We ignore this race for the sake of efficiency.
1397 		However, the trx_sys_t::mutex will protect the trx_t instance
1398 		and it cannot be removed from the mysql_trx_list and freed
1399 		without first acquiring the trx_sys_t::mutex. */
1400 
1401 		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
1402 
1403 		trx->state = TRX_STATE_NOT_STARTED;
1404 
1405 		read_view_remove(trx->global_read_view, false);
1406 
1407 		MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
1408 	} else {
1409 		lock_trx_release_locks(trx);
1410 
1411 		/* Remove the transaction from the list of active
1412 		transactions now that it no longer holds any user locks. */
1413 
1414 		ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
1415 
1416 		mutex_enter(&trx_sys->mutex);
1417 
1418 		assert_trx_in_list(trx);
1419 
1420 		if (trx->read_only) {
1421 			UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx);
1422 			ut_d(trx->in_ro_trx_list = FALSE);
1423 			MONITOR_INC(MONITOR_TRX_RO_COMMIT);
1424 		} else {
1425 			UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
1426 			ut_d(trx->in_rw_trx_list = FALSE);
1427 			ut_ad(trx_sys->descr_n_used <=
1428 			      UT_LIST_GET_LEN(trx_sys->rw_trx_list));
1429 			MONITOR_INC(MONITOR_TRX_RW_COMMIT);
1430 		}
1431 
1432 		/* If this transaction came from trx_allocate_for_mysql(),
1433 		trx->in_mysql_trx_list would hold. In that case, the
1434 		trx->state change must be protected by trx_sys->mutex, so that
1435 		lock_print_info_all_transactions() will have a consistent
1436 		view. */
1437 
1438 		trx->state = TRX_STATE_NOT_STARTED;
1439 
1440 		/* We already own the trx_sys_t::mutex, by doing it here we
1441 		avoid a potential context switch later. */
1442 		read_view_remove(trx->global_read_view, true);
1443 
1444 		ut_ad(trx_sys_validate_trx_list());
1445 
1446 		mutex_exit(&trx_sys->mutex);
1447 	}
1448 
1449 	if (trx->global_read_view != NULL) {
1450 
1451 		trx->global_read_view = NULL;
1452 	}
1453 
1454 	trx->read_view = NULL;
1455 
1456 	if (lsn) {
1457 		ulint	flush_log_at_trx_commit;
1458 
1459 		if (trx->insert_undo != NULL) {
1460 
1461 			trx_undo_insert_cleanup(trx);
1462 		}
1463 
1464 		if (srv_use_global_flush_log_at_trx_commit) {
1465 			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
1466 		} else {
1467 			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
1468 		}
1469 
1470 		/* NOTE that we could possibly make a group commit more
1471 		efficient here: call os_thread_yield here to allow also other
1472 		trxs to come to commit! */
1473 
1474 		/*-------------------------------------*/
1475 
1476 		/* Depending on the my.cnf options, we may now write the log
1477 		buffer to the log files, making the transaction durable if
1478 		the OS does not crash. We may also flush the log files to
1479 		disk, making the transaction durable also at an OS crash or a
1480 		power outage.
1481 
1482 		The idea in InnoDB's group commit is that a group of
1483 		transactions gather behind a trx doing a physical disk write
1484 		to log files, and when that physical write has been completed,
1485 		one of those transactions does a write which commits the whole
1486 		group. Note that this group commit will only bring benefit if
1487 		there are > 2 users in the database. Then at least 2 users can
1488 		gather behind one doing the physical log write to disk.
1489 
1490 		If we are calling trx_commit() under prepare_commit_mutex, we
1491 		will delay possible log write and flush to a separate function
1492 		trx_commit_complete_for_mysql(), which is only called when the
1493 		thread has released the mutex. This is to make the
1494 		group commit algorithm to work. Otherwise, the prepare_commit
1495 		mutex would serialize all commits and prevent a group of
1496 		transactions from gathering. */
1497 
1498 		if (trx->flush_log_later) {
1499 			/* Do nothing yet */
1500 			trx->must_flush_log_later = TRUE;
1501 		} else if (flush_log_at_trx_commit == 0
1502 			   || thd_requested_durability(trx->mysql_thd)
1503 			   == HA_IGNORE_DURABILITY) {
1504 			/* Do nothing */
1505 		} else {
1506 			trx_flush_log_if_needed(lsn, trx);
1507 		}
1508 
1509 		trx->commit_lsn = lsn;
1510 
1511 		/* Tell server some activity has happened, since the trx
1512 		does changes something. Background utility threads like
1513 		master thread, purge thread or page_cleaner thread might
1514 		have some work to do. */
1515 		srv_active_wake_master_thread();
1516 	}
1517 
1518 	/* undo_no is non-zero if we're doing the final commit. */
1519 	bool			not_rollback = trx->undo_no != 0;
1520 	/* Free all savepoints, starting from the first. */
1521 	trx_named_savept_t*	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
1522 	trx_roll_savepoints_free(trx, savep);
1523 
1524 	trx->rseg = NULL;
1525 	trx->undo_no = 0;
1526 	trx->last_sql_stat_start.least_undo_no = 0;
1527 
1528 	trx->ddl = false;
1529 #ifdef UNIV_DEBUG
1530 	ut_ad(trx->start_file != 0);
1531 	ut_ad(trx->start_line != 0);
1532 	trx->start_file = 0;
1533 	trx->start_line = 0;
1534 #endif /* UNIV_DEBUG */
1535 
1536 	trx->will_lock = 0;
1537 	trx->read_only = FALSE;
1538 	trx->auto_commit = FALSE;
1539 
1540         if (trx->fts_trx) {
1541                 trx_finalize_for_fts(trx, not_rollback);
1542         }
1543 
1544 	ut_ad(trx->lock.wait_thr == NULL);
1545 	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
1546 	ut_ad(!trx->in_ro_trx_list);
1547 	ut_ad(!trx->in_rw_trx_list);
1548 
1549 	trx->dict_operation = TRX_DICT_OP_NONE;
1550 
1551 	trx->error_state = DB_SUCCESS;
1552 
1553 	/* trx->in_mysql_trx_list would hold between
1554 	trx_allocate_for_mysql() and trx_free_for_mysql(). It does not
1555 	hold for recovered transactions or system transactions. */
1556 }
1557 
1558 /****************************************************************//**
1559 Commits a transaction and a mini-transaction. */
1560 UNIV_INTERN
1561 void
trx_commit_low(trx_t * trx,mtr_t * mtr)1562 trx_commit_low(
1563 /*===========*/
1564 	trx_t*	trx,	/*!< in/out: transaction */
1565 	mtr_t*	mtr)	/*!< in/out: mini-transaction (will be committed),
1566 			or NULL if trx made no modifications */
1567 {
1568 	lsn_t	lsn;
1569 
1570 	assert_trx_nonlocking_or_in_list(trx);
1571 	ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
1572 	ut_ad(!mtr || mtr->state == MTR_ACTIVE);
1573 	ut_ad(!mtr == !(trx->insert_undo || trx->update_undo));
1574 
1575 	/* undo_no is non-zero if we're doing the final commit. */
1576 	if (trx->fts_trx && trx->undo_no != 0) {
1577 		dberr_t	error;
1578 
1579 		ut_a(!trx_is_autocommit_non_locking(trx));
1580 
1581 		error = fts_commit(trx);
1582 
1583 		/* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY
1584 		instead of dying. This is a possible scenario if there
1585 		is a crash between insert to DELETED table committing
1586 		and transaction committing. The fix would be able to
1587 		return error from this function */
1588 		if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) {
1589 			/* FTS-FIXME: once we can return values from this
1590 			function, we should do so and signal an error
1591 			instead of just dying. */
1592 
1593 			ut_error;
1594 		}
1595 	}
1596 
1597 	if (mtr) {
1598 		trx_write_serialisation_history(trx, mtr);
1599 		/* The following call commits the mini-transaction, making the
1600 		whole transaction committed in the file-based world, at this
1601 		log sequence number. The transaction becomes 'durable' when
1602 		we write the log to disk, but in the logical sense the commit
1603 		in the file-based data structures (undo logs etc.) happens
1604 		here.
1605 
1606 		NOTE that transaction numbers, which are assigned only to
1607 		transactions with an update undo log, do not necessarily come
1608 		in exactly the same order as commit lsn's, if the transactions
1609 		have different rollback segments. To get exactly the same
1610 		order we should hold the kernel mutex up to this point,
1611 		adding to the contention of the kernel mutex. However, if
1612 		a transaction T2 is able to see modifications made by
1613 		a transaction T1, T2 will always get a bigger transaction
1614 		number and a bigger commit lsn than T1. */
1615 
1616 		/*--------------*/
1617 		mtr_commit(mtr);
1618 		/*--------------*/
1619 		lsn = mtr->end_lsn;
1620 	} else {
1621 		lsn = 0;
1622 	}
1623 
1624 	trx_commit_in_memory(trx, lsn);
1625 }
1626 
1627 /****************************************************************//**
1628 Commits a transaction. */
1629 UNIV_INTERN
1630 void
trx_commit(trx_t * trx)1631 trx_commit(
1632 /*=======*/
1633 	trx_t*	trx)	/*!< in/out: transaction */
1634 {
1635 	mtr_t	local_mtr;
1636 	mtr_t*	mtr;
1637 
1638 	if (trx->insert_undo || trx->update_undo) {
1639 		mtr = &local_mtr;
1640 		mtr_start(mtr);
1641 	} else {
1642 		mtr = NULL;
1643 	}
1644 
1645 	trx_commit_low(trx, mtr);
1646 }
1647 
1648 /****************************************************************//**
1649 Cleans up a transaction at database startup. The cleanup is needed if
1650 the transaction already got to the middle of a commit when the database
1651 crashed, and we cannot roll it back. */
1652 UNIV_INTERN
1653 void
trx_cleanup_at_db_startup(trx_t * trx)1654 trx_cleanup_at_db_startup(
1655 /*======================*/
1656 	trx_t*	trx)	/*!< in: transaction */
1657 {
1658 	ut_ad(trx->is_recovered);
1659 
1660 	if (trx->insert_undo != NULL) {
1661 
1662 		trx_undo_insert_cleanup(trx);
1663 	}
1664 
1665 	trx->rseg = NULL;
1666 	trx->undo_no = 0;
1667 	trx->last_sql_stat_start.least_undo_no = 0;
1668 
1669 	mutex_enter(&trx_sys->mutex);
1670 
1671 	ut_a(!trx->read_only);
1672 
1673 	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
1674 	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
1675 
1676 	assert_trx_in_rw_list(trx);
1677 	ut_d(trx->in_rw_trx_list = FALSE);
1678 
1679 	trx->state = TRX_STATE_NOT_STARTED;
1680 	trx_release_descriptor(trx);
1681 
1682 	mutex_exit(&trx_sys->mutex);
1683 
1684 	/* Change the transaction state without mutex protection, now
1685 	that it no longer is in the trx_list. Recovered transactions
1686 	are never placed in the mysql_trx_list. */
1687 	ut_ad(trx->is_recovered);
1688 	ut_ad(!trx->in_ro_trx_list);
1689 	ut_ad(!trx->in_rw_trx_list);
1690 	ut_ad(!trx->in_mysql_trx_list);
1691 }
1692 
1693 /********************************************************************//**
1694 Assigns a read view for a consistent read query. All the consistent reads
1695 within the same transaction will get the same read view, which is created
1696 when this function is first called for a new started transaction.
1697 @return	consistent read view */
1698 UNIV_INTERN
1699 read_view_t*
trx_assign_read_view(trx_t * trx)1700 trx_assign_read_view(
1701 /*=================*/
1702 	trx_t*	trx)	/*!< in: active transaction */
1703 {
1704 	ut_ad(trx->state == TRX_STATE_ACTIVE);
1705 
1706 	if (trx->read_view != NULL) {
1707 		return(trx->read_view);
1708 	}
1709 
1710 	trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view);
1711 	trx->global_read_view = trx->read_view;
1712 
1713 	return(trx->read_view);
1714 }
1715 
1716 /********************************************************************//**
1717 Clones the read view from another transaction. All consistent reads within
1718 the receiver transaction will get the same read view as the donor transaction
1719 @return read view clone */
1720 UNIV_INTERN
1721 read_view_t*
trx_clone_read_view(trx_t * trx,trx_t * from_trx)1722 trx_clone_read_view(
1723 /*================*/
1724 	trx_t*	trx,		/*!< in: receiver transaction */
1725 	trx_t*	from_trx)	/*!< in: donor transaction */
1726 {
1727 	ut_ad(lock_mutex_own());
1728 	ut_ad(mutex_own(&trx_sys->mutex));
1729 	ut_ad(trx_mutex_own(from_trx));
1730 	ut_ad(trx->read_view == NULL);
1731 
1732 	if (from_trx->state != TRX_STATE_ACTIVE ||
1733 	    from_trx->read_view == NULL) {
1734 
1735 		return(NULL);
1736 	}
1737 
1738 	trx->read_view = read_view_clone(from_trx->read_view,
1739 					 trx->prebuilt_view);
1740 
1741 	read_view_add(trx->read_view);
1742 
1743 	trx->global_read_view = trx->read_view;
1744 
1745 	return(trx->read_view);
1746 }
1747 
1748 /****************************************************************//**
1749 Prepares a transaction for commit/rollback. */
1750 UNIV_INTERN
1751 void
trx_commit_or_rollback_prepare(trx_t * trx)1752 trx_commit_or_rollback_prepare(
1753 /*===========================*/
1754 	trx_t*	trx)		/*!< in/out: transaction */
1755 {
1756 	/* We are reading trx->state without holding trx_sys->mutex
1757 	here, because the commit or rollback should be invoked for a
1758 	running (or recovered prepared) transaction that is associated
1759 	with the current thread. */
1760 
1761 	switch (trx->state) {
1762 	case TRX_STATE_NOT_STARTED:
1763 		trx_start_low(trx);
1764 		/* fall through */
1765 	case TRX_STATE_ACTIVE:
1766 	case TRX_STATE_PREPARED:
1767 		/* If the trx is in a lock wait state, moves the waiting
1768 		query thread to the suspended state */
1769 
1770 		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
1771 
1772 			ulint		sec;
1773 			ulint		ms;
1774 			ib_uint64_t	now;
1775 
1776 			ut_a(trx->lock.wait_thr != NULL);
1777 			trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
1778 			trx->lock.wait_thr = NULL;
1779 
1780 			if (UNIV_UNLIKELY(trx->take_stats)) {
1781 				ut_usectime(&sec, &ms);
1782 				now = (ib_uint64_t)sec * 1000000 + ms;
1783 				trx->lock_que_wait_timer += now - trx->lock_que_wait_ustarted;
1784 			}
1785 
1786 			trx->lock.que_state = TRX_QUE_RUNNING;
1787 		}
1788 
1789 		ut_a(trx->lock.n_active_thrs == 1);
1790 		return;
1791 	case TRX_STATE_COMMITTED_IN_MEMORY:
1792 		break;
1793 	}
1794 
1795 	ut_error;
1796 }
1797 
1798 /*********************************************************************//**
1799 Creates a commit command node struct.
1800 @return	own: commit node struct */
1801 UNIV_INTERN
1802 commit_node_t*
trx_commit_node_create(mem_heap_t * heap)1803 trx_commit_node_create(
1804 /*===================*/
1805 	mem_heap_t*	heap)	/*!< in: mem heap where created */
1806 {
1807 	commit_node_t*	node;
1808 
1809 	node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
1810 	node->common.type  = QUE_NODE_COMMIT;
1811 	node->state = COMMIT_NODE_SEND;
1812 
1813 	return(node);
1814 }
1815 
1816 /***********************************************************//**
1817 Performs an execution step for a commit type node in a query graph.
1818 @return	query thread to run next, or NULL */
1819 UNIV_INTERN
1820 que_thr_t*
trx_commit_step(que_thr_t * thr)1821 trx_commit_step(
1822 /*============*/
1823 	que_thr_t*	thr)	/*!< in: query thread */
1824 {
1825 	commit_node_t*	node;
1826 
1827 	node = static_cast<commit_node_t*>(thr->run_node);
1828 
1829 	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
1830 
1831 	if (thr->prev_node == que_node_get_parent(node)) {
1832 		node->state = COMMIT_NODE_SEND;
1833 	}
1834 
1835 	if (node->state == COMMIT_NODE_SEND) {
1836 		trx_t*	trx;
1837 
1838 		node->state = COMMIT_NODE_WAIT;
1839 
1840 		trx = thr_get_trx(thr);
1841 
1842 		ut_a(trx->lock.wait_thr == NULL);
1843 		ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
1844 
1845 		trx_commit_or_rollback_prepare(trx);
1846 
1847 		trx->lock.que_state = TRX_QUE_COMMITTING;
1848 
1849 		trx_commit(trx);
1850 
1851 		ut_ad(trx->lock.wait_thr == NULL);
1852 
1853 		trx->lock.que_state = TRX_QUE_RUNNING;
1854 
1855 		thr = NULL;
1856 	} else {
1857 		ut_ad(node->state == COMMIT_NODE_WAIT);
1858 
1859 		node->state = COMMIT_NODE_SEND;
1860 
1861 		thr->run_node = que_node_get_parent(node);
1862 	}
1863 
1864 	return(thr);
1865 }
1866 
1867 /**********************************************************************//**
1868 Does the transaction commit for MySQL.
1869 @return	DB_SUCCESS or error number */
1870 UNIV_INTERN
1871 dberr_t
trx_commit_for_mysql(trx_t * trx)1872 trx_commit_for_mysql(
1873 /*=================*/
1874 	trx_t*	trx)	/*!< in/out: transaction */
1875 {
1876 	/* Because we do not do the commit by sending an Innobase
1877 	sig to the transaction, we must here make sure that trx has been
1878 	started. */
1879 
1880 	ut_a(trx);
1881 
1882 	switch (trx->state) {
1883 	case TRX_STATE_NOT_STARTED:
1884 		/* Update the info whether we should skip XA steps that eat
1885 		CPU time.
1886 
1887 		For the duration of the transaction trx->support_xa is
1888 		not reread from thd so any changes in the value take
1889 		effect in the next transaction. This is to avoid a
1890 		scenario where some undo log records generated by a
1891 		transaction contain XA information and other undo log
1892 		records, generated by the same transaction do not. */
1893 		trx->support_xa = thd_supports_xa(trx->mysql_thd);
1894 
1895 		ut_d(trx->start_file = __FILE__);
1896 		ut_d(trx->start_line = __LINE__);
1897 
1898 		trx_start_low(trx);
1899 		/* fall through */
1900 	case TRX_STATE_ACTIVE:
1901 	case TRX_STATE_PREPARED:
1902 		trx->op_info = "committing";
1903 		trx_commit(trx);
1904 		MONITOR_DEC(MONITOR_TRX_ACTIVE);
1905 		trx->op_info = "";
1906 		return(DB_SUCCESS);
1907 	case TRX_STATE_COMMITTED_IN_MEMORY:
1908 		break;
1909 	}
1910 	ut_error;
1911 	return(DB_CORRUPTION);
1912 }
1913 
1914 /**********************************************************************//**
1915 If required, flushes the log to disk if we called trx_commit_for_mysql()
1916 with trx->flush_log_later == TRUE. */
1917 UNIV_INTERN
1918 void
trx_commit_complete_for_mysql(trx_t * trx)1919 trx_commit_complete_for_mysql(
1920 /*==========================*/
1921 	trx_t*	trx)	/*!< in/out: transaction */
1922 {
1923 	ut_a(trx);
1924 
1925 	if (!trx->must_flush_log_later
1926 	    || thd_requested_durability(trx->mysql_thd)
1927 	       == HA_IGNORE_DURABILITY) {
1928 		return;
1929 	}
1930 
1931 	trx_flush_log_if_needed(trx->commit_lsn, trx);
1932 
1933 	trx->must_flush_log_later = FALSE;
1934 }
1935 
1936 /**********************************************************************//**
1937 Marks the latest SQL statement ended. */
1938 UNIV_INTERN
1939 void
trx_mark_sql_stat_end(trx_t * trx)1940 trx_mark_sql_stat_end(
1941 /*==================*/
1942 	trx_t*	trx)	/*!< in: trx handle */
1943 {
1944 	ut_a(trx);
1945 
1946 	switch (trx->state) {
1947 	case TRX_STATE_PREPARED:
1948 	case TRX_STATE_COMMITTED_IN_MEMORY:
1949 		break;
1950 	case TRX_STATE_NOT_STARTED:
1951 		trx->undo_no = 0;
1952 		/* fall through */
1953 	case TRX_STATE_ACTIVE:
1954 		trx->last_sql_stat_start.least_undo_no = trx->undo_no;
1955 
1956 		if (trx->fts_trx) {
1957 			fts_savepoint_laststmt_refresh(trx);
1958 		}
1959 
1960 		return;
1961 	}
1962 
1963 	ut_error;
1964 }
1965 
1966 /**********************************************************************//**
1967 Prints info about a transaction.
1968 Caller must hold trx_sys->mutex. */
1969 UNIV_INTERN
1970 void
trx_print_low(FILE * f,const trx_t * trx,ulint max_query_len,ulint n_rec_locks,ulint n_trx_locks,ulint heap_size)1971 trx_print_low(
1972 /*==========*/
1973 	FILE*		f,
1974 			/*!< in: output stream */
1975 	const trx_t*	trx,
1976 			/*!< in: transaction */
1977 	ulint		max_query_len,
1978 			/*!< in: max query length to print,
1979 			or 0 to use the default max length */
1980 	ulint		n_rec_locks,
1981 			/*!< in: lock_number_of_rows_locked(&trx->lock) */
1982 	ulint		n_trx_locks,
1983 			/*!< in: length of trx->lock.trx_locks */
1984 	ulint		heap_size)
1985 			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
1986 {
1987 	ibool		newline;
1988 	const char*	op_info;
1989 
1990 	ut_ad(mutex_own(&trx_sys->mutex));
1991 
1992 	fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
1993 
1994 	/* trx->state cannot change from or to NOT_STARTED while we
1995 	are holding the trx_sys->mutex. It may change from ACTIVE to
1996 	PREPARED or COMMITTED. */
1997 	switch (trx->state) {
1998 	case TRX_STATE_NOT_STARTED:
1999 		fputs(", not started", f);
2000 		goto state_ok;
2001 	case TRX_STATE_ACTIVE:
2002 		fprintf(f, ", ACTIVE %lu sec",
2003 			(ulong) difftime(time(NULL), trx->start_time));
2004 		goto state_ok;
2005 	case TRX_STATE_PREPARED:
2006 		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
2007 			(ulong) difftime(time(NULL), trx->start_time));
2008 		goto state_ok;
2009 	case TRX_STATE_COMMITTED_IN_MEMORY:
2010 		fputs(", COMMITTED IN MEMORY", f);
2011 		goto state_ok;
2012 	}
2013 	fprintf(f, ", state %lu", (ulong) trx->state);
2014 	ut_ad(0);
2015 state_ok:
2016 
2017 	/* prevent a race condition */
2018 	op_info = trx->op_info;
2019 
2020 	if (*op_info) {
2021 		putc(' ', f);
2022 		fputs(op_info, f);
2023 	}
2024 
2025 	if (trx->is_recovered) {
2026 		fputs(" recovered trx", f);
2027 	}
2028 
2029 	if (trx->declared_to_be_inside_innodb) {
2030 		fprintf(f, ", thread declared inside InnoDB %lu",
2031 			(ulong) trx->n_tickets_to_enter_innodb);
2032 	}
2033 
2034 	putc('\n', f);
2035 
2036 	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
2037 		fprintf(f, "mysql tables in use %lu, locked %lu\n",
2038 			(ulong) trx->n_mysql_tables_in_use,
2039 			(ulong) trx->mysql_n_tables_locked);
2040 	}
2041 
2042 	newline = TRUE;
2043 
2044 	/* trx->lock.que_state of an ACTIVE transaction may change
2045 	while we are not holding trx->mutex. We perform a dirty read
2046 	for performance reasons. */
2047 
2048 	switch (trx->lock.que_state) {
2049 	case TRX_QUE_RUNNING:
2050 		newline = FALSE; break;
2051 	case TRX_QUE_LOCK_WAIT:
2052 		fputs("LOCK WAIT ", f); break;
2053 	case TRX_QUE_ROLLING_BACK:
2054 		fputs("ROLLING BACK ", f); break;
2055 	case TRX_QUE_COMMITTING:
2056 		fputs("COMMITTING ", f); break;
2057 	default:
2058 		fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
2059 	}
2060 
2061 	if (n_trx_locks > 0 || heap_size > 400) {
2062 		newline = TRUE;
2063 
2064 		fprintf(f, "%lu lock struct(s), heap size %lu,"
2065 			" %lu row lock(s)",
2066 			(ulong) n_trx_locks,
2067 			(ulong) heap_size,
2068 			(ulong) n_rec_locks);
2069 	}
2070 
2071 	if (trx->has_search_latch) {
2072 		newline = TRUE;
2073 		fputs(", holds adaptive hash latch", f);
2074 	}
2075 
2076 	if (trx->undo_no != 0) {
2077 		newline = TRUE;
2078 		fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
2079 	}
2080 
2081 	if (newline) {
2082 		putc('\n', f);
2083 	}
2084 
2085 	if (trx->mysql_thd != NULL) {
2086 		innobase_mysql_print_thd(
2087 			f, trx->mysql_thd, static_cast<uint>(max_query_len));
2088 	}
2089 }
2090 
2091 /**********************************************************************//**
2092 Prints info about a transaction.
2093 The caller must hold lock_sys->mutex and trx_sys->mutex.
2094 When possible, use trx_print() instead. */
2095 UNIV_INTERN
2096 void
trx_print_latched(FILE * f,const trx_t * trx,ulint max_query_len)2097 trx_print_latched(
2098 /*==============*/
2099 	FILE*		f,		/*!< in: output stream */
2100 	const trx_t*	trx,		/*!< in: transaction */
2101 	ulint		max_query_len)	/*!< in: max query length to print,
2102 					or 0 to use the default max length */
2103 {
2104 	ut_ad(lock_mutex_own());
2105 	ut_ad(mutex_own(&trx_sys->mutex));
2106 
2107 	trx_print_low(f, trx, max_query_len,
2108 		      lock_number_of_rows_locked(&trx->lock),
2109 		      UT_LIST_GET_LEN(trx->lock.trx_locks),
2110 		      mem_heap_get_size(trx->lock.lock_heap));
2111 }
2112 
2113 /**********************************************************************//**
2114 Prints info about a transaction.
2115 Acquires and releases lock_sys->mutex and trx_sys->mutex. */
2116 UNIV_INTERN
2117 void
trx_print(FILE * f,const trx_t * trx,ulint max_query_len)2118 trx_print(
2119 /*======*/
2120 	FILE*		f,		/*!< in: output stream */
2121 	const trx_t*	trx,		/*!< in: transaction */
2122 	ulint		max_query_len)	/*!< in: max query length to print,
2123 					or 0 to use the default max length */
2124 {
2125 	ulint	n_rec_locks;
2126 	ulint	n_trx_locks;
2127 	ulint	heap_size;
2128 
2129 	lock_mutex_enter();
2130 	n_rec_locks = lock_number_of_rows_locked(&trx->lock);
2131 	n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
2132 	heap_size = mem_heap_get_size(trx->lock.lock_heap);
2133 	lock_mutex_exit();
2134 
2135 	mutex_enter(&trx_sys->mutex);
2136 	trx_print_low(f, trx, max_query_len,
2137 		      n_rec_locks, n_trx_locks, heap_size);
2138 	mutex_exit(&trx_sys->mutex);
2139 }
2140 
2141 #ifdef UNIV_DEBUG
2142 /**********************************************************************//**
2143 Asserts that a transaction has been started.
2144 The caller must hold trx_sys->mutex.
2145 @return TRUE if started */
2146 UNIV_INTERN
2147 ibool
trx_assert_started(const trx_t * trx)2148 trx_assert_started(
2149 /*===============*/
2150 	const trx_t*	trx)	/*!< in: transaction */
2151 {
2152 	ut_ad(mutex_own(&trx_sys->mutex));
2153 
2154 	/* Non-locking autocommits should not hold any locks and this
2155 	function is only called from the locking code. */
2156 	assert_trx_in_list(trx);
2157 
2158 	/* trx->state can change from or to NOT_STARTED while we are holding
2159 	trx_sys->mutex for non-locking autocommit selects but not for other
2160 	types of transactions. It may change from ACTIVE to PREPARED. Unless
2161 	we are holding lock_sys->mutex, it may also change to COMMITTED. */
2162 
2163 	switch (trx->state) {
2164 	case TRX_STATE_PREPARED:
2165 		return(TRUE);
2166 
2167 	case TRX_STATE_ACTIVE:
2168 	case TRX_STATE_COMMITTED_IN_MEMORY:
2169 		return(TRUE);
2170 
2171 	case TRX_STATE_NOT_STARTED:
2172 		break;
2173 	}
2174 
2175 	ut_error;
2176 	return(FALSE);
2177 }
2178 #endif /* UNIV_DEBUG */
2179 
2180 /*******************************************************************//**
2181 Compares the "weight" (or size) of two transactions. Transactions that
2182 have edited non-transactional tables are considered heavier than ones
2183 that have not.
2184 @return	TRUE if weight(a) >= weight(b) */
2185 UNIV_INTERN
2186 ibool
trx_weight_ge(const trx_t * a,const trx_t * b)2187 trx_weight_ge(
2188 /*==========*/
2189 	const trx_t*	a,	/*!< in: the first transaction to be compared */
2190 	const trx_t*	b)	/*!< in: the second transaction to be compared */
2191 {
2192 	ibool	a_notrans_edit;
2193 	ibool	b_notrans_edit;
2194 
2195 	/* If mysql_thd is NULL for a transaction we assume that it has
2196 	not edited non-transactional tables. */
2197 
2198 	a_notrans_edit = a->mysql_thd != NULL
2199 		&& thd_has_edited_nontrans_tables(a->mysql_thd);
2200 
2201 	b_notrans_edit = b->mysql_thd != NULL
2202 		&& thd_has_edited_nontrans_tables(b->mysql_thd);
2203 
2204 	if (a_notrans_edit != b_notrans_edit) {
2205 
2206 		return(a_notrans_edit);
2207 	}
2208 
2209 	/* Either both had edited non-transactional tables or both had
2210 	not, we fall back to comparing the number of altered/locked
2211 	rows. */
2212 
2213 #if 0
2214 	fprintf(stderr,
2215 		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
2216 		__func__,
2217 		a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks),
2218 		b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks));
2219 #endif
2220 
2221 	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
2222 }
2223 
2224 /****************************************************************//**
2225 Prepares a transaction. */
2226 static
2227 void
trx_prepare(trx_t * trx)2228 trx_prepare(
2229 /*========*/
2230 	trx_t*	trx)	/*!< in/out: transaction */
2231 {
2232 	trx_rseg_t*	rseg;
2233 	lsn_t		lsn;
2234 	mtr_t		mtr;
2235 
2236 	rseg = trx->rseg;
2237 	/* Only fresh user transactions can be prepared.
2238 	Recovered transactions cannot. */
2239 	ut_a(!trx->is_recovered);
2240 
2241 	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
2242 
2243 		mtr_start(&mtr);
2244 
2245 		/* Change the undo log segment states from TRX_UNDO_ACTIVE
2246 		to TRX_UNDO_PREPARED: these modifications to the file data
2247 		structure define the transaction as prepared in the
2248 		file-based world, at the serialization point of lsn. */
2249 
2250 		mutex_enter(&rseg->mutex);
2251 
2252 		if (trx->insert_undo != NULL) {
2253 
2254 			/* It is not necessary to obtain trx->undo_mutex here
2255 			because only a single OS thread is allowed to do the
2256 			transaction prepare for this transaction. */
2257 
2258 			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
2259 						      &mtr);
2260 		}
2261 
2262 		if (trx->update_undo) {
2263 			trx_undo_set_state_at_prepare(
2264 				trx, trx->update_undo, &mtr);
2265 		}
2266 
2267 		mutex_exit(&rseg->mutex);
2268 
2269 		/*--------------*/
2270 		mtr_commit(&mtr);	/* This mtr commit makes the
2271 					transaction prepared in the file-based
2272 					world */
2273 		/*--------------*/
2274 		lsn = mtr.end_lsn;
2275 		ut_ad(lsn);
2276 	} else {
2277 		lsn = 0;
2278 	}
2279 
2280 	/*--------------------------------------*/
2281 	ut_a(trx->state == TRX_STATE_ACTIVE);
2282 	mutex_enter(&trx_sys->mutex);
2283 	trx->state = TRX_STATE_PREPARED;
2284 	trx_sys->n_prepared_trx++;
2285 	mutex_exit(&trx_sys->mutex);
2286 	/*--------------------------------------*/
2287 
2288 	if (lsn) {
2289 		/* Depending on the my.cnf options, we may now write the log
2290 		buffer to the log files, making the prepared state of the
2291 		transaction durable if the OS does not crash. We may also
2292 		flush the log files to disk, making the prepared state of the
2293 		transaction durable also at an OS crash or a power outage.
2294 
2295 		The idea in InnoDB's group prepare is that a group of
2296 		transactions gather behind a trx doing a physical disk write
2297 		to log files, and when that physical write has been completed,
2298 		one of those transactions does a write which prepares the whole
2299 		group. Note that this group prepare will only bring benefit if
2300 		there are > 2 users in the database. Then at least 2 users can
2301 		gather behind one doing the physical log write to disk.
2302 
2303 		TODO: find out if MySQL holds some mutex when calling this.
2304 		That would spoil our group prepare algorithm. */
2305 
2306 		trx_flush_log_if_needed(lsn, trx);
2307 	}
2308 }
2309 
2310 /**********************************************************************//**
2311 Does the transaction prepare for MySQL. */
2312 UNIV_INTERN
2313 void
trx_prepare_for_mysql(trx_t * trx)2314 trx_prepare_for_mysql(
2315 /*==================*/
2316 	trx_t*	trx)	/*!< in/out: trx handle */
2317 {
2318 	trx_start_if_not_started_xa(trx);
2319 
2320 	trx->op_info = "preparing";
2321 
2322 	trx_prepare(trx);
2323 
2324 	trx->op_info = "";
2325 }
2326 
2327 /**********************************************************************//**
2328 This function is used to find number of prepared transactions and
2329 their transaction objects for a recovery.
2330 @return	number of prepared transactions stored in xid_list */
2331 UNIV_INTERN
2332 int
trx_recover_for_mysql(XID * xid_list,ulint len)2333 trx_recover_for_mysql(
2334 /*==================*/
2335 	XID*	xid_list,	/*!< in/out: prepared transactions */
2336 	ulint	len)		/*!< in: number of slots in xid_list */
2337 {
2338 	const trx_t*	trx;
2339 	ulint		count = 0;
2340 
2341 	ut_ad(xid_list);
2342 	ut_ad(len);
2343 
2344 	/* We should set those transactions which are in the prepared state
2345 	to the xid_list */
2346 
2347 	mutex_enter(&trx_sys->mutex);
2348 
2349 	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
2350 	     trx != NULL;
2351 	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
2352 
2353 		assert_trx_in_rw_list(trx);
2354 
2355 		/* The state of a read-write transaction cannot change
2356 		from or to NOT_STARTED while we are holding the
2357 		trx_sys->mutex. It may change to PREPARED, but not if
2358 		trx->is_recovered. It may also change to COMMITTED. */
2359 		if (trx_state_eq(trx, TRX_STATE_PREPARED)) {
2360 			xid_list[count] = trx->xid;
2361 
2362 			if (count == 0) {
2363 				ut_print_timestamp(stderr);
2364 				fprintf(stderr,
2365 					"  InnoDB: Starting recovery for"
2366 					" XA transactions...\n");
2367 			}
2368 
2369 			ut_print_timestamp(stderr);
2370 			fprintf(stderr,
2371 				"  InnoDB: Transaction " TRX_ID_FMT " in"
2372 				" prepared state after recovery\n",
2373 				trx->id);
2374 
2375 			ut_print_timestamp(stderr);
2376 			fprintf(stderr,
2377 				"  InnoDB: Transaction contains changes"
2378 				" to " TRX_ID_FMT " rows\n",
2379 				trx->undo_no);
2380 
2381 			count++;
2382 
2383 			if (count == len) {
2384 				break;
2385 			}
2386 		}
2387 	}
2388 
2389 	mutex_exit(&trx_sys->mutex);
2390 
2391 	if (count > 0){
2392 		ut_print_timestamp(stderr);
2393 		fprintf(stderr,
2394 			"  InnoDB: %d transactions in prepared state"
2395 			" after recovery\n",
2396 			int (count));
2397 	}
2398 
2399 	return(int (count));
2400 }
2401 
2402 /*******************************************************************//**
2403 This function is used to find one X/Open XA distributed transaction
2404 which is in the prepared state
2405 @return	trx on match, the trx->xid will be invalidated;
2406 note that the trx may have been committed, unless the caller is
2407 holding lock_sys->mutex */
2408 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2409 trx_t*
trx_get_trx_by_xid_low(const XID * xid)2410 trx_get_trx_by_xid_low(
2411 /*===================*/
2412 	const XID*	xid)		/*!< in: X/Open XA transaction
2413 					identifier */
2414 {
2415 	trx_t*		trx;
2416 
2417 	ut_ad(mutex_own(&trx_sys->mutex));
2418 
2419 	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
2420 	     trx != NULL;
2421 	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
2422 
2423 		assert_trx_in_rw_list(trx);
2424 
2425 		/* Compare two X/Open XA transaction id's: their
2426 		length should be the same and binary comparison
2427 		of gtrid_length+bqual_length bytes should be
2428 		the same */
2429 
2430 		if (trx->is_recovered
2431 		    && trx_state_eq(trx, TRX_STATE_PREPARED)
2432 		    && xid->gtrid_length == trx->xid.gtrid_length
2433 		    && xid->bqual_length == trx->xid.bqual_length
2434 		    && memcmp(xid->data, trx->xid.data,
2435 			      xid->gtrid_length + xid->bqual_length) == 0) {
2436 
2437 			/* Invalidate the XID, so that subsequent calls
2438 			will not find it. */
2439 			memset(static_cast<void*>(&trx->xid), 0,
2440 			       sizeof(trx->xid));
2441 			trx->xid.formatID = -1;
2442 			break;
2443 		}
2444 	}
2445 
2446 	return(trx);
2447 }
2448 
2449 /*******************************************************************//**
2450 This function is used to find one X/Open XA distributed transaction
2451 which is in the prepared state
2452 @return	trx or NULL; on match, the trx->xid will be invalidated;
2453 note that the trx may have been committed, unless the caller is
2454 holding lock_sys->mutex */
2455 UNIV_INTERN
2456 trx_t*
trx_get_trx_by_xid(const XID * xid)2457 trx_get_trx_by_xid(
2458 /*===============*/
2459 	const XID*	xid)	/*!< in: X/Open XA transaction identifier */
2460 {
2461 	trx_t*	trx;
2462 
2463 	if (xid == NULL) {
2464 
2465 		return(NULL);
2466 	}
2467 
2468 	mutex_enter(&trx_sys->mutex);
2469 
2470 	/* Recovered/Resurrected transactions are always only on the
2471 	trx_sys_t::rw_trx_list. */
2472 	trx = trx_get_trx_by_xid_low(xid);
2473 
2474 	mutex_exit(&trx_sys->mutex);
2475 
2476 	return(trx);
2477 }
2478 
2479 /*************************************************************//**
2480 Starts the transaction if it is not yet started. */
2481 UNIV_INTERN
2482 void
trx_start_if_not_started_xa_low(trx_t * trx)2483 trx_start_if_not_started_xa_low(
2484 /*============================*/
2485 	trx_t*	trx)	/*!< in: transaction */
2486 {
2487 	switch (trx->state) {
2488 	case TRX_STATE_NOT_STARTED:
2489 
2490 		/* Update the info whether we should skip XA steps
2491 		that eat CPU time.
2492 
2493 		For the duration of the transaction trx->support_xa is
2494 		not reread from thd so any changes in the value take
2495 		effect in the next transaction. This is to avoid a
2496 		scenario where some undo generated by a transaction,
2497 		has XA stuff, and other undo, generated by the same
2498 		transaction, doesn't. */
2499 		trx->support_xa = thd_supports_xa(trx->mysql_thd);
2500 
2501 		trx_start_low(trx);
2502 		/* fall through */
2503 	case TRX_STATE_ACTIVE:
2504 		return;
2505 	case TRX_STATE_PREPARED:
2506 	case TRX_STATE_COMMITTED_IN_MEMORY:
2507 		break;
2508 	}
2509 
2510 	ut_error;
2511 }
2512 
2513 /*************************************************************//**
2514 Starts the transaction if it is not yet started. */
2515 UNIV_INTERN
2516 void
trx_start_if_not_started_low(trx_t * trx)2517 trx_start_if_not_started_low(
2518 /*=========================*/
2519 	trx_t*	trx)	/*!< in: transaction */
2520 {
2521 	switch (trx->state) {
2522 	case TRX_STATE_NOT_STARTED:
2523 		trx_start_low(trx);
2524 		/* fall through */
2525 	case TRX_STATE_ACTIVE:
2526 		return;
2527 	case TRX_STATE_PREPARED:
2528 	case TRX_STATE_COMMITTED_IN_MEMORY:
2529 		break;
2530 	}
2531 
2532 	ut_error;
2533 }
2534 
2535 /*************************************************************//**
2536 Starts the transaction for a DDL operation. */
2537 UNIV_INTERN
2538 void
trx_start_for_ddl_low(trx_t * trx,trx_dict_op_t op)2539 trx_start_for_ddl_low(
2540 /*==================*/
2541 	trx_t*		trx,	/*!< in/out: transaction */
2542 	trx_dict_op_t	op)	/*!< in: dictionary operation type */
2543 {
2544 	switch (trx->state) {
2545 	case TRX_STATE_NOT_STARTED:
2546 		/* Flag this transaction as a dictionary operation, so that
2547 		the data dictionary will be locked in crash recovery. */
2548 
2549 		trx_set_dict_operation(trx, op);
2550 
2551 		/* Ensure it is not flagged as an auto-commit-non-locking
2552 		transation. */
2553 		trx->will_lock = 1;
2554 
2555 		trx->ddl = true;
2556 
2557 		trx_start_low(trx);
2558 		return;
2559 
2560 	case TRX_STATE_ACTIVE:
2561 		/* We have this start if not started idiom, therefore we
2562 		can't add stronger checks here. */
2563 		trx->ddl = true;
2564 
2565 		ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
2566 		ut_ad(trx->will_lock > 0);
2567 		return;
2568 	case TRX_STATE_PREPARED:
2569 	case TRX_STATE_COMMITTED_IN_MEMORY:
2570 		break;
2571 	}
2572 
2573 	ut_error;
2574 }
2575 
2576