1 /*****************************************************************************
2
3 Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
4
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc.,
15 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16
17 *****************************************************************************/
18
19 /**************************************************//**
20 @file trx/trx0trx.c
21 The transaction
22
23 Created 3/26/1996 Heikki Tuuri
24 *******************************************************/
25
26 #include "trx0trx.h"
27
28 #ifdef UNIV_NONINL
29 #include "trx0trx.ic"
30 #endif
31
32 #include "trx0undo.h"
33 #include "trx0rseg.h"
34 #include "log0log.h"
35 #include "que0que.h"
36 #include "lock0lock.h"
37 #include "trx0roll.h"
38 #include "usr0sess.h"
39 #include "read0read.h"
40 #include "srv0srv.h"
41 #include "btr0sea.h"
42 #include "os0proc.h"
43 #include "trx0xa.h"
44 #include "trx0purge.h"
45 #include "ha_prototypes.h"
46
47 /** Dummy session used currently in MySQL interface */
48 UNIV_INTERN sess_t* trx_dummy_sess = NULL;
49
50 /** Number of transactions currently allocated for MySQL: protected by
51 the kernel mutex */
52 UNIV_INTERN ulint trx_n_mysql_transactions = 0;
53 /** Number of transactions currently in the XA PREPARED state: protected by
54 the kernel mutex */
55 UNIV_INTERN ulint trx_n_prepared = 0;
56
57 #ifdef UNIV_PFS_MUTEX
58 /* Key to register the mutex with performance schema */
59 UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key;
60 #endif /* UNIV_PFS_MUTEX */
61
62 /*************************************************************//**
63 Set detailed error message for the transaction. */
64 UNIV_INTERN
65 void
trx_set_detailed_error(trx_t * trx,const char * msg)66 trx_set_detailed_error(
67 /*===================*/
68 trx_t* trx, /*!< in: transaction struct */
69 const char* msg) /*!< in: detailed error message */
70 {
71 ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
72 }
73
74 /*************************************************************//**
75 Set detailed error message for the transaction from a file. Note that the
76 file is rewinded before reading from it. */
77 UNIV_INTERN
78 void
trx_set_detailed_error_from_file(trx_t * trx,FILE * file)79 trx_set_detailed_error_from_file(
80 /*=============================*/
81 trx_t* trx, /*!< in: transaction struct */
82 FILE* file) /*!< in: file to read message from */
83 {
84 os_file_read_string(file, trx->detailed_error,
85 sizeof(trx->detailed_error));
86 }
87
88 /****************************************************************//**
89 Creates and initializes a transaction object.
90 @return own: the transaction */
91 UNIV_INTERN
92 trx_t*
trx_create(sess_t * sess)93 trx_create(
94 /*=======*/
95 sess_t* sess) /*!< in: session */
96 {
97 trx_t* trx;
98
99 ut_ad(mutex_own(&kernel_mutex));
100 ut_ad(sess);
101
102 trx = mem_alloc(sizeof(trx_t));
103
104 trx->magic_n = TRX_MAGIC_N;
105
106 trx->op_info = "";
107
108 trx->is_purge = 0;
109 trx->is_recovered = 0;
110 trx->conc_state = TRX_NOT_STARTED;
111
112 trx->is_registered = 0;
113 trx->owns_prepare_mutex = 0;
114
115 trx->start_time = ut_time();
116
117 trx->isolation_level = TRX_ISO_REPEATABLE_READ;
118
119 trx->id = 0;
120 trx->no = IB_ULONGLONG_MAX;
121
122 trx->support_xa = TRUE;
123
124 trx->check_foreigns = TRUE;
125 trx->check_unique_secondary = TRUE;
126
127 trx->flush_log_later = FALSE;
128 trx->must_flush_log_later = FALSE;
129
130 trx->dict_operation = TRX_DICT_OP_NONE;
131 trx->table_id = 0;
132
133 trx->mysql_thd = NULL;
134 trx->duplicates = 0;
135
136 trx->n_mysql_tables_in_use = 0;
137 trx->mysql_n_tables_locked = 0;
138
139 trx->mysql_log_file_name = NULL;
140 trx->mysql_log_offset = 0;
141
142 mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
143
144 trx->rseg = NULL;
145
146 trx->undo_no = 0;
147 trx->last_sql_stat_start.least_undo_no = 0;
148 trx->insert_undo = NULL;
149 trx->update_undo = NULL;
150 trx->undo_no_arr = NULL;
151
152 trx->error_state = DB_SUCCESS;
153 trx->error_key_num = 0;
154 trx->detailed_error[0] = '\0';
155
156 trx->sess = sess;
157 trx->que_state = TRX_QUE_RUNNING;
158 trx->n_active_thrs = 0;
159
160 trx->handling_signals = FALSE;
161
162 UT_LIST_INIT(trx->signals);
163 UT_LIST_INIT(trx->reply_signals);
164
165 trx->graph = NULL;
166
167 trx->wait_lock = NULL;
168 trx->was_chosen_as_deadlock_victim = FALSE;
169 UT_LIST_INIT(trx->wait_thrs);
170
171 trx->lock_heap = mem_heap_create_in_buffer(256);
172 UT_LIST_INIT(trx->trx_locks);
173
174 UT_LIST_INIT(trx->trx_savepoints);
175
176 trx->dict_operation_lock_mode = 0;
177 trx->has_search_latch = FALSE;
178 trx->search_latch_timeout = BTR_SEA_TIMEOUT;
179
180 trx->declared_to_be_inside_innodb = FALSE;
181 trx->n_tickets_to_enter_innodb = 0;
182
183 trx->global_read_view_heap = mem_heap_create(256);
184 trx->global_read_view = NULL;
185 trx->read_view = NULL;
186
187 /* Set X/Open XA transaction identification to NULL */
188 memset(&trx->xid, 0, sizeof(trx->xid));
189 trx->xid.formatID = -1;
190
191 trx->n_autoinc_rows = 0;
192
193 /* Remember to free the vector explicitly. */
194 trx->autoinc_locks = ib_vector_create(
195 mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
196
197 return(trx);
198 }
199
200 /********************************************************************//**
201 Creates a transaction object for MySQL.
202 @return own: transaction object */
203 UNIV_INTERN
204 trx_t*
trx_allocate_for_mysql(void)205 trx_allocate_for_mysql(void)
206 /*========================*/
207 {
208 trx_t* trx;
209
210 mutex_enter(&kernel_mutex);
211
212 trx = trx_create(trx_dummy_sess);
213
214 trx_n_mysql_transactions++;
215
216 UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
217
218 mutex_exit(&kernel_mutex);
219
220 return(trx);
221 }
222
223 /********************************************************************//**
224 Creates a transaction object for background operations by the master thread.
225 @return own: transaction object */
226 UNIV_INTERN
227 trx_t*
trx_allocate_for_background(void)228 trx_allocate_for_background(void)
229 /*=============================*/
230 {
231 trx_t* trx;
232
233 mutex_enter(&kernel_mutex);
234
235 trx = trx_create(trx_dummy_sess);
236
237 mutex_exit(&kernel_mutex);
238
239 return(trx);
240 }
241
242 /********************************************************************//**
243 Releases the search latch if trx has reserved it. */
244 UNIV_INTERN
245 void
trx_search_latch_release_if_reserved(trx_t * trx)246 trx_search_latch_release_if_reserved(
247 /*=================================*/
248 trx_t* trx) /*!< in: transaction */
249 {
250 if (trx->has_search_latch) {
251 rw_lock_s_unlock(&btr_search_latch);
252
253 trx->has_search_latch = FALSE;
254 }
255 }
256
257 /********************************************************************//**
258 Frees a transaction object. */
259 UNIV_INTERN
260 void
trx_free(trx_t * trx)261 trx_free(
262 /*=====*/
263 trx_t* trx) /*!< in, own: trx object */
264 {
265 ut_ad(mutex_own(&kernel_mutex));
266
267 if (trx->declared_to_be_inside_innodb) {
268 ut_print_timestamp(stderr);
269 fputs(" InnoDB: Error: Freeing a trx which is declared"
270 " to be processing\n"
271 "InnoDB: inside InnoDB.\n", stderr);
272 trx_print(stderr, trx, 600);
273 putc('\n', stderr);
274
275 /* This is an error but not a fatal error. We must keep
276 the counters like srv_conc_n_threads accurate. */
277 srv_conc_force_exit_innodb(trx);
278 }
279
280 if (trx->n_mysql_tables_in_use != 0
281 || trx->mysql_n_tables_locked != 0) {
282
283 ut_print_timestamp(stderr);
284 fprintf(stderr,
285 " InnoDB: Error: MySQL is freeing a thd\n"
286 "InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
287 "InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
288 (ulong)trx->n_mysql_tables_in_use,
289 (ulong)trx->mysql_n_tables_locked);
290
291 trx_print(stderr, trx, 600);
292
293 ut_print_buf(stderr, trx, sizeof(trx_t));
294 putc('\n', stderr);
295 }
296
297 ut_a(trx->magic_n == TRX_MAGIC_N);
298
299 trx->magic_n = 11112222;
300
301 ut_a(trx->conc_state == TRX_NOT_STARTED);
302
303 mutex_free(&(trx->undo_mutex));
304
305 ut_a(trx->insert_undo == NULL);
306 ut_a(trx->update_undo == NULL);
307
308 if (trx->undo_no_arr) {
309 trx_undo_arr_free(trx->undo_no_arr);
310 }
311
312 ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
313 ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
314
315 ut_a(trx->wait_lock == NULL);
316 ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
317
318 ut_a(!trx->has_search_latch);
319
320 ut_a(trx->dict_operation_lock_mode == 0);
321
322 if (trx->lock_heap) {
323 mem_heap_free(trx->lock_heap);
324 }
325
326 ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
327
328 if (trx->global_read_view_heap) {
329 mem_heap_free(trx->global_read_view_heap);
330 }
331
332 trx->global_read_view = NULL;
333
334 ut_a(trx->read_view == NULL);
335
336 ut_a(ib_vector_is_empty(trx->autoinc_locks));
337 /* We allocated a dedicated heap for the vector. */
338 ib_vector_free(trx->autoinc_locks);
339
340 mem_free(trx);
341 }
342
343 /********************************************************************//**
344 At shutdown, frees a transaction object that is in the PREPARED state. */
345 UNIV_INTERN
346 void
trx_free_prepared(trx_t * trx)347 trx_free_prepared(
348 /*==============*/
349 trx_t* trx) /*!< in, own: trx object */
350 {
351 ut_ad(mutex_own(&kernel_mutex));
352 ut_a(trx->conc_state == TRX_PREPARED);
353 ut_a(trx->magic_n == TRX_MAGIC_N);
354
355 /* Prepared transactions are sort of active; they allow
356 ROLLBACK and COMMIT operations. Because the system does not
357 contain any other transactions than prepared transactions at
358 the shutdown stage and because a transaction cannot become
359 PREPARED while holding locks, it is safe to release the locks
360 held by PREPARED transactions here at shutdown.*/
361 lock_release_off_kernel(trx);
362
363 trx_undo_free_prepared(trx);
364
365 mutex_free(&trx->undo_mutex);
366
367 if (trx->undo_no_arr) {
368 trx_undo_arr_free(trx->undo_no_arr);
369 }
370
371 ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
372 ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
373
374 ut_a(trx->wait_lock == NULL);
375 ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
376
377 ut_a(!trx->has_search_latch);
378
379 ut_a(trx->dict_operation_lock_mode == 0);
380
381 if (trx->lock_heap) {
382 mem_heap_free(trx->lock_heap);
383 }
384
385 if (trx->global_read_view_heap) {
386 mem_heap_free(trx->global_read_view_heap);
387 }
388
389 ut_a(ib_vector_is_empty(trx->autoinc_locks));
390 ib_vector_free(trx->autoinc_locks);
391
392 UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
393
394 mem_free(trx);
395 }
396
397 /********************************************************************//**
398 Frees a transaction object for MySQL. */
399 UNIV_INTERN
400 void
trx_free_for_mysql(trx_t * trx)401 trx_free_for_mysql(
402 /*===============*/
403 trx_t* trx) /*!< in, own: trx object */
404 {
405 mutex_enter(&kernel_mutex);
406
407 UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
408
409 trx_free(trx);
410
411 ut_a(trx_n_mysql_transactions > 0);
412
413 trx_n_mysql_transactions--;
414
415 mutex_exit(&kernel_mutex);
416 }
417
418 /********************************************************************//**
419 Frees a transaction object of a background operation of the master thread. */
420 UNIV_INTERN
421 void
trx_free_for_background(trx_t * trx)422 trx_free_for_background(
423 /*====================*/
424 trx_t* trx) /*!< in, own: trx object */
425 {
426 mutex_enter(&kernel_mutex);
427
428 trx_free(trx);
429
430 mutex_exit(&kernel_mutex);
431 }
432
433 /****************************************************************//**
434 Inserts the trx handle in the trx system trx list in the right position.
435 The list is sorted on the trx id so that the biggest id is at the list
436 start. This function is used at the database startup to insert incomplete
437 transactions to the list. */
438 static
439 void
trx_list_insert_ordered(trx_t * trx)440 trx_list_insert_ordered(
441 /*====================*/
442 trx_t* trx) /*!< in: trx handle */
443 {
444 trx_t* trx2;
445
446 ut_ad(mutex_own(&kernel_mutex));
447
448 trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
449
450 while (trx2 != NULL) {
451 if (trx->id >= trx2->id) {
452
453 ut_ad(trx->id > trx2->id);
454 break;
455 }
456 trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
457 }
458
459 if (trx2 != NULL) {
460 trx2 = UT_LIST_GET_PREV(trx_list, trx2);
461
462 if (trx2 == NULL) {
463 UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
464 } else {
465 UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
466 trx2, trx);
467 }
468 } else {
469 UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
470 }
471 }
472
473 /****************************************************************//**
474 Creates trx objects for transactions and initializes the trx list of
475 trx_sys at database start. Rollback segment and undo log lists must
476 already exist when this function is called, because the lists of
477 transactions to be rolled back or cleaned up are built based on the
478 undo log lists. */
479 UNIV_INTERN
480 void
trx_lists_init_at_db_start(void)481 trx_lists_init_at_db_start(void)
482 /*============================*/
483 {
484 trx_rseg_t* rseg;
485 trx_undo_t* undo;
486 trx_t* trx;
487
488 ut_ad(mutex_own(&kernel_mutex));
489 UT_LIST_INIT(trx_sys->trx_list);
490
491 /* Look from the rollback segments if there exist undo logs for
492 transactions */
493
494 rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
495
496 while (rseg != NULL) {
497 undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
498
499 while (undo != NULL) {
500
501 trx = trx_create(trx_dummy_sess);
502
503 trx->is_recovered = TRUE;
504 trx->id = undo->trx_id;
505 trx->xid = undo->xid;
506 trx->insert_undo = undo;
507 trx->rseg = rseg;
508
509 if (undo->state != TRX_UNDO_ACTIVE) {
510
511 /* Prepared transactions are left in
512 the prepared state waiting for a
513 commit or abort decision from MySQL */
514
515 if (undo->state == TRX_UNDO_PREPARED) {
516
517 fprintf(stderr,
518 "InnoDB: Transaction "
519 TRX_ID_FMT
520 " was in the"
521 " XA prepared state.\n",
522 (ullint) trx->id);
523
524 if (srv_force_recovery == 0) {
525
526 trx->conc_state = TRX_PREPARED;
527 trx_n_prepared++;
528 } else {
529 fprintf(stderr,
530 "InnoDB: Since"
531 " innodb_force_recovery"
532 " > 0, we will"
533 " rollback it"
534 " anyway.\n");
535
536 trx->conc_state = TRX_ACTIVE;
537 }
538 } else {
539 trx->conc_state
540 = TRX_COMMITTED_IN_MEMORY;
541 }
542
543 /* We give a dummy value for the trx no;
544 this should have no relevance since purge
545 is not interested in committed transaction
546 numbers, unless they are in the history
547 list, in which case it looks the number
548 from the disk based undo log structure */
549
550 trx->no = trx->id;
551 } else {
552 trx->conc_state = TRX_ACTIVE;
553
554 /* A running transaction always has the number
555 field inited to IB_ULONGLONG_MAX */
556
557 trx->no = IB_ULONGLONG_MAX;
558 }
559
560 if (undo->dict_operation) {
561 trx_set_dict_operation(
562 trx, TRX_DICT_OP_TABLE);
563 trx->table_id = undo->table_id;
564 }
565
566 if (!undo->empty) {
567 trx->undo_no = undo->top_undo_no + 1;
568 }
569
570 trx_list_insert_ordered(trx);
571
572 undo = UT_LIST_GET_NEXT(undo_list, undo);
573 }
574
575 undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
576
577 while (undo != NULL) {
578 trx = trx_get_on_id(undo->trx_id);
579
580 if (NULL == trx) {
581 trx = trx_create(trx_dummy_sess);
582
583 trx->is_recovered = TRUE;
584 trx->id = undo->trx_id;
585 trx->xid = undo->xid;
586
587 if (undo->state != TRX_UNDO_ACTIVE) {
588
589 /* Prepared transactions are left in
590 the prepared state waiting for a
591 commit or abort decision from MySQL */
592
593 if (undo->state == TRX_UNDO_PREPARED) {
594 fprintf(stderr,
595 "InnoDB: Transaction "
596 TRX_ID_FMT " was in the"
597 " XA prepared state.\n",
598 (ullint) trx->id);
599
600 if (srv_force_recovery == 0) {
601
602 trx->conc_state
603 = TRX_PREPARED;
604 trx_n_prepared++;
605 } else {
606 fprintf(stderr,
607 "InnoDB: Since"
608 " innodb_force_recovery"
609 " > 0, we will"
610 " rollback it"
611 " anyway.\n");
612
613 trx->conc_state
614 = TRX_ACTIVE;
615 }
616 } else {
617 trx->conc_state
618 = TRX_COMMITTED_IN_MEMORY;
619 }
620
621 /* We give a dummy value for the trx
622 number */
623
624 trx->no = trx->id;
625 } else {
626 trx->conc_state = TRX_ACTIVE;
627
628 /* A running transaction always has
629 the number field inited to
630 IB_ULONGLONG_MAX */
631
632 trx->no = IB_ULONGLONG_MAX;
633 }
634
635 trx->rseg = rseg;
636 trx_list_insert_ordered(trx);
637
638 if (undo->dict_operation) {
639 trx_set_dict_operation(
640 trx, TRX_DICT_OP_TABLE);
641 trx->table_id = undo->table_id;
642 }
643 }
644
645 trx->update_undo = undo;
646
647 if ((!undo->empty)
648 && undo->top_undo_no >= trx->undo_no) {
649
650 trx->undo_no = undo->top_undo_no + 1;
651 }
652
653 undo = UT_LIST_GET_NEXT(undo_list, undo);
654 }
655
656 rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
657 }
658 }
659
660 /******************************************************************//**
661 Assigns a rollback segment to a transaction in a round-robin fashion.
662 @return assigned rollback segment instance */
663 UNIV_INLINE
664 trx_rseg_t*
trx_assign_rseg(ulint max_undo_logs)665 trx_assign_rseg(
666 /*============*/
667 ulint max_undo_logs) /*!< in: maximum number of UNDO logs to use */
668 {
669 trx_rseg_t* rseg = trx_sys->latest_rseg;
670
671 ut_ad(mutex_own(&kernel_mutex));
672
673 rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
674
675 if (rseg == NULL || rseg->id == max_undo_logs - 1) {
676 rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
677 }
678
679 trx_sys->latest_rseg = rseg;
680
681 return(rseg);
682 }
683
684 /****************************************************************//**
685 Starts a new transaction.
686 @return TRUE */
687 UNIV_INTERN
688 ibool
trx_start_low(trx_t * trx,ulint rseg_id)689 trx_start_low(
690 /*==========*/
691 trx_t* trx, /*!< in: transaction */
692 ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
693 is passed, the system chooses the rollback segment
694 automatically in a round-robin fashion */
695 {
696 trx_rseg_t* rseg;
697
698 ut_ad(mutex_own(&kernel_mutex));
699 ut_ad(trx->rseg == NULL);
700
701 if (trx->is_purge) {
702 trx->id = 0;
703 trx->conc_state = TRX_ACTIVE;
704 trx->start_time = time(NULL);
705
706 return(TRUE);
707 }
708
709 ut_ad(trx->conc_state != TRX_ACTIVE);
710
711 ut_a(rseg_id == ULINT_UNDEFINED);
712
713 rseg = trx_assign_rseg(srv_rollback_segments);
714
715 trx->id = trx_sys_get_new_trx_id();
716
717 /* The initial value for trx->no: IB_ULONGLONG_MAX is used in
718 read_view_open_now: */
719
720 trx->no = IB_ULONGLONG_MAX;
721
722 trx->rseg = rseg;
723
724 trx->conc_state = TRX_ACTIVE;
725 trx->start_time = time(NULL);
726
727 UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
728
729 return(TRUE);
730 }
731
732 /****************************************************************//**
733 Starts a new transaction.
734 @return TRUE */
735 UNIV_INTERN
736 ibool
trx_start(trx_t * trx,ulint rseg_id)737 trx_start(
738 /*======*/
739 trx_t* trx, /*!< in: transaction */
740 ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
741 is passed, the system chooses the rollback segment
742 automatically in a round-robin fashion */
743 {
744 ibool ret;
745
746 /* Update the info whether we should skip XA steps that eat CPU time
747 For the duration of the transaction trx->support_xa is not reread
748 from thd so any changes in the value take effect in the next
749 transaction. This is to avoid a scenario where some undo
750 generated by a transaction, has XA stuff, and other undo,
751 generated by the same transaction, doesn't. */
752 trx->support_xa = thd_supports_xa(trx->mysql_thd);
753
754 mutex_enter(&kernel_mutex);
755
756 ret = trx_start_low(trx, rseg_id);
757
758 mutex_exit(&kernel_mutex);
759
760 return(ret);
761 }
762
763 /****************************************************************//**
764 Set the transaction serialisation number. */
765 static
766 void
trx_serialisation_number_get(trx_t * trx)767 trx_serialisation_number_get(
768 /*=========================*/
769 trx_t* trx) /*!< in: transaction */
770 {
771 trx_rseg_t* rseg;
772
773 rseg = trx->rseg;
774
775 ut_ad(mutex_own(&rseg->mutex));
776
777 mutex_enter(&kernel_mutex);
778
779 trx->no = trx_sys_get_new_trx_id();
780
781 /* If the rollack segment is not empty then the
782 new trx_t::no can't be less than any trx_t::no
783 already in the rollback segment. User threads only
784 produce events when a rollback segment is empty. */
785
786 if (rseg->last_page_no == FIL_NULL) {
787 void* ptr;
788 rseg_queue_t rseg_queue;
789
790 rseg_queue.rseg = rseg;
791 rseg_queue.trx_no = trx->no;
792
793 mutex_enter(&purge_sys->bh_mutex);
794
795 /* This is to reduce the pressure on the kernel mutex,
796 though in reality it should make very little (read no)
797 difference because this code path is only taken when the
798 rbs is empty. */
799
800 mutex_exit(&kernel_mutex);
801
802 ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
803 ut_a(ptr);
804
805 mutex_exit(&purge_sys->bh_mutex);
806 } else {
807 mutex_exit(&kernel_mutex);
808 }
809 }
810
811 /****************************************************************//**
812 Assign the transaction its history serialisation number and write the
813 update UNDO log record to the assigned rollback segment.
814 @return the LSN of the UNDO log write. */
815 static
816 ib_uint64_t
trx_write_serialisation_history(trx_t * trx)817 trx_write_serialisation_history(
818 /*============================*/
819 trx_t* trx) /*!< in: transaction */
820 {
821 mtr_t mtr;
822 trx_rseg_t* rseg;
823
824 ut_ad(!mutex_own(&kernel_mutex));
825
826 rseg = trx->rseg;
827
828 mtr_start(&mtr);
829
830 /* Change the undo log segment states from TRX_UNDO_ACTIVE
831 to some other state: these modifications to the file data
832 structure define the transaction as committed in the file
833 based domain, at the serialization point of the log sequence
834 number lsn obtained below. */
835
836 if (trx->update_undo != NULL) {
837 page_t* undo_hdr_page;
838 trx_undo_t* undo = trx->update_undo;
839
840 /* We have to hold the rseg mutex because update
841 log headers have to be put to the history list in the
842 (serialisation) order of the UNDO trx number. This is
843 required for the purge in-memory data structures too. */
844
845 mutex_enter(&rseg->mutex);
846
847 /* Assign the transaction serialisation number and also
848 update the purge min binary heap if this is the first
849 UNDO log being written to the assigned rollback segment. */
850
851 trx_serialisation_number_get(trx);
852
853 /* It is not necessary to obtain trx->undo_mutex here
854 because only a single OS thread is allowed to do the
855 transaction commit for this transaction. */
856
857 undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr);
858
859 trx_undo_update_cleanup(trx, undo_hdr_page, &mtr);
860 } else {
861 mutex_enter(&rseg->mutex);
862 }
863
864 if (trx->insert_undo != NULL) {
865 trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
866 }
867
868 mutex_exit(&rseg->mutex);
869
870 /* Update the latest MySQL binlog name and offset info
871 in trx sys header if MySQL binlogging is on or the database
872 server is a MySQL replication slave */
873
874 if (trx->mysql_log_file_name
875 && trx->mysql_log_file_name[0] != '\0') {
876
877 trx_sys_update_mysql_binlog_offset(
878 trx->mysql_log_file_name,
879 trx->mysql_log_offset,
880 TRX_SYS_MYSQL_LOG_INFO, &mtr);
881
882 trx->mysql_log_file_name = NULL;
883 }
884
885 /* The following call commits the mini-transaction, making the
886 whole transaction committed in the file-based world, at this
887 log sequence number. The transaction becomes 'durable' when
888 we write the log to disk, but in the logical sense the commit
889 in the file-based data structures (undo logs etc.) happens
890 here.
891
892 NOTE that transaction numbers, which are assigned only to
893 transactions with an update undo log, do not necessarily come
894 in exactly the same order as commit lsn's, if the transactions
895 have different rollback segments. To get exactly the same
896 order we should hold the kernel mutex up to this point,
897 adding to the contention of the kernel mutex. However, if
898 a transaction T2 is able to see modifications made by
899 a transaction T1, T2 will always get a bigger transaction
900 number and a bigger commit lsn than T1. */
901
902 /*--------------*/
903 mtr_commit(&mtr);
904 /*--------------*/
905
906 return(mtr.end_lsn);
907 }
908
909 /****************************************************************//**
910 Commits a transaction. */
911 UNIV_INTERN
912 void
trx_commit_off_kernel(trx_t * trx)913 trx_commit_off_kernel(
914 /*==================*/
915 trx_t* trx) /*!< in: transaction */
916 {
917 ib_uint64_t lsn;
918
919 ut_ad(mutex_own(&kernel_mutex));
920
921 trx->must_flush_log_later = FALSE;
922
923 /* If the transaction made any updates then we need to write the
924 UNDO logs for the updates to the assigned rollback segment. */
925
926 if (trx->insert_undo != NULL || trx->update_undo != NULL) {
927 mutex_exit(&kernel_mutex);
928
929 lsn = trx_write_serialisation_history(trx);
930
931 mutex_enter(&kernel_mutex);
932 } else {
933 lsn = 0;
934 }
935
936 ut_ad(trx->conc_state == TRX_ACTIVE || trx->conc_state == TRX_PREPARED);
937 ut_ad(mutex_own(&kernel_mutex));
938
939 if (UNIV_UNLIKELY(trx->conc_state == TRX_PREPARED)) {
940 ut_a(trx_n_prepared > 0);
941 trx_n_prepared--;
942 }
943
944 /* The following assignment makes the transaction committed in memory
945 and makes its changes to data visible to other transactions.
946 NOTE that there is a small discrepancy from the strict formal
947 visibility rules here: a human user of the database can see
948 modifications made by another transaction T even before the necessary
949 log segment has been flushed to the disk. If the database happens to
950 crash before the flush, the user has seen modifications from T which
951 will never be a committed transaction. However, any transaction T2
952 which sees the modifications of the committing transaction T, and
953 which also itself makes modifications to the database, will get an lsn
954 larger than the committing transaction T. In the case where the log
955 flush fails, and T never gets committed, also T2 will never get
956 committed. */
957
958 /*--------------------------------------*/
959 trx->conc_state = TRX_COMMITTED_IN_MEMORY;
960 /*--------------------------------------*/
961
962 /* If we release kernel_mutex below and we are still doing
963 recovery i.e.: back ground rollback thread is still active
964 then there is a chance that the rollback thread may see
965 this trx as COMMITTED_IN_MEMORY and goes adhead to clean it
966 up calling trx_cleanup_at_db_startup(). This can happen
967 in the case we are committing a trx here that is left in
968 PREPARED state during the crash. Note that commit of the
969 rollback of a PREPARED trx happens in the recovery thread
970 while the rollback of other transactions happen in the
971 background thread. To avoid this race we unconditionally
972 unset the is_recovered flag from the trx. */
973
974 trx->is_recovered = FALSE;
975
976 lock_release_off_kernel(trx);
977
978 if (trx->global_read_view) {
979 read_view_close(trx->global_read_view);
980 mem_heap_empty(trx->global_read_view_heap);
981 trx->global_read_view = NULL;
982 }
983
984 trx->read_view = NULL;
985
986 if (lsn) {
987
988 mutex_exit(&kernel_mutex);
989
990 if (trx->insert_undo != NULL) {
991
992 trx_undo_insert_cleanup(trx);
993 }
994
995 /* NOTE that we could possibly make a group commit more
996 efficient here: call os_thread_yield here to allow also other
997 trxs to come to commit! */
998
999 /*-------------------------------------*/
1000
1001 /* Depending on the my.cnf options, we may now write the log
1002 buffer to the log files, making the transaction durable if
1003 the OS does not crash. We may also flush the log files to
1004 disk, making the transaction durable also at an OS crash or a
1005 power outage.
1006
1007 The idea in InnoDB's group commit is that a group of
1008 transactions gather behind a trx doing a physical disk write
1009 to log files, and when that physical write has been completed,
1010 one of those transactions does a write which commits the whole
1011 group. Note that this group commit will only bring benefit if
1012 there are > 2 users in the database. Then at least 2 users can
1013 gather behind one doing the physical log write to disk.
1014
1015 If we are calling trx_commit() under prepare_commit_mutex, we
1016 will delay possible log write and flush to a separate function
1017 trx_commit_complete_for_mysql(), which is only called when the
1018 thread has released the mutex. This is to make the
1019 group commit algorithm to work. Otherwise, the prepare_commit
1020 mutex would serialize all commits and prevent a group of
1021 transactions from gathering. */
1022
1023 if (trx->flush_log_later) {
1024 /* Do nothing yet */
1025 trx->must_flush_log_later = TRUE;
1026 } else if (srv_flush_log_at_trx_commit == 0) {
1027 /* Do nothing */
1028 } else if (srv_flush_log_at_trx_commit == 1) {
1029 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1030 /* Write the log but do not flush it to disk */
1031
1032 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
1033 FALSE);
1034 } else {
1035 /* Write the log to the log files AND flush
1036 them to disk */
1037
1038 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1039 }
1040 } else if (srv_flush_log_at_trx_commit == 2) {
1041
1042 /* Write the log but do not flush it to disk */
1043
1044 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1045 } else {
1046 ut_error;
1047 }
1048
1049 trx->commit_lsn = lsn;
1050
1051 /*-------------------------------------*/
1052
1053 mutex_enter(&kernel_mutex);
1054 }
1055
1056 /* Free all savepoints */
1057 trx_roll_free_all_savepoints(trx);
1058
1059 trx->conc_state = TRX_NOT_STARTED;
1060 trx->rseg = NULL;
1061 trx->undo_no = 0;
1062 trx->last_sql_stat_start.least_undo_no = 0;
1063
1064 ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
1065 ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
1066
1067 UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
1068
1069 trx->error_state = DB_SUCCESS;
1070 }
1071
1072 /****************************************************************//**
1073 Cleans up a transaction at database startup. The cleanup is needed if
1074 the transaction already got to the middle of a commit when the database
1075 crashed, and we cannot roll it back. */
1076 UNIV_INTERN
1077 void
trx_cleanup_at_db_startup(trx_t * trx)1078 trx_cleanup_at_db_startup(
1079 /*======================*/
1080 trx_t* trx) /*!< in: transaction */
1081 {
1082 if (trx->insert_undo != NULL) {
1083
1084 trx_undo_insert_cleanup(trx);
1085 }
1086
1087 trx->conc_state = TRX_NOT_STARTED;
1088 trx->rseg = NULL;
1089 trx->undo_no = 0;
1090 trx->last_sql_stat_start.least_undo_no = 0;
1091
1092 UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
1093 }
1094
1095 /********************************************************************//**
1096 Assigns a read view for a consistent read query. All the consistent reads
1097 within the same transaction will get the same read view, which is created
1098 when this function is first called for a new started transaction.
1099 @return consistent read view */
1100 UNIV_INTERN
1101 read_view_t*
trx_assign_read_view(trx_t * trx)1102 trx_assign_read_view(
1103 /*=================*/
1104 trx_t* trx) /*!< in: active transaction */
1105 {
1106 ut_ad(trx->conc_state == TRX_ACTIVE);
1107
1108 if (trx->read_view) {
1109 return(trx->read_view);
1110 }
1111
1112 mutex_enter(&kernel_mutex);
1113
1114 if (!trx->read_view) {
1115 trx->read_view = read_view_open_now(
1116 trx->id, trx->global_read_view_heap);
1117 trx->global_read_view = trx->read_view;
1118 }
1119
1120 mutex_exit(&kernel_mutex);
1121
1122 return(trx->read_view);
1123 }
1124
1125 /****************************************************************//**
1126 Commits a transaction. NOTE that the kernel mutex is temporarily released. */
1127 static
1128 void
trx_handle_commit_sig_off_kernel(trx_t * trx,que_thr_t ** next_thr)1129 trx_handle_commit_sig_off_kernel(
1130 /*=============================*/
1131 trx_t* trx, /*!< in: transaction */
1132 que_thr_t** next_thr) /*!< in/out: next query thread to run;
1133 if the value which is passed in is
1134 a pointer to a NULL pointer, then the
1135 calling function can start running
1136 a new query thread */
1137 {
1138 trx_sig_t* sig;
1139 trx_sig_t* next_sig;
1140
1141 ut_ad(mutex_own(&kernel_mutex));
1142
1143 trx->que_state = TRX_QUE_COMMITTING;
1144
1145 trx_commit_off_kernel(trx);
1146
1147 ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
1148
1149 /* Remove all TRX_SIG_COMMIT signals from the signal queue and send
1150 reply messages to them */
1151
1152 sig = UT_LIST_GET_FIRST(trx->signals);
1153
1154 while (sig != NULL) {
1155 next_sig = UT_LIST_GET_NEXT(signals, sig);
1156
1157 if (sig->type == TRX_SIG_COMMIT) {
1158
1159 trx_sig_reply(sig, next_thr);
1160 trx_sig_remove(trx, sig);
1161 }
1162
1163 sig = next_sig;
1164 }
1165
1166 trx->que_state = TRX_QUE_RUNNING;
1167 }
1168
1169 /***********************************************************//**
1170 The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
1171 the TRX_QUE_RUNNING state and releases query threads which were
1172 waiting for a lock in the wait_thrs list. */
1173 UNIV_INTERN
1174 void
trx_end_lock_wait(trx_t * trx)1175 trx_end_lock_wait(
1176 /*==============*/
1177 trx_t* trx) /*!< in: transaction */
1178 {
1179 que_thr_t* thr;
1180
1181 ut_ad(mutex_own(&kernel_mutex));
1182 ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1183
1184 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1185
1186 while (thr != NULL) {
1187 que_thr_end_wait_no_next_thr(thr);
1188
1189 UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1190
1191 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1192 }
1193
1194 trx->que_state = TRX_QUE_RUNNING;
1195 }
1196
1197 /***********************************************************//**
1198 Moves the query threads in the lock wait list to the SUSPENDED state and puts
1199 the transaction to the TRX_QUE_RUNNING state. */
1200 static
1201 void
trx_lock_wait_to_suspended(trx_t * trx)1202 trx_lock_wait_to_suspended(
1203 /*=======================*/
1204 trx_t* trx) /*!< in: transaction in the TRX_QUE_LOCK_WAIT state */
1205 {
1206 que_thr_t* thr;
1207
1208 ut_ad(mutex_own(&kernel_mutex));
1209 ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1210
1211 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1212
1213 while (thr != NULL) {
1214 thr->state = QUE_THR_SUSPENDED;
1215
1216 UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1217
1218 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1219 }
1220
1221 trx->que_state = TRX_QUE_RUNNING;
1222 }
1223
1224 /***********************************************************//**
1225 Moves the query threads in the sig reply wait list of trx to the SUSPENDED
1226 state. */
1227 static
1228 void
trx_sig_reply_wait_to_suspended(trx_t * trx)1229 trx_sig_reply_wait_to_suspended(
1230 /*============================*/
1231 trx_t* trx) /*!< in: transaction */
1232 {
1233 trx_sig_t* sig;
1234 que_thr_t* thr;
1235
1236 ut_ad(mutex_own(&kernel_mutex));
1237
1238 sig = UT_LIST_GET_FIRST(trx->reply_signals);
1239
1240 while (sig != NULL) {
1241 thr = sig->receiver;
1242
1243 ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
1244
1245 thr->state = QUE_THR_SUSPENDED;
1246
1247 sig->receiver = NULL;
1248
1249 UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
1250
1251 sig = UT_LIST_GET_FIRST(trx->reply_signals);
1252 }
1253 }
1254
1255 /*****************************************************************//**
1256 Checks the compatibility of a new signal with the other signals in the
1257 queue.
1258 @return TRUE if the signal can be queued */
1259 static
1260 ibool
trx_sig_is_compatible(trx_t * trx,ulint type,ulint sender)1261 trx_sig_is_compatible(
1262 /*==================*/
1263 trx_t* trx, /*!< in: trx handle */
1264 ulint type, /*!< in: signal type */
1265 ulint sender) /*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
1266 {
1267 trx_sig_t* sig;
1268
1269 ut_ad(mutex_own(&kernel_mutex));
1270
1271 if (UT_LIST_GET_LEN(trx->signals) == 0) {
1272
1273 return(TRUE);
1274 }
1275
1276 if (sender == TRX_SIG_SELF) {
1277 if (type == TRX_SIG_ERROR_OCCURRED) {
1278
1279 return(TRUE);
1280
1281 } else if (type == TRX_SIG_BREAK_EXECUTION) {
1282
1283 return(TRUE);
1284 } else {
1285 return(FALSE);
1286 }
1287 }
1288
1289 ut_ad(sender == TRX_SIG_OTHER_SESS);
1290
1291 sig = UT_LIST_GET_FIRST(trx->signals);
1292
1293 if (type == TRX_SIG_COMMIT) {
1294 while (sig != NULL) {
1295
1296 if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
1297
1298 return(FALSE);
1299 }
1300
1301 sig = UT_LIST_GET_NEXT(signals, sig);
1302 }
1303
1304 return(TRUE);
1305
1306 } else if (type == TRX_SIG_TOTAL_ROLLBACK) {
1307 while (sig != NULL) {
1308
1309 if (sig->type == TRX_SIG_COMMIT) {
1310
1311 return(FALSE);
1312 }
1313
1314 sig = UT_LIST_GET_NEXT(signals, sig);
1315 }
1316
1317 return(TRUE);
1318
1319 } else if (type == TRX_SIG_BREAK_EXECUTION) {
1320
1321 return(TRUE);
1322 } else {
1323 ut_error;
1324
1325 return(FALSE);
1326 }
1327 }
1328
1329 /****************************************************************//**
1330 Sends a signal to a trx object. */
1331 UNIV_INTERN
1332 void
trx_sig_send(trx_t * trx,ulint type,ulint sender,que_thr_t * receiver_thr,trx_savept_t * savept,que_thr_t ** next_thr)1333 trx_sig_send(
1334 /*=========*/
1335 trx_t* trx, /*!< in: trx handle */
1336 ulint type, /*!< in: signal type */
1337 ulint sender, /*!< in: TRX_SIG_SELF or
1338 TRX_SIG_OTHER_SESS */
1339 que_thr_t* receiver_thr, /*!< in: query thread which wants the
1340 reply, or NULL; if type is
1341 TRX_SIG_END_WAIT, this must be NULL */
1342 trx_savept_t* savept, /*!< in: possible rollback savepoint, or
1343 NULL */
1344 que_thr_t** next_thr) /*!< in/out: next query thread to run;
1345 if the value which is passed in is
1346 a pointer to a NULL pointer, then the
1347 calling function can start running
1348 a new query thread; if the parameter
1349 is NULL, it is ignored */
1350 {
1351 trx_sig_t* sig;
1352 trx_t* receiver_trx;
1353
1354 ut_ad(trx);
1355 ut_ad(mutex_own(&kernel_mutex));
1356
1357 if (!trx_sig_is_compatible(trx, type, sender)) {
1358 /* The signal is not compatible with the other signals in
1359 the queue: die */
1360
1361 ut_error;
1362 }
1363
1364 /* Queue the signal object */
1365
1366 if (UT_LIST_GET_LEN(trx->signals) == 0) {
1367
1368 /* The signal list is empty: the 'sig' slot must be unused
1369 (we improve performance a bit by avoiding mem_alloc) */
1370 sig = &(trx->sig);
1371 } else {
1372 /* It might be that the 'sig' slot is unused also in this
1373 case, but we choose the easy way of using mem_alloc */
1374
1375 sig = mem_alloc(sizeof(trx_sig_t));
1376 }
1377
1378 UT_LIST_ADD_LAST(signals, trx->signals, sig);
1379
1380 sig->type = type;
1381 sig->sender = sender;
1382 sig->receiver = receiver_thr;
1383
1384 if (savept) {
1385 sig->savept = *savept;
1386 }
1387
1388 if (receiver_thr) {
1389 receiver_trx = thr_get_trx(receiver_thr);
1390
1391 UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
1392 sig);
1393 }
1394
1395 if (trx->sess->state == SESS_ERROR) {
1396
1397 trx_sig_reply_wait_to_suspended(trx);
1398 }
1399
1400 if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
1401 ut_error;
1402 }
1403
1404 /* If there were no other signals ahead in the queue, try to start
1405 handling of the signal */
1406
1407 if (UT_LIST_GET_FIRST(trx->signals) == sig) {
1408
1409 trx_sig_start_handle(trx, next_thr);
1410 }
1411 }
1412
1413 /****************************************************************//**
1414 Ends signal handling. If the session is in the error state, and
1415 trx->graph_before_signal_handling != NULL, then returns control to the error
1416 handling routine of the graph (currently just returns the control to the
1417 graph root which then will send an error message to the client). */
1418 UNIV_INTERN
1419 void
trx_end_signal_handling(trx_t * trx)1420 trx_end_signal_handling(
1421 /*====================*/
1422 trx_t* trx) /*!< in: trx */
1423 {
1424 ut_ad(mutex_own(&kernel_mutex));
1425 ut_ad(trx->handling_signals == TRUE);
1426
1427 trx->handling_signals = FALSE;
1428
1429 trx->graph = trx->graph_before_signal_handling;
1430
1431 if (trx->graph && (trx->sess->state == SESS_ERROR)) {
1432
1433 que_fork_error_handle(trx, trx->graph);
1434 }
1435 }
1436
1437 /****************************************************************//**
1438 Starts handling of a trx signal. */
1439 UNIV_INTERN
1440 void
trx_sig_start_handle(trx_t * trx,que_thr_t ** next_thr)1441 trx_sig_start_handle(
1442 /*=================*/
1443 trx_t* trx, /*!< in: trx handle */
1444 que_thr_t** next_thr) /*!< in/out: next query thread to run;
1445 if the value which is passed in is
1446 a pointer to a NULL pointer, then the
1447 calling function can start running
1448 a new query thread; if the parameter
1449 is NULL, it is ignored */
1450 {
1451 trx_sig_t* sig;
1452 ulint type;
1453 loop:
1454 /* We loop in this function body as long as there are queued signals
1455 we can process immediately */
1456
1457 ut_ad(trx);
1458 ut_ad(mutex_own(&kernel_mutex));
1459
1460 if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
1461
1462 trx_end_signal_handling(trx);
1463
1464 return;
1465 }
1466
1467 if (trx->conc_state == TRX_NOT_STARTED) {
1468
1469 trx_start_low(trx, ULINT_UNDEFINED);
1470 }
1471
1472 /* If the trx is in a lock wait state, moves the waiting query threads
1473 to the suspended state */
1474
1475 if (trx->que_state == TRX_QUE_LOCK_WAIT) {
1476
1477 trx_lock_wait_to_suspended(trx);
1478 }
1479
1480 /* If the session is in the error state and this trx has threads
1481 waiting for reply from signals, moves these threads to the suspended
1482 state, canceling wait reservations; note that if the transaction has
1483 sent a commit or rollback signal to itself, and its session is not in
1484 the error state, then nothing is done here. */
1485
1486 if (trx->sess->state == SESS_ERROR) {
1487 trx_sig_reply_wait_to_suspended(trx);
1488 }
1489
1490 /* If there are no running query threads, we can start processing of a
1491 signal, otherwise we have to wait until all query threads of this
1492 transaction are aware of the arrival of the signal. */
1493
1494 if (trx->n_active_thrs > 0) {
1495
1496 return;
1497 }
1498
1499 if (trx->handling_signals == FALSE) {
1500 trx->graph_before_signal_handling = trx->graph;
1501
1502 trx->handling_signals = TRUE;
1503 }
1504
1505 sig = UT_LIST_GET_FIRST(trx->signals);
1506 type = sig->type;
1507
1508 if (type == TRX_SIG_COMMIT) {
1509
1510 trx_handle_commit_sig_off_kernel(trx, next_thr);
1511
1512 } else if ((type == TRX_SIG_TOTAL_ROLLBACK)
1513 || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
1514
1515 trx_rollback(trx, sig, next_thr);
1516
1517 /* No further signals can be handled until the rollback
1518 completes, therefore we return */
1519
1520 return;
1521
1522 } else if (type == TRX_SIG_ERROR_OCCURRED) {
1523
1524 trx_rollback(trx, sig, next_thr);
1525
1526 /* No further signals can be handled until the rollback
1527 completes, therefore we return */
1528
1529 return;
1530
1531 } else if (type == TRX_SIG_BREAK_EXECUTION) {
1532
1533 trx_sig_reply(sig, next_thr);
1534 trx_sig_remove(trx, sig);
1535 } else {
1536 ut_error;
1537 }
1538
1539 goto loop;
1540 }
1541
1542 /****************************************************************//**
1543 Send the reply message when a signal in the queue of the trx has been
1544 handled. */
1545 UNIV_INTERN
1546 void
trx_sig_reply(trx_sig_t * sig,que_thr_t ** next_thr)1547 trx_sig_reply(
1548 /*==========*/
1549 trx_sig_t* sig, /*!< in: signal */
1550 que_thr_t** next_thr) /*!< in/out: next query thread to run;
1551 if the value which is passed in is
1552 a pointer to a NULL pointer, then the
1553 calling function can start running
1554 a new query thread */
1555 {
1556 trx_t* receiver_trx;
1557
1558 ut_ad(sig);
1559 ut_ad(mutex_own(&kernel_mutex));
1560
1561 if (sig->receiver != NULL) {
1562 ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
1563
1564 receiver_trx = thr_get_trx(sig->receiver);
1565
1566 UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
1567 sig);
1568 ut_ad(receiver_trx->sess->state != SESS_ERROR);
1569
1570 que_thr_end_wait(sig->receiver, next_thr);
1571
1572 sig->receiver = NULL;
1573
1574 }
1575 }
1576
1577 /****************************************************************//**
1578 Removes a signal object from the trx signal queue. */
1579 UNIV_INTERN
1580 void
trx_sig_remove(trx_t * trx,trx_sig_t * sig)1581 trx_sig_remove(
1582 /*===========*/
1583 trx_t* trx, /*!< in: trx handle */
1584 trx_sig_t* sig) /*!< in, own: signal */
1585 {
1586 ut_ad(trx && sig);
1587 ut_ad(mutex_own(&kernel_mutex));
1588
1589 ut_ad(sig->receiver == NULL);
1590
1591 UT_LIST_REMOVE(signals, trx->signals, sig);
1592 sig->type = 0; /* reset the field to catch possible bugs */
1593
1594 if (sig != &(trx->sig)) {
1595 mem_free(sig);
1596 }
1597 }
1598
1599 /*********************************************************************//**
1600 Creates a commit command node struct.
1601 @return own: commit node struct */
1602 UNIV_INTERN
1603 commit_node_t*
commit_node_create(mem_heap_t * heap)1604 commit_node_create(
1605 /*===============*/
1606 mem_heap_t* heap) /*!< in: mem heap where created */
1607 {
1608 commit_node_t* node;
1609
1610 node = mem_heap_alloc(heap, sizeof(commit_node_t));
1611 node->common.type = QUE_NODE_COMMIT;
1612 node->state = COMMIT_NODE_SEND;
1613
1614 return(node);
1615 }
1616
1617 /***********************************************************//**
1618 Performs an execution step for a commit type node in a query graph.
1619 @return query thread to run next, or NULL */
1620 UNIV_INTERN
1621 que_thr_t*
trx_commit_step(que_thr_t * thr)1622 trx_commit_step(
1623 /*============*/
1624 que_thr_t* thr) /*!< in: query thread */
1625 {
1626 commit_node_t* node;
1627 que_thr_t* next_thr;
1628
1629 node = thr->run_node;
1630
1631 ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
1632
1633 if (thr->prev_node == que_node_get_parent(node)) {
1634 node->state = COMMIT_NODE_SEND;
1635 }
1636
1637 if (node->state == COMMIT_NODE_SEND) {
1638 mutex_enter(&kernel_mutex);
1639
1640 node->state = COMMIT_NODE_WAIT;
1641
1642 next_thr = NULL;
1643
1644 thr->state = QUE_THR_SIG_REPLY_WAIT;
1645
1646 /* Send the commit signal to the transaction */
1647
1648 trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
1649 thr, NULL, &next_thr);
1650
1651 mutex_exit(&kernel_mutex);
1652
1653 return(next_thr);
1654 }
1655
1656 ut_ad(node->state == COMMIT_NODE_WAIT);
1657
1658 node->state = COMMIT_NODE_SEND;
1659
1660 thr->run_node = que_node_get_parent(node);
1661
1662 return(thr);
1663 }
1664
1665 /**********************************************************************//**
1666 Does the transaction commit for MySQL.
1667 @return DB_SUCCESS or error number */
1668 UNIV_INTERN
1669 ulint
trx_commit_for_mysql(trx_t * trx)1670 trx_commit_for_mysql(
1671 /*=================*/
1672 trx_t* trx) /*!< in: trx handle */
1673 {
1674 /* Because we do not do the commit by sending an Innobase
1675 sig to the transaction, we must here make sure that trx has been
1676 started. */
1677
1678 ut_a(trx);
1679
1680 trx_start_if_not_started(trx);
1681
1682 trx->op_info = "committing";
1683
1684 mutex_enter(&kernel_mutex);
1685
1686 trx_commit_off_kernel(trx);
1687
1688 mutex_exit(&kernel_mutex);
1689
1690 trx->op_info = "";
1691
1692 return(DB_SUCCESS);
1693 }
1694
1695 /**********************************************************************//**
1696 If required, flushes the log to disk if we called trx_commit_for_mysql()
1697 with trx->flush_log_later == TRUE.
1698 @return 0 or error number */
1699 UNIV_INTERN
1700 ulint
trx_commit_complete_for_mysql(trx_t * trx)1701 trx_commit_complete_for_mysql(
1702 /*==========================*/
1703 trx_t* trx) /*!< in: trx handle */
1704 {
1705 ib_uint64_t lsn = trx->commit_lsn;
1706
1707 ut_a(trx);
1708
1709 trx->op_info = "flushing log";
1710
1711 if (!trx->must_flush_log_later) {
1712 /* Do nothing */
1713 } else if (srv_flush_log_at_trx_commit == 0) {
1714 /* Do nothing */
1715 } else if (srv_flush_log_at_trx_commit == 1) {
1716 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1717 /* Write the log but do not flush it to disk */
1718
1719 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1720 } else {
1721 /* Write the log to the log files AND flush them to
1722 disk */
1723
1724 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1725 }
1726 } else if (srv_flush_log_at_trx_commit == 2) {
1727
1728 /* Write the log but do not flush it to disk */
1729
1730 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1731 } else {
1732 ut_error;
1733 }
1734
1735 trx->must_flush_log_later = FALSE;
1736
1737 trx->op_info = "";
1738
1739 return(0);
1740 }
1741
1742 /**********************************************************************//**
1743 Marks the latest SQL statement ended. */
1744 UNIV_INTERN
1745 void
trx_mark_sql_stat_end(trx_t * trx)1746 trx_mark_sql_stat_end(
1747 /*==================*/
1748 trx_t* trx) /*!< in: trx handle */
1749 {
1750 ut_a(trx);
1751
1752 if (trx->conc_state == TRX_NOT_STARTED) {
1753 trx->undo_no = 0;
1754 }
1755
1756 trx->last_sql_stat_start.least_undo_no = trx->undo_no;
1757 }
1758
1759 /**********************************************************************//**
1760 Prints info about a transaction to the given file. The caller must own the
1761 kernel mutex. */
1762 UNIV_INTERN
1763 void
trx_print(FILE * f,trx_t * trx,ulint max_query_len)1764 trx_print(
1765 /*======*/
1766 FILE* f, /*!< in: output stream */
1767 trx_t* trx, /*!< in: transaction */
1768 ulint max_query_len) /*!< in: max query length to print, or 0 to
1769 use the default max length */
1770 {
1771 ibool newline;
1772
1773 fprintf(f, "TRANSACTION " TRX_ID_FMT, (ullint) trx->id);
1774
1775 switch (trx->conc_state) {
1776 case TRX_NOT_STARTED:
1777 fputs(", not started", f);
1778 break;
1779 case TRX_ACTIVE:
1780 fprintf(f, ", ACTIVE %lu sec",
1781 (ulong)difftime(time(NULL), trx->start_time));
1782 break;
1783 case TRX_PREPARED:
1784 fprintf(f, ", ACTIVE (PREPARED) %lu sec",
1785 (ulong)difftime(time(NULL), trx->start_time));
1786 break;
1787 case TRX_COMMITTED_IN_MEMORY:
1788 fputs(", COMMITTED IN MEMORY", f);
1789 break;
1790 default:
1791 fprintf(f, " state %lu", (ulong) trx->conc_state);
1792 }
1793
1794 if (*trx->op_info) {
1795 putc(' ', f);
1796 fputs(trx->op_info, f);
1797 }
1798
1799 if (trx->is_recovered) {
1800 fputs(" recovered trx", f);
1801 }
1802
1803 if (trx->is_purge) {
1804 fputs(" purge trx", f);
1805 }
1806
1807 if (trx->declared_to_be_inside_innodb) {
1808 fprintf(f, ", thread declared inside InnoDB %lu",
1809 (ulong) trx->n_tickets_to_enter_innodb);
1810 }
1811
1812 putc('\n', f);
1813
1814 if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
1815 fprintf(f, "mysql tables in use %lu, locked %lu\n",
1816 (ulong) trx->n_mysql_tables_in_use,
1817 (ulong) trx->mysql_n_tables_locked);
1818 }
1819
1820 newline = TRUE;
1821
1822 switch (trx->que_state) {
1823 case TRX_QUE_RUNNING:
1824 newline = FALSE; break;
1825 case TRX_QUE_LOCK_WAIT:
1826 fputs("LOCK WAIT ", f); break;
1827 case TRX_QUE_ROLLING_BACK:
1828 fputs("ROLLING BACK ", f); break;
1829 case TRX_QUE_COMMITTING:
1830 fputs("COMMITTING ", f); break;
1831 default:
1832 fprintf(f, "que state %lu ", (ulong) trx->que_state);
1833 }
1834
1835 if (0 < UT_LIST_GET_LEN(trx->trx_locks)
1836 || mem_heap_get_size(trx->lock_heap) > 400) {
1837 newline = TRUE;
1838
1839 fprintf(f, "%lu lock struct(s), heap size %lu,"
1840 " %lu row lock(s)",
1841 (ulong) UT_LIST_GET_LEN(trx->trx_locks),
1842 (ulong) mem_heap_get_size(trx->lock_heap),
1843 (ulong) lock_number_of_rows_locked(trx));
1844 }
1845
1846 if (trx->has_search_latch) {
1847 newline = TRUE;
1848 fputs(", holds adaptive hash latch", f);
1849 }
1850
1851 if (trx->undo_no != 0) {
1852 newline = TRUE;
1853 fprintf(f, ", undo log entries %llu",
1854 (ullint) trx->undo_no);
1855 }
1856
1857 if (newline) {
1858 putc('\n', f);
1859 }
1860
1861 if (trx->mysql_thd != NULL) {
1862 innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
1863 }
1864 }
1865
1866 /*******************************************************************//**
1867 Compares the "weight" (or size) of two transactions. Transactions that
1868 have edited non-transactional tables are considered heavier than ones
1869 that have not.
1870 @return TRUE if weight(a) >= weight(b) */
1871 UNIV_INTERN
1872 ibool
trx_weight_ge(const trx_t * a,const trx_t * b)1873 trx_weight_ge(
1874 /*==========*/
1875 const trx_t* a, /*!< in: the first transaction to be compared */
1876 const trx_t* b) /*!< in: the second transaction to be compared */
1877 {
1878 ibool a_notrans_edit;
1879 ibool b_notrans_edit;
1880
1881 /* If mysql_thd is NULL for a transaction we assume that it has
1882 not edited non-transactional tables. */
1883
1884 a_notrans_edit = a->mysql_thd != NULL
1885 && thd_has_edited_nontrans_tables(a->mysql_thd);
1886
1887 b_notrans_edit = b->mysql_thd != NULL
1888 && thd_has_edited_nontrans_tables(b->mysql_thd);
1889
1890 if (a_notrans_edit != b_notrans_edit) {
1891
1892 return(a_notrans_edit);
1893 }
1894
1895 /* Either both had edited non-transactional tables or both had
1896 not, we fall back to comparing the number of altered/locked
1897 rows. */
1898
1899 #if 0
1900 fprintf(stderr,
1901 "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
1902 __func__,
1903 a->undo_no, UT_LIST_GET_LEN(a->trx_locks),
1904 b->undo_no, UT_LIST_GET_LEN(b->trx_locks));
1905 #endif
1906
1907 return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
1908 }
1909
1910 /****************************************************************//**
1911 Prepares a transaction. */
1912 UNIV_INTERN
1913 void
trx_prepare_off_kernel(trx_t * trx)1914 trx_prepare_off_kernel(
1915 /*===================*/
1916 trx_t* trx) /*!< in: transaction */
1917 {
1918 trx_rseg_t* rseg;
1919 ib_uint64_t lsn = 0;
1920 mtr_t mtr;
1921
1922 ut_ad(mutex_own(&kernel_mutex));
1923
1924 rseg = trx->rseg;
1925
1926 if (trx->insert_undo != NULL || trx->update_undo != NULL) {
1927
1928 mutex_exit(&kernel_mutex);
1929
1930 mtr_start(&mtr);
1931
1932 /* Change the undo log segment states from TRX_UNDO_ACTIVE
1933 to TRX_UNDO_PREPARED: these modifications to the file data
1934 structure define the transaction as prepared in the
1935 file-based world, at the serialization point of lsn. */
1936
1937 mutex_enter(&(rseg->mutex));
1938
1939 if (trx->insert_undo != NULL) {
1940
1941 /* It is not necessary to obtain trx->undo_mutex here
1942 because only a single OS thread is allowed to do the
1943 transaction prepare for this transaction. */
1944
1945 trx_undo_set_state_at_prepare(trx, trx->insert_undo,
1946 &mtr);
1947 }
1948
1949 if (trx->update_undo) {
1950 trx_undo_set_state_at_prepare(
1951 trx, trx->update_undo, &mtr);
1952 }
1953
1954 mutex_exit(&(rseg->mutex));
1955
1956 /*--------------*/
1957 mtr_commit(&mtr); /* This mtr commit makes the
1958 transaction prepared in the file-based
1959 world */
1960 /*--------------*/
1961 lsn = mtr.end_lsn;
1962
1963 mutex_enter(&kernel_mutex);
1964 }
1965
1966 ut_ad(mutex_own(&kernel_mutex));
1967
1968 /*--------------------------------------*/
1969 trx->conc_state = TRX_PREPARED;
1970 trx_n_prepared++;
1971 /*--------------------------------------*/
1972
1973 if (lsn) {
1974 /* Depending on the my.cnf options, we may now write the log
1975 buffer to the log files, making the prepared state of the
1976 transaction durable if the OS does not crash. We may also
1977 flush the log files to disk, making the prepared state of the
1978 transaction durable also at an OS crash or a power outage.
1979
1980 The idea in InnoDB's group prepare is that a group of
1981 transactions gather behind a trx doing a physical disk write
1982 to log files, and when that physical write has been completed,
1983 one of those transactions does a write which prepares the whole
1984 group. Note that this group prepare will only bring benefit if
1985 there are > 2 users in the database. Then at least 2 users can
1986 gather behind one doing the physical log write to disk.
1987
1988 TODO: find out if MySQL holds some mutex when calling this.
1989 That would spoil our group prepare algorithm. */
1990
1991 mutex_exit(&kernel_mutex);
1992
1993 if (srv_flush_log_at_trx_commit == 0) {
1994 /* Do nothing */
1995 } else if (srv_flush_log_at_trx_commit == 1) {
1996 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1997 /* Write the log but do not flush it to disk */
1998
1999 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
2000 FALSE);
2001 } else {
2002 /* Write the log to the log files AND flush
2003 them to disk */
2004
2005 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
2006 }
2007 } else if (srv_flush_log_at_trx_commit == 2) {
2008
2009 /* Write the log but do not flush it to disk */
2010
2011 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
2012 } else {
2013 ut_error;
2014 }
2015
2016 mutex_enter(&kernel_mutex);
2017 }
2018 }
2019
2020 /**********************************************************************//**
2021 Does the transaction prepare for MySQL.
2022 @return 0 or error number */
2023 UNIV_INTERN
2024 ulint
trx_prepare_for_mysql(trx_t * trx)2025 trx_prepare_for_mysql(
2026 /*==================*/
2027 trx_t* trx) /*!< in: trx handle */
2028 {
2029 /* Because we do not do the prepare by sending an Innobase
2030 sig to the transaction, we must here make sure that trx has been
2031 started. */
2032
2033 ut_a(trx);
2034
2035 trx->op_info = "preparing";
2036
2037 trx_start_if_not_started(trx);
2038
2039 mutex_enter(&kernel_mutex);
2040
2041 trx_prepare_off_kernel(trx);
2042
2043 mutex_exit(&kernel_mutex);
2044
2045 trx->op_info = "";
2046
2047 return(0);
2048 }
2049
2050 /**********************************************************************//**
2051 This function is used to find number of prepared transactions and
2052 their transaction objects for a recovery.
2053 @return number of prepared transactions stored in xid_list */
2054 UNIV_INTERN
2055 int
trx_recover_for_mysql(XID * xid_list,ulint len)2056 trx_recover_for_mysql(
2057 /*==================*/
2058 XID* xid_list, /*!< in/out: prepared transactions */
2059 ulint len) /*!< in: number of slots in xid_list */
2060 {
2061 trx_t* trx;
2062 ulint count = 0;
2063
2064 ut_ad(xid_list);
2065 ut_ad(len);
2066
2067 /* We should set those transactions which are in the prepared state
2068 to the xid_list */
2069
2070 mutex_enter(&kernel_mutex);
2071
2072 trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
2073
2074 while (trx) {
2075 if (trx->conc_state == TRX_PREPARED) {
2076 xid_list[count] = trx->xid;
2077
2078 if (count == 0) {
2079 ut_print_timestamp(stderr);
2080 fprintf(stderr,
2081 " InnoDB: Starting recovery for"
2082 " XA transactions...\n");
2083 }
2084
2085 ut_print_timestamp(stderr);
2086 fprintf(stderr,
2087 " InnoDB: Transaction " TRX_ID_FMT " in"
2088 " prepared state after recovery\n",
2089 (ullint) trx->id);
2090
2091 ut_print_timestamp(stderr);
2092 fprintf(stderr,
2093 " InnoDB: Transaction contains changes"
2094 " to %llu rows\n",
2095 (ullint) trx->undo_no);
2096
2097 count++;
2098
2099 if (count == len) {
2100 break;
2101 }
2102 }
2103
2104 trx = UT_LIST_GET_NEXT(trx_list, trx);
2105 }
2106
2107 mutex_exit(&kernel_mutex);
2108
2109 if (count > 0){
2110 ut_print_timestamp(stderr);
2111 fprintf(stderr,
2112 " InnoDB: %lu transactions in prepared state"
2113 " after recovery\n",
2114 (ulong) count);
2115 }
2116
2117 return ((int) count);
2118 }
2119
2120 /*******************************************************************//**
2121 This function is used to find one X/Open XA distributed transaction
2122 which is in the prepared state
2123 @return trx or NULL; on match, the trx->xid will be invalidated */
2124 UNIV_INTERN
2125 trx_t*
trx_get_trx_by_xid(const XID * xid)2126 trx_get_trx_by_xid(
2127 /*===============*/
2128 const XID* xid) /*!< in: X/Open XA transaction identifier */
2129 {
2130 trx_t* trx;
2131
2132 if (xid == NULL) {
2133
2134 return(NULL);
2135 }
2136
2137 mutex_enter(&kernel_mutex);
2138
2139 trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
2140
2141 while (trx) {
2142 /* Compare two X/Open XA transaction id's: their
2143 length should be the same and binary comparison
2144 of gtrid_length+bqual_length bytes should be
2145 the same */
2146
2147 if (trx->is_recovered
2148 && trx->conc_state == TRX_PREPARED
2149 && xid->gtrid_length == trx->xid.gtrid_length
2150 && xid->bqual_length == trx->xid.bqual_length
2151 && memcmp(xid->data, trx->xid.data,
2152 xid->gtrid_length + xid->bqual_length) == 0) {
2153
2154 /* Invalidate the XID, so that subsequent calls
2155 will not find it. */
2156 memset(&trx->xid, 0, sizeof(trx->xid));
2157 trx->xid.formatID = -1;
2158 break;
2159 }
2160
2161 trx = UT_LIST_GET_NEXT(trx_list, trx);
2162 }
2163
2164 mutex_exit(&kernel_mutex);
2165
2166 return(trx);
2167 }
2168