1 /*****************************************************************************
2
3 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17
18 *****************************************************************************/
19
20 /**************************************************//**
21 @file include/trx0sys.h
22 Transaction system
23
24 Created 3/26/1996 Heikki Tuuri
25 *******************************************************/
26
27 #ifndef trx0sys_h
28 #define trx0sys_h
29
30 #include "buf0buf.h"
31 #include "fil0fil.h"
32 #include "trx0types.h"
33 #include "mem0mem.h"
34 #include "mtr0mtr.h"
35 #include "ut0byte.h"
36 #include "ut0lst.h"
37 #include "read0types.h"
38 #include "page0types.h"
39 #include "ut0mutex.h"
40 #include "trx0trx.h"
41 #ifdef WITH_WSREP
42 #include "trx0xa.h"
43 #endif /* WITH_WSREP */
44
45 typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t;
46
47 /** Checks if a page address is the trx sys header page.
48 @param[in] page_id page id
49 @return true if trx sys header page */
trx_sys_hdr_page(const page_id_t & page_id)50 inline bool trx_sys_hdr_page(const page_id_t& page_id)
51 {
52 return(page_id.space() == TRX_SYS_SPACE
53 && page_id.page_no() == TRX_SYS_PAGE_NO);
54 }
55
/** Creates and initializes the transaction system at the database
creation. */
void trx_sys_create_sys_pages(void);
61 /** Find an available rollback segment.
62 @param[in] sys_header
63 @return an unallocated rollback segment slot in the TRX_SYS header
64 @retval ULINT_UNDEFINED if not found */
65 ulint
66 trx_sys_rseg_find_free(const buf_block_t* sys_header);
67 /** Request the TRX_SYS page.
68 @param[in] rw whether to lock the page for writing
69 @return the TRX_SYS page
70 @retval NULL if the page cannot be read */
71 inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
72 {
73 buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
74 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
75 ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);)
76 return block;
77 }
78
#ifdef UNIV_DEBUG
/** Debug-only flag that controls the debugging of TRX_RSEG_N_SLOTS
behavior; declared here, defined elsewhere. */
extern uint trx_rseg_n_slots_debug;
#endif /* UNIV_DEBUG */
83
84 /** Write DB_TRX_ID.
85 @param[out] db_trx_id the DB_TRX_ID field to be written to
86 @param[in] id transaction ID */
87 UNIV_INLINE
88 void
trx_write_trx_id(byte * db_trx_id,trx_id_t id)89 trx_write_trx_id(byte* db_trx_id, trx_id_t id)
90 {
91 compile_time_assert(DATA_TRX_ID_LEN == 6);
92 mach_write_to_6(db_trx_id, id);
93 }
94
95 /** Read a transaction identifier.
96 @return id */
97 inline
98 trx_id_t
trx_read_trx_id(const byte * ptr)99 trx_read_trx_id(const byte* ptr)
100 {
101 compile_time_assert(DATA_TRX_ID_LEN == 6);
102 return(mach_read_from_6(ptr));
103 }
104
#ifdef UNIV_DEBUG
/** Debug check that the DB_TRX_ID in a record is valid: the stored
value must be 0 or greater than trx_id.
@param[in]	db_trx_id	the DB_TRX_ID column to validate
@param[in]	trx_id		the id of the ALTER TABLE transaction
@return true always; a violation is reported through ut_ad() */
inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
{
  const byte *field= static_cast<const byte*>(db_trx_id);
  const trx_id_t stored_id= trx_read_trx_id(field);
  ut_ad(stored_id == 0 || stored_id > trx_id);
  return true;
}
#endif /* UNIV_DEBUG */
116
117 /*****************************************************************//**
118 Updates the offset information about the end of the MySQL binlog entry
119 which corresponds to the transaction just being committed. In a MySQL
120 replication slave updates the latest master binlog position up to which
121 replication has proceeded. */
122 void
123 trx_sys_update_mysql_binlog_offset(
124 /*===============================*/
125 const char* file_name,/*!< in: MySQL log file name */
126 int64_t offset, /*!< in: position in that log file */
127 buf_block_t* sys_header, /*!< in,out: trx sys header */
128 mtr_t* mtr); /*!< in,out: mini-transaction */
/** Print the MySQL binlog offset information, provided that it is
present in the trx system header. */
void trx_sys_print_mysql_binlog_offset();
133
/** Create the rollback segments.
@return true if the creation succeeded, false otherwise */
bool trx_sys_create_rsegs();
138
/** Identifier of the automatically created system rollback segment */
#define TRX_SYS_SYSTEM_RSEG_ID	0

/** Offset of the transaction system header on the page */
#define TRX_SYS			FSEG_PAGE_DATA

/** Transaction system header */
/*------------------------------------------------------------- @{ */
/** In old versions of InnoDB, this persisted the value of
trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
are used instead. The field only exists for the purpose of upgrading
from older MySQL or MariaDB versions. */
#define TRX_SYS_TRX_ID_STORE	0
/** Segment header for the tablespace segment the trx system is
created into */
#define TRX_SYS_FSEG_HEADER	8
/** Start of the array of rollback segment specification slots */
#define TRX_SYS_RSEGS		(8 + FSEG_HEADER_SIZE)
/*------------------------------------------------------------- @} */

/** The number of rollback segments; a rollback segment id must fit in
the 7 bits reserved for it in DB_ROLL_PTR. */
#define TRX_SYS_N_RSEGS		128
/** Maximum number of undo tablespaces (not counting the system tablespace) */
#define TRX_SYS_MAX_UNDO_SPACES	(TRX_SYS_N_RSEGS - 1)

/* Offsets within one rollback segment specification slot */

/** Tablespace ID of an undo log header; starting with
MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
#define TRX_SYS_RSEG_SPACE	0
/** Page number of an undo log header, or FIL_NULL if unused */
#define TRX_SYS_RSEG_PAGE_NO	4
/** Size of a rollback segment specification slot, in bytes */
#define TRX_SYS_RSEG_SLOT_SIZE	8
178
179 /** Read the tablespace ID of a rollback segment slot.
180 @param[in] sys_header TRX_SYS page
181 @param[in] rseg_id rollback segment identifier
182 @return undo tablespace id */
183 inline
184 uint32_t
trx_sysf_rseg_get_space(const buf_block_t * sys_header,ulint rseg_id)185 trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
186 {
187 ut_ad(rseg_id < TRX_SYS_N_RSEGS);
188 return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
189 + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
190 + sys_header->frame);
191 }
192
193 /** Read the page number of a rollback segment slot.
194 @param[in] sys_header TRX_SYS page
195 @param[in] rseg_id rollback segment identifier
196 @return undo page number */
197 inline uint32_t
trx_sysf_rseg_get_page_no(const buf_block_t * sys_header,ulint rseg_id)198 trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
199 {
200 ut_ad(rseg_id < TRX_SYS_N_RSEGS);
201 return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
202 rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
203 sys_header->frame);
204 }
205
206 /** Maximum length of MySQL binlog file name, in bytes.
207 (Used before MariaDB 10.3.5.) */
208 #define TRX_SYS_MYSQL_LOG_NAME_LEN 512
209 /** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
210 #define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
211
212 #if UNIV_PAGE_SIZE_MIN < 4096
213 # error "UNIV_PAGE_SIZE_MIN < 4096"
214 #endif
215 /** The offset of the MySQL binlog offset info in the trx system header */
216 #define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000)
217 #define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is
218 TRX_SYS_MYSQL_LOG_MAGIC_N
219 if we have valid data in the
220 MySQL binlog info */
221 #define TRX_SYS_MYSQL_LOG_OFFSET 4 /*!< the 64-bit offset
222 within that file */
223 #define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
224
225 /** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096
226
227 0...37 FIL_HEADER
228 38...45 TRX_SYS_TRX_ID_STORE
229 46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10)
230 56 TRX_SYS_RSEGS
231 56...59 TRX_SYS_RSEG_SPACE for slot 0
232 60...63 TRX_SYS_RSEG_PAGE_NO for slot 0
233 64...67 TRX_SYS_RSEG_SPACE for slot 1
234 68...71 TRX_SYS_RSEG_PAGE_NO for slot 1
235 ....
236 594..597 TRX_SYS_RSEG_SPACE for slot 72
237 598..601 TRX_SYS_RSEG_PAGE_NO for slot 72
238 ...
239 ...1063 TRX_SYS_RSEG_PAGE_NO for slot 126
240
241 (srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace
242 space_id, page_no pairs :::)
243 596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
244 600 TRX_SYS_WSREP_XID_FORMAT
245 604 TRX_SYS_WSREP_XID_GTRID_LEN
246 608 TRX_SYS_WSREP_XID_BQUAL_LEN
247 612 TRX_SYS_WSREP_XID_DATA (len = 128)
248 739 TRX_SYS_WSREP_XID_DATA_END
249
250 FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
251 (srv_page_size-2500)
252 1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
253 1600 TRX_SYS_WSREP_XID_FORMAT
254 1604 TRX_SYS_WSREP_XID_GTRID_LEN
255 1608 TRX_SYS_WSREP_XID_BQUAL_LEN
256 1612 TRX_SYS_WSREP_XID_DATA (len = 128)
257 1739 TRX_SYS_WSREP_XID_DATA_END
258
259 (srv_page_size - 2000 MYSQL MASTER LOG)
260 2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
261 2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
262 2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
263 2108 TRX_SYS_MYSQL_LOG_NAME
264
265 (srv_page_size - 1000 MYSQL LOG)
266 3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
267 3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
268 3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
269 3108 TRX_SYS_MYSQL_LOG_NAME
270
271 (srv_page_size - 200 DOUBLEWRITE)
272 3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG
273 3906 TRX_SYS_DOUBLEWRITE_MAGIC
274 3910 TRX_SYS_DOUBLEWRITE_BLOCK1
275 3914 TRX_SYS_DOUBLEWRITE_BLOCK2
276 3918 TRX_SYS_DOUBLEWRITE_REPEAT
  3930			TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED
278
279 (srv_page_size - 8, TAILER)
  4088..4095	FIL_TAILER
281
282 */
#ifdef WITH_WSREP
/** Offset of the WSREP XID headers (used before MariaDB 10.3.5) */
#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
/** Offset of the magic number field within the WSREP XID info */
#define TRX_SYS_WSREP_XID_MAGIC_N_FLD	0
/** Magic number value for the WSREP XID info */
#define TRX_SYS_WSREP_XID_MAGIC_N	0x77737265

/** Length of the XID field: formatID, gtrid_len, bqual_len, xid_data */
#define TRX_SYS_WSREP_XID_LEN		(4 + 4 + 4 + XIDDATASIZE)
/* Offsets of the XID components within the WSREP XID info */
#define TRX_SYS_WSREP_XID_FORMAT	4
#define TRX_SYS_WSREP_XID_GTRID_LEN	8
#define TRX_SYS_WSREP_XID_BQUAL_LEN	12
#define TRX_SYS_WSREP_XID_DATA		16
#endif /* WITH_WSREP */
296
297 /** Doublewrite buffer */
298 /* @{ */
299 /** The offset of the doublewrite buffer header on the trx system header page */
300 #define TRX_SYS_DOUBLEWRITE (srv_page_size - 200)
301 /*-------------------------------------------------------------*/
302 #define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg
303 containing the doublewrite
304 buffer */
305 #define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
306 /*!< 4-byte magic number which
307 shows if we already have
308 created the doublewrite
309 buffer */
310 #define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
311 /*!< page number of the
312 first page in the first
313 sequence of 64
314 (= FSP_EXTENT_SIZE) consecutive
315 pages in the doublewrite
316 buffer */
317 #define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
318 /*!< page number of the
319 first page in the second
320 sequence of 64 consecutive
321 pages in the doublewrite
322 buffer */
323 #define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat
324 TRX_SYS_DOUBLEWRITE_MAGIC,
325 TRX_SYS_DOUBLEWRITE_BLOCK1,
326 TRX_SYS_DOUBLEWRITE_BLOCK2
327 so that if the trx sys
328 header is half-written
329 to disk, we still may
330 be able to recover the
331 information */
332 /** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
333 we must reset the doublewrite buffer, because starting from 4.1.x the
334 space id of a data page is stored into
335 FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
336 #define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
337
338 /*-------------------------------------------------------------*/
339 /** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
340 constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
341 /** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
342 constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
343
344 /** Size of the doublewrite block in pages */
345 #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
346 /* @} */
347
348 trx_t* current_trx();
349
350 struct rw_trx_hash_element_t
351 {
rw_trx_hash_element_trw_trx_hash_element_t352 rw_trx_hash_element_t(): trx(0)
353 {
354 mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
355 }
356
357
~rw_trx_hash_element_trw_trx_hash_element_t358 ~rw_trx_hash_element_t()
359 {
360 mutex_free(&mutex);
361 }
362
363
364 trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
365 Atomic_counter<trx_id_t> no;
366 trx_t *trx;
367 ib_mutex_t mutex;
368 };
369
370
371 /**
372 Wrapper around LF_HASH to store set of in memory read-write transactions.
373 */
374
375 class rw_trx_hash_t
376 {
377 LF_HASH hash;
378
379
380 /**
381 Constructor callback for lock-free allocator.
382
383 Object is just allocated and is not yet accessible via rw_trx_hash by
384 concurrent threads. Object can be reused multiple times before it is freed.
385 Every time object is being reused initializer() callback is called.
386 */
387
rw_trx_hash_constructor(uchar * arg)388 static void rw_trx_hash_constructor(uchar *arg)
389 {
390 new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
391 }
392
393
394 /**
395 Destructor callback for lock-free allocator.
396
397 Object is about to be freed and is not accessible via rw_trx_hash by
398 concurrent threads.
399 */
400
rw_trx_hash_destructor(uchar * arg)401 static void rw_trx_hash_destructor(uchar *arg)
402 {
403 reinterpret_cast<rw_trx_hash_element_t*>
404 (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
405 }
406
407
408 /**
409 Destructor callback for lock-free allocator.
410
411 This destructor is used at shutdown. It frees remaining transaction
412 objects.
413
414 XA PREPARED transactions may remain if they haven't been committed or
415 rolled back. ACTIVE transactions may remain if startup was interrupted or
416 server is running in read-only mode or for certain srv_force_recovery
417 levels.
418 */
419
rw_trx_hash_shutdown_destructor(uchar * arg)420 static void rw_trx_hash_shutdown_destructor(uchar *arg)
421 {
422 rw_trx_hash_element_t *element=
423 reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
424 if (trx_t *trx= element->trx)
425 {
426 ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
427 trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
428 (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
429 (!srv_was_started ||
430 srv_read_only_mode ||
431 srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
432 trx_free_at_shutdown(trx);
433 }
434 element->~rw_trx_hash_element_t();
435 }
436
437
438 /**
439 Initializer callback for lock-free hash.
440
441 Object is not yet accessible via rw_trx_hash by concurrent threads, but is
442 about to become such. Object id can be changed only by this callback and
443 remains the same until all pins to this object are released.
444
445 Object trx can be changed to 0 by erase() under object mutex protection,
446 which indicates it is about to be removed from lock-free hash and become
447 not accessible by concurrent threads.
448 */
449
rw_trx_hash_initializer(LF_HASH *,rw_trx_hash_element_t * element,trx_t * trx)450 static void rw_trx_hash_initializer(LF_HASH *,
451 rw_trx_hash_element_t *element,
452 trx_t *trx)
453 {
454 ut_ad(element->trx == 0);
455 element->trx= trx;
456 element->id= trx->id;
457 element->no= TRX_ID_MAX;
458 trx->rw_trx_hash_element= element;
459 }
460
461
462 /**
463 Gets LF_HASH pins.
464
465 Pins are used to protect object from being destroyed or reused. They are
466 normally stored in trx object for quick access. If caller doesn't have trx
467 available, we try to get it using currnet_trx(). If caller doesn't have trx
468 at all, temporary pins are allocated.
469 */
470
get_pins(trx_t * trx)471 LF_PINS *get_pins(trx_t *trx)
472 {
473 if (!trx->rw_trx_hash_pins)
474 {
475 trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
476 ut_a(trx->rw_trx_hash_pins);
477 }
478 return trx->rw_trx_hash_pins;
479 }
480
481
482 struct eliminate_duplicates_arg
483 {
484 trx_ids_t ids;
485 my_hash_walk_action action;
486 void *argument;
eliminate_duplicates_argeliminate_duplicates_arg487 eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg):
488 action(act), argument(arg) { ids.reserve(size); }
489 };
490
491
eliminate_duplicates(rw_trx_hash_element_t * element,eliminate_duplicates_arg * arg)492 static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
493 eliminate_duplicates_arg *arg)
494 {
495 for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
496 {
497 if (*it == element->id)
498 return 0;
499 }
500 arg->ids.push_back(element->id);
501 return arg->action(element, arg->argument);
502 }
503
504
505 #ifdef UNIV_DEBUG
validate_element(trx_t * trx)506 static void validate_element(trx_t *trx)
507 {
508 ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
509 ut_ad(!trx->is_autocommit_non_locking());
510 /* trx->state can be anything except TRX_STATE_NOT_STARTED */
511 mutex_enter(&trx->mutex);
512 ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
513 trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
514 trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
515 trx_state_eq(trx, TRX_STATE_PREPARED));
516 mutex_exit(&trx->mutex);
517 }
518
519
520 struct debug_iterator_arg
521 {
522 my_hash_walk_action action;
523 void *argument;
524 };
525
526
debug_iterator(rw_trx_hash_element_t * element,debug_iterator_arg * arg)527 static my_bool debug_iterator(rw_trx_hash_element_t *element,
528 debug_iterator_arg *arg)
529 {
530 mutex_enter(&element->mutex);
531 if (element->trx)
532 validate_element(element->trx);
533 mutex_exit(&element->mutex);
534 return arg->action(element, arg->argument);
535 }
536 #endif
537
538
539 public:
init()540 void init()
541 {
542 lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
543 sizeof(trx_id_t), 0, &my_charset_bin);
544 hash.alloc.constructor= rw_trx_hash_constructor;
545 hash.alloc.destructor= rw_trx_hash_destructor;
546 hash.initializer=
547 reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
548 }
549
550
destroy()551 void destroy()
552 {
553 hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
554 lf_hash_destroy(&hash);
555 }
556
557
558 /**
559 Releases LF_HASH pins.
560
561 Must be called by thread that owns trx_t object when the latter is being
562 "detached" from thread (e.g. released to the pool by trx_t::free()). Can be
563 called earlier if thread is expected not to use rw_trx_hash.
564
565 Since pins are not allowed to be transferred to another thread,
566 initialisation thread calls this for recovered transactions.
567 */
568
put_pins(trx_t * trx)569 void put_pins(trx_t *trx)
570 {
571 if (trx->rw_trx_hash_pins)
572 {
573 lf_hash_put_pins(trx->rw_trx_hash_pins);
574 trx->rw_trx_hash_pins= 0;
575 }
576 }
577
578
579 /**
580 Finds trx object in lock-free hash with given id.
581
582 Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
583 the transaction may get committed before this method returns.
584
585 With do_ref_count == false the caller may dereference returned trx pointer
586 only if lock_sys.mutex was acquired before calling find().
587
588 With do_ref_count == true caller may dereference trx even if it is not
589 holding lock_sys.mutex. Caller is responsible for calling
590 trx->release_reference() when it is done playing with trx.
591
592 Ideally this method should get caller rw_trx_hash_pins along with trx
593 object as a parameter, similar to insert() and erase(). However most
594 callers lose trx early in their call chains and it is not that easy to pass
595 them through.
596
597 So we take more expensive approach: get trx through current_thd()->ha_data.
598 Some threads don't have trx attached to THD, and at least server
599 initialisation thread, fts_optimize_thread, srv_master_thread,
600 dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
601 have THD at all. For such cases we allocate pins only for duration of
602 search and free them immediately.
603
604 This has negative performance impact and should be fixed eventually (by
605 passing caller_trx as a parameter). Still stream of DML is more or less Ok.
606
607 @return
608 @retval 0 not found
609 @retval pointer to trx
610 */
611
find(trx_t * caller_trx,trx_id_t trx_id,bool do_ref_count)612 trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count)
613 {
614 /*
615 In MariaDB 10.3, purge will reset DB_TRX_ID to 0
616 when the history is lost. Read/write transactions will
617 always have a nonzero trx_t::id; there the value 0 is
618 reserved for transactions that did not write or lock
619 anything yet.
620
621 The caller should already have handled trx_id==0 specially.
622 */
623 ut_ad(trx_id);
624 ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count);
625
626 trx_t *trx= 0;
627 LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
628 ut_a(pins);
629
630 rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
631 (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
632 sizeof(trx_id_t)));
633 if (element)
634 {
635 mutex_enter(&element->mutex);
636 lf_hash_search_unpin(pins);
637 if ((trx= element->trx)) {
638 DBUG_ASSERT(trx_id == trx->id);
639 ut_d(validate_element(trx));
640 if (do_ref_count)
641 {
642 /*
643 We have an early state check here to avoid committer
644 starvation in a wait loop for transaction references,
645 when there's a stream of trx_sys.find() calls from other
646 threads. The trx->state may change to COMMITTED after
647 trx->mutex is released, and it will have to be rechecked
648 by the caller after reacquiring the mutex.
649 */
650 trx_mutex_enter(trx);
651 const trx_state_t state= trx->state;
652 trx_mutex_exit(trx);
653 if (state == TRX_STATE_COMMITTED_IN_MEMORY)
654 trx= NULL;
655 else
656 trx->reference();
657 }
658 }
659 mutex_exit(&element->mutex);
660 }
661 if (!caller_trx)
662 lf_hash_put_pins(pins);
663 return trx;
664 }
665
666
667 /**
668 Inserts trx to lock-free hash.
669
670 Object becomes accessible via rw_trx_hash.
671 */
672
insert(trx_t * trx)673 void insert(trx_t *trx)
674 {
675 ut_d(validate_element(trx));
676 int res= lf_hash_insert(&hash, get_pins(trx),
677 reinterpret_cast<void*>(trx));
678 ut_a(res == 0);
679 }
680
681
682 /**
683 Removes trx from lock-free hash.
684
685 Object becomes not accessible via rw_trx_hash. But it still can be pinned
686 by concurrent find(), which is supposed to release it immediately after
687 it sees object trx is 0.
688 */
689
erase(trx_t * trx)690 void erase(trx_t *trx)
691 {
692 ut_d(validate_element(trx));
693 mutex_enter(&trx->rw_trx_hash_element->mutex);
694 trx->rw_trx_hash_element->trx= 0;
695 mutex_exit(&trx->rw_trx_hash_element->mutex);
696 int res= lf_hash_delete(&hash, get_pins(trx),
697 reinterpret_cast<const void*>(&trx->id),
698 sizeof(trx_id_t));
699 ut_a(res == 0);
700 }
701
702
703 /**
704 Returns the number of elements in the hash.
705
706 The number is exact only if hash is protected against concurrent
707 modifications (e.g. single threaded startup or hash is protected
708 by some mutex). Otherwise the number may be used as a hint only,
709 because it may change even before this method returns.
710 */
711
size()712 uint32_t size() { return uint32_t(lf_hash_size(&hash)); }
713
714
715 /**
716 Iterates the hash.
717
718 @param caller_trx used to get/set pins
719 @param action called for every element in hash
720 @param argument opque argument passed to action
721
722 May return the same element multiple times if hash is under contention.
723 If caller doesn't like to see the same transaction multiple times, it has
724 to call iterate_no_dups() instead.
725
726 May return element with committed transaction. If caller doesn't like to
727 see committed transactions, it has to skip those under element mutex:
728
729 mutex_enter(&element->mutex);
730 if (trx_t trx= element->trx)
731 {
732 // trx is protected against commit in this branch
733 }
734 mutex_exit(&element->mutex);
735
736 May miss concurrently inserted transactions.
737
738 @return
739 @retval 0 iteration completed successfully
740 @retval 1 iteration was interrupted (action returned 1)
741 */
742
iterate(trx_t * caller_trx,my_hash_walk_action action,void * argument)743 int iterate(trx_t *caller_trx, my_hash_walk_action action, void *argument)
744 {
745 LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
746 ut_a(pins);
747 #ifdef UNIV_DEBUG
748 debug_iterator_arg debug_arg= { action, argument };
749 action= reinterpret_cast<my_hash_walk_action>(debug_iterator);
750 argument= &debug_arg;
751 #endif
752 int res= lf_hash_iterate(&hash, pins, action, argument);
753 if (!caller_trx)
754 lf_hash_put_pins(pins);
755 return res;
756 }
757
758
iterate(my_hash_walk_action action,void * argument)759 int iterate(my_hash_walk_action action, void *argument)
760 {
761 return iterate(current_trx(), action, argument);
762 }
763
764
765 /**
766 Iterates the hash and eliminates duplicate elements.
767
768 @sa iterate()
769 */
770
iterate_no_dups(trx_t * caller_trx,my_hash_walk_action action,void * argument)771 int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action,
772 void *argument)
773 {
774 eliminate_duplicates_arg arg(size() + 32, action, argument);
775 return iterate(caller_trx, reinterpret_cast<my_hash_walk_action>
776 (eliminate_duplicates), &arg);
777 }
778
779
iterate_no_dups(my_hash_walk_action action,void * argument)780 int iterate_no_dups(my_hash_walk_action action, void *argument)
781 {
782 return iterate_no_dups(current_trx(), action, argument);
783 }
784 };
785
786
787 /** The transaction system central memory data structure. */
788 class trx_sys_t
789 {
790 /**
791 The smallest number not yet assigned as a transaction id or transaction
792 number. Accessed and updated with atomic operations.
793 */
794 MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id;
795
796
797 /**
798 Solves race conditions between register_rw() and snapshot_ids() as well as
799 race condition between assign_new_trx_no() and snapshot_ids().
800
801 @sa register_rw()
802 @sa assign_new_trx_no()
803 @sa snapshot_ids()
804 */
805 MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version;
806
807
808 bool m_initialised;
809
810 public:
811 /**
812 TRX_RSEG_HISTORY list length (number of committed transactions to purge)
813 */
814 MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len;
815
816 /** Mutex protecting trx_list. */
817 MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
818
819 /** List of all transactions. */
820 MY_ALIGNED(CACHE_LINE_SIZE) trx_ut_list_t trx_list;
821
822 MY_ALIGNED(CACHE_LINE_SIZE)
823 /** Temporary rollback segments */
824 trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS];
825
826 MY_ALIGNED(CACHE_LINE_SIZE)
827 trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
828 /*!< Pointer array to rollback
829 segments; NULL if slot not in use;
830 created and destroyed in
831 single-threaded mode; not protected
832 by any mutex, because it is read-only
833 during multi-threaded operation */
834
835 /**
836 Lock-free hash of in memory read-write transactions.
    Works faster when it is on its own cache line (tested).
838 */
839
840 MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
841
842
843 #ifdef WITH_WSREP
844 /** Latest recovered XID during startup */
845 XID recovered_wsrep_xid;
846 #endif
847 /** Latest recovered binlog offset */
848 uint64_t recovered_binlog_offset;
849 /** Latest recovered binlog file name */
850 char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
851 /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
852 lsn_t recovered_binlog_lsn;
853
854
855 /**
856 Constructor.
857
858 Some members may require late initialisation, thus we just mark object as
859 uninitialised. Real initialisation happens in create().
860 */
861
trx_sys_t()862 trx_sys_t(): m_initialised(false) {}
863
864
865 /**
866 Returns the minimum trx id in rw trx list.
867
868 This is the smallest id for which the trx can possibly be active. (But, you
869 must look at the trx->state to find out if the minimum trx id transaction
870 itself is active, or already committed.)
871
872 @return the minimum trx id, or m_max_trx_id if the trx list is empty
873 */
874
get_min_trx_id()875 trx_id_t get_min_trx_id()
876 {
877 trx_id_t id= get_max_trx_id();
878 rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
879 (get_min_trx_id_callback), &id);
880 return id;
881 }
882
883
884 /**
885 Determines the maximum transaction id.
886
887 @return maximum currently allocated trx id; will be stale after the
888 next call to trx_sys.get_new_trx_id()
889 */
890
get_max_trx_id()891 trx_id_t get_max_trx_id()
892 {
893 return m_max_trx_id;
894 }
895
896
897 /**
898 Allocates a new transaction id.
899 @return new, allocated trx id
900 */
901
get_new_trx_id()902 trx_id_t get_new_trx_id()
903 {
904 trx_id_t id= get_new_trx_id_no_refresh();
905 refresh_rw_trx_hash_version();
906 return id;
907 }
908
909
910 /**
911 Allocates and assigns new transaction serialisation number.
912
913 There's a gap between m_max_trx_id increment and transaction serialisation
914 number becoming visible through rw_trx_hash. While we're in this gap
915 concurrent thread may come and do MVCC snapshot without seeing allocated
916 but not yet assigned serialisation number. Then at some point purge thread
917 may clone this view. As a result it won't see newly allocated serialisation
918 number and may remove "unnecessary" history data of this transaction from
919 rollback segments.
920
921 m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
922 to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
923 means that all transaction serialisation numbers up to m_max_trx_id are
924 available through rw_trx_hash.
925
926 We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
927 that m_rw_trx_hash_version increment happens after
928 trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
929
930 @param trx transaction
931 */
assign_new_trx_no(trx_t * trx)932 void assign_new_trx_no(trx_t *trx)
933 {
934 trx->no= get_new_trx_id_no_refresh();
935 trx->rw_trx_hash_element->no= trx->no;
936 refresh_rw_trx_hash_version();
937 }
938
939
940 /**
941 Takes MVCC snapshot.
942
943 To reduce malloc probablility we reserve rw_trx_hash.size() + 32 elements
944 in ids.
945
946 For details about get_rw_trx_hash_version() != get_max_trx_id() spin
947 @sa register_rw() and @sa assign_new_trx_no().
948
949 We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
950 that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
951
952 To optimise snapshot creation rw_trx_hash.iterate() is being used instead
953 of rw_trx_hash.iterate_no_dups(). It means that some transaction
954 identifiers may appear multiple times in ids.
955
956 @param[in,out] caller_trx used to get access to rw_trx_hash_pins
957 @param[out] ids array to store registered transaction identifiers
958 @param[out] max_trx_id variable to store m_max_trx_id value
@param[out] min_trx_no variable to store min(trx->no) value
960 */
961
snapshot_ids(trx_t * caller_trx,trx_ids_t * ids,trx_id_t * max_trx_id,trx_id_t * min_trx_no)962 void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
963 trx_id_t *min_trx_no)
964 {
965 ut_ad(!mutex_own(&mutex));
966 snapshot_ids_arg arg(ids);
967
968 while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
969 ut_delay(1);
970 arg.m_no= arg.m_id;
971
972 ids->clear();
973 ids->reserve(rw_trx_hash.size() + 32);
974 rw_trx_hash.iterate(caller_trx,
975 reinterpret_cast<my_hash_walk_action>(copy_one_id),
976 &arg);
977
978 *max_trx_id= arg.m_id;
979 *min_trx_no= arg.m_no;
980 }
981
982
983 /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
init_max_trx_id(trx_id_t value)984 void init_max_trx_id(trx_id_t value)
985 {
986 m_max_trx_id= value;
987 m_rw_trx_hash_version.store(value, std::memory_order_relaxed);
988 }
989
990
is_initialised()991 bool is_initialised() { return m_initialised; }
992
993
994 /** Initialise the transaction subsystem. */
995 void create();
996
997 /** Close the transaction subsystem on shutdown. */
998 void close();
999
1000 /** @return total number of active (non-prepared) transactions */
1001 ulint any_active_transactions();
1002
1003
1004 /**
1005 Registers read-write transaction.
1006
1007 Transaction becomes visible to MVCC.
1008
1009 There's a gap between m_max_trx_id increment and transaction becoming
1010 visible through rw_trx_hash. While we're in this gap concurrent thread may
1011 come and do MVCC snapshot. As a result concurrent read view will be able to
1012 observe records owned by this transaction even before it was committed.
1013
1014 m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
1015 to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
1016 means that all transactions up to m_max_trx_id are available through
1017 rw_trx_hash.
1018
1019 We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
1020 that m_rw_trx_hash_version increment happens after transaction becomes
1021 visible through rw_trx_hash.
1022 */
1023
register_rw(trx_t * trx)1024 void register_rw(trx_t *trx)
1025 {
1026 trx->id= get_new_trx_id_no_refresh();
1027 rw_trx_hash.insert(trx);
1028 refresh_rw_trx_hash_version();
1029 }
1030
1031
1032 /**
1033 Deregisters read-write transaction.
1034
1035 Transaction is removed from rw_trx_hash, which releases all implicit locks.
1036 MVCC snapshot won't see this transaction anymore.
1037 */
1038
deregister_rw(trx_t * trx)1039 void deregister_rw(trx_t *trx)
1040 {
1041 rw_trx_hash.erase(trx);
1042 }
1043
1044
is_registered(trx_t * caller_trx,trx_id_t id)1045 bool is_registered(trx_t *caller_trx, trx_id_t id)
1046 {
1047 return id && find(caller_trx, id, false);
1048 }
1049
1050
1051 trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true)
1052 {
1053 return rw_trx_hash.find(caller_trx, id, do_ref_count);
1054 }
1055
1056
1057 /**
1058 Registers transaction in trx_sys.
1059
1060 @param trx transaction
1061 */
register_trx(trx_t * trx)1062 void register_trx(trx_t *trx)
1063 {
1064 mutex_enter(&mutex);
1065 UT_LIST_ADD_FIRST(trx_list, trx);
1066 mutex_exit(&mutex);
1067 }
1068
1069
1070 /**
1071 Deregisters transaction in trx_sys.
1072
1073 @param trx transaction
1074 */
deregister_trx(trx_t * trx)1075 void deregister_trx(trx_t *trx)
1076 {
1077 mutex_enter(&mutex);
1078 UT_LIST_REMOVE(trx_list, trx);
1079 mutex_exit(&mutex);
1080 }
1081
1082
1083 /**
1084 Clones the oldest view and stores it in view.
1085
1086 No need to call ReadView::close(). The caller owns the view that is passed
1087 in. This function is called by purge thread to determine whether it should
1088 purge the delete marked record or not.
1089 */
1090 void clone_oldest_view();
1091
1092
1093 /** @return the number of active views */
view_count()1094 size_t view_count() const
1095 {
1096 size_t count= 0;
1097
1098 mutex_enter(&mutex);
1099 for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
1100 trx= UT_LIST_GET_NEXT(trx_list, trx))
1101 {
1102 if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN)
1103 ++count;
1104 }
1105 mutex_exit(&mutex);
1106 return count;
1107 }
1108
1109 private:
get_min_trx_id_callback(rw_trx_hash_element_t * element,trx_id_t * id)1110 static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
1111 trx_id_t *id)
1112 {
1113 if (element->id < *id)
1114 {
1115 mutex_enter(&element->mutex);
1116 /* We don't care about read-only transactions here. */
1117 if (element->trx && element->trx->rsegs.m_redo.rseg)
1118 *id= element->id;
1119 mutex_exit(&element->mutex);
1120 }
1121 return 0;
1122 }
1123
1124
1125 struct snapshot_ids_arg
1126 {
snapshot_ids_argsnapshot_ids_arg1127 snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
1128 trx_ids_t *m_ids;
1129 trx_id_t m_id;
1130 trx_id_t m_no;
1131 };
1132
1133
copy_one_id(rw_trx_hash_element_t * element,snapshot_ids_arg * arg)1134 static my_bool copy_one_id(rw_trx_hash_element_t *element,
1135 snapshot_ids_arg *arg)
1136 {
1137 if (element->id < arg->m_id)
1138 {
1139 trx_id_t no= element->no;
1140 arg->m_ids->push_back(element->id);
1141 if (no < arg->m_no)
1142 arg->m_no= no;
1143 }
1144 return 0;
1145 }
1146
1147
1148 /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
get_rw_trx_hash_version()1149 trx_id_t get_rw_trx_hash_version()
1150 {
1151 return m_rw_trx_hash_version.load(std::memory_order_acquire);
1152 }
1153
1154
1155 /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
refresh_rw_trx_hash_version()1156 void refresh_rw_trx_hash_version()
1157 {
1158 m_rw_trx_hash_version.fetch_add(1, std::memory_order_release);
1159 }
1160
1161
1162 /**
1163 Allocates new transaction id without refreshing rw_trx_hash version.
1164
1165 This method is extracted for exclusive use by register_rw() and
1166 assign_new_trx_no() where new id must be allocated atomically with
1167 payload of these methods from MVCC snapshot point of view.
1168
1169 @sa get_new_trx_id()
1170 @sa assign_new_trx_no()
1171
1172 @return new transaction id
1173 */
1174
get_new_trx_id_no_refresh()1175 trx_id_t get_new_trx_id_no_refresh()
1176 {
1177 return m_max_trx_id++;
1178 }
1179 };
1180
1181
1182 /** The transaction system */
1183 extern trx_sys_t trx_sys;
1184
1185 #endif
1186