1 /*****************************************************************************
2 
3 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5 
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9 
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 
18 *****************************************************************************/
19 
20 /**************************************************//**
21 @file include/trx0sys.h
22 Transaction system
23 
24 Created 3/26/1996 Heikki Tuuri
25 *******************************************************/
26 
27 #ifndef trx0sys_h
28 #define trx0sys_h
29 
30 #include "buf0buf.h"
31 #include "fil0fil.h"
32 #include "trx0types.h"
33 #include "mem0mem.h"
34 #include "mtr0mtr.h"
35 #include "ut0byte.h"
36 #include "ut0lst.h"
37 #include "read0types.h"
38 #include "page0types.h"
39 #include "ut0mutex.h"
40 #include "trx0trx.h"
41 #ifdef WITH_WSREP
42 #include "trx0xa.h"
43 #endif /* WITH_WSREP */
44 
45 typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t;
46 
47 /** Checks if a page address is the trx sys header page.
48 @param[in]	page_id	page id
49 @return true if trx sys header page */
trx_sys_hdr_page(const page_id_t & page_id)50 inline bool trx_sys_hdr_page(const page_id_t& page_id)
51 {
52 	return(page_id.space() == TRX_SYS_SPACE
53 	       && page_id.page_no() == TRX_SYS_PAGE_NO);
54 }
55 
56 /*****************************************************************//**
57 Creates and initializes the transaction system at the database creation. */
58 void
59 trx_sys_create_sys_pages(void);
60 /*==========================*/
/** Find an available rollback segment.
@param[in]	sys_header	the TRX_SYS page
@return an unallocated rollback segment slot in the TRX_SYS header
@retval ULINT_UNDEFINED if not found */
65 ulint
66 trx_sys_rseg_find_free(const buf_block_t* sys_header);
67 /** Request the TRX_SYS page.
68 @param[in]	rw	whether to lock the page for writing
69 @return the TRX_SYS page
70 @retval	NULL	if the page cannot be read */
71 inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
72 {
73   buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
74 				    0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
75   ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);)
76   return block;
77 }
78 
79 #ifdef UNIV_DEBUG
80 /* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
81 extern uint			trx_rseg_n_slots_debug;
82 #endif
83 
84 /** Write DB_TRX_ID.
85 @param[out]	db_trx_id	the DB_TRX_ID field to be written to
86 @param[in]	id		transaction ID */
87 UNIV_INLINE
88 void
trx_write_trx_id(byte * db_trx_id,trx_id_t id)89 trx_write_trx_id(byte* db_trx_id, trx_id_t id)
90 {
91 	compile_time_assert(DATA_TRX_ID_LEN == 6);
92 	mach_write_to_6(db_trx_id, id);
93 }
94 
95 /** Read a transaction identifier.
96 @return id */
97 inline
98 trx_id_t
trx_read_trx_id(const byte * ptr)99 trx_read_trx_id(const byte* ptr)
100 {
101 	compile_time_assert(DATA_TRX_ID_LEN == 6);
102 	return(mach_read_from_6(ptr));
103 }
104 
105 #ifdef UNIV_DEBUG
106 /** Check that the DB_TRX_ID in a record is valid.
107 @param[in]	db_trx_id	the DB_TRX_ID column to validate
108 @param[in]	trx_id		the id of the ALTER TABLE transaction */
trx_id_check(const void * db_trx_id,trx_id_t trx_id)109 inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
110 {
111 	trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
112 	ut_ad(id == 0 || id > trx_id);
113 	return true;
114 }
115 #endif
116 
117 /*****************************************************************//**
Updates the offset information about the end of the MySQL binlog entry
which corresponds to the transaction just being committed. In a MySQL
replication slave, this updates the latest master binlog position up to
which replication has proceeded. */
122 void
123 trx_sys_update_mysql_binlog_offset(
124 /*===============================*/
125 	const char*	file_name,/*!< in: MySQL log file name */
126 	int64_t		offset,	/*!< in: position in that log file */
127 	buf_block_t*	sys_header, /*!< in,out: trx sys header */
128 	mtr_t*		mtr);	/*!< in,out: mini-transaction */
129 /** Display the MySQL binlog offset info if it is present in the trx
130 system header. */
131 void
132 trx_sys_print_mysql_binlog_offset();
133 
134 /** Create the rollback segments.
135 @return	whether the creation succeeded */
136 bool
137 trx_sys_create_rsegs();
138 
139 /** The automatically created system rollback segment has this id */
140 #define TRX_SYS_SYSTEM_RSEG_ID	0
141 
142 /** The offset of the transaction system header on the page */
143 #define	TRX_SYS		FSEG_PAGE_DATA
144 
145 /** Transaction system header */
146 /*------------------------------------------------------------- @{ */
147 /** In old versions of InnoDB, this persisted the value of
148 trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
149 the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
150 and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
151 are used instead. The field only exists for the purpose of upgrading
152 from older MySQL or MariaDB versions. */
153 #define	TRX_SYS_TRX_ID_STORE	0
154 #define TRX_SYS_FSEG_HEADER	8	/*!< segment header for the
155 					tablespace segment the trx
156 					system is created into */
157 #define	TRX_SYS_RSEGS		(8 + FSEG_HEADER_SIZE)
158 					/*!< the start of the array of
159 					rollback segment specification
160 					slots */
161 /*------------------------------------------------------------- @} */
162 
163 /** The number of rollback segments; rollback segment id must fit in
164 the 7 bits reserved for it in DB_ROLL_PTR. */
165 #define	TRX_SYS_N_RSEGS			128
166 /** Maximum number of undo tablespaces (not counting the system tablespace) */
167 #define TRX_SYS_MAX_UNDO_SPACES		(TRX_SYS_N_RSEGS - 1)
168 
169 /* Rollback segment specification slot offsets */
170 
171 /** the tablespace ID of an undo log header; starting with
172 MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
173 #define	TRX_SYS_RSEG_SPACE	0
174 /** the page number of an undo log header, or FIL_NULL if unused */
175 #define	TRX_SYS_RSEG_PAGE_NO	4
176 /** Size of a rollback segment specification slot */
177 #define TRX_SYS_RSEG_SLOT_SIZE	8
178 
179 /** Read the tablespace ID of a rollback segment slot.
180 @param[in]	sys_header	TRX_SYS page
181 @param[in]	rseg_id		rollback segment identifier
182 @return	undo tablespace id */
183 inline
184 uint32_t
trx_sysf_rseg_get_space(const buf_block_t * sys_header,ulint rseg_id)185 trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
186 {
187 	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
188 	return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
189 				+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
190 				+ sys_header->frame);
191 }
192 
193 /** Read the page number of a rollback segment slot.
194 @param[in]	sys_header	TRX_SYS page
195 @param[in]	rseg_id		rollback segment identifier
196 @return	undo page number */
197 inline uint32_t
trx_sysf_rseg_get_page_no(const buf_block_t * sys_header,ulint rseg_id)198 trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
199 {
200   ut_ad(rseg_id < TRX_SYS_N_RSEGS);
201   return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
202 			  rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
203 			  sys_header->frame);
204 }
205 
206 /** Maximum length of MySQL binlog file name, in bytes.
207 (Used before MariaDB 10.3.5.) */
208 #define TRX_SYS_MYSQL_LOG_NAME_LEN	512
209 /** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
210 #define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344
211 
212 #if UNIV_PAGE_SIZE_MIN < 4096
213 # error "UNIV_PAGE_SIZE_MIN < 4096"
214 #endif
215 /** The offset of the MySQL binlog offset info in the trx system header */
216 #define TRX_SYS_MYSQL_LOG_INFO		(srv_page_size - 1000)
217 #define	TRX_SYS_MYSQL_LOG_MAGIC_N_FLD	0	/*!< magic number which is
218 						TRX_SYS_MYSQL_LOG_MAGIC_N
219 						if we have valid data in the
220 						MySQL binlog info */
221 #define TRX_SYS_MYSQL_LOG_OFFSET	4	/*!< the 64-bit offset
222 						within that file */
223 #define TRX_SYS_MYSQL_LOG_NAME		12	/*!< MySQL log file name */
224 
225 /** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096
226 
227 0...37 FIL_HEADER
228 38...45 TRX_SYS_TRX_ID_STORE
229 46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10)
230 56      TRX_SYS_RSEGS
231   56...59  TRX_SYS_RSEG_SPACE       for slot 0
232   60...63  TRX_SYS_RSEG_PAGE_NO     for slot 0
233   64...67  TRX_SYS_RSEG_SPACE       for slot 1
234   68...71  TRX_SYS_RSEG_PAGE_NO     for slot 1
235 ....
236  594..597  TRX_SYS_RSEG_SPACE       for slot 72
237  598..601  TRX_SYS_RSEG_PAGE_NO     for slot 72
238 ...
239   ...1063  TRX_SYS_RSEG_PAGE_NO     for slot 126
240 
241 (srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace
242 space_id, page_no pairs :::)
243 596 TRX_SYS_WSREP_XID_INFO             TRX_SYS_WSREP_XID_MAGIC_N_FLD
244 600 TRX_SYS_WSREP_XID_FORMAT
245 604 TRX_SYS_WSREP_XID_GTRID_LEN
246 608 TRX_SYS_WSREP_XID_BQUAL_LEN
247 612 TRX_SYS_WSREP_XID_DATA   (len = 128)
248 739 TRX_SYS_WSREP_XID_DATA_END
249 
250 FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
251 (srv_page_size-2500)
252 1596 TRX_SYS_WSREP_XID_INFO             TRX_SYS_WSREP_XID_MAGIC_N_FLD
253 1600 TRX_SYS_WSREP_XID_FORMAT
254 1604 TRX_SYS_WSREP_XID_GTRID_LEN
255 1608 TRX_SYS_WSREP_XID_BQUAL_LEN
256 1612 TRX_SYS_WSREP_XID_DATA   (len = 128)
257 1739 TRX_SYS_WSREP_XID_DATA_END
258 
259 (srv_page_size - 2000 MYSQL MASTER LOG)
260 2096   TRX_SYS_MYSQL_MASTER_LOG_INFO   TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
261 2100   TRX_SYS_MYSQL_LOG_OFFSET_HIGH
262 2104   TRX_SYS_MYSQL_LOG_OFFSET_LOW
263 2108   TRX_SYS_MYSQL_LOG_NAME
264 
265 (srv_page_size - 1000 MYSQL LOG)
266 3096   TRX_SYS_MYSQL_LOG_INFO          TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
267 3100   TRX_SYS_MYSQL_LOG_OFFSET_HIGH
268 3104   TRX_SYS_MYSQL_LOG_OFFSET_LOW
269 3108   TRX_SYS_MYSQL_LOG_NAME
270 
271 (srv_page_size - 200 DOUBLEWRITE)
272 3896   TRX_SYS_DOUBLEWRITE		TRX_SYS_DOUBLEWRITE_FSEG
273 3906         TRX_SYS_DOUBLEWRITE_MAGIC
274 3910         TRX_SYS_DOUBLEWRITE_BLOCK1
275 3914         TRX_SYS_DOUBLEWRITE_BLOCK2
276 3918         TRX_SYS_DOUBLEWRITE_REPEAT
277 3930         TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N
278 
279 (srv_page_size - 8, TAILER)
4088..4095	FIL_TAILER
281 
282 */
283 #ifdef WITH_WSREP
284 /** The offset to WSREP XID headers (used before MariaDB 10.3.5) */
285 #define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
286 #define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
287 #define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265
288 
289 /** XID field: formatID, gtrid_len, bqual_len, xid_data */
290 #define TRX_SYS_WSREP_XID_LEN        (4 + 4 + 4 + XIDDATASIZE)
291 #define TRX_SYS_WSREP_XID_FORMAT     4
292 #define TRX_SYS_WSREP_XID_GTRID_LEN  8
293 #define TRX_SYS_WSREP_XID_BQUAL_LEN 12
294 #define TRX_SYS_WSREP_XID_DATA      16
295 #endif /* WITH_WSREP*/
296 
297 /** Doublewrite buffer */
298 /* @{ */
299 /** The offset of the doublewrite buffer header on the trx system header page */
300 #define TRX_SYS_DOUBLEWRITE		(srv_page_size - 200)
301 /*-------------------------------------------------------------*/
302 #define TRX_SYS_DOUBLEWRITE_FSEG	0	/*!< fseg header of the fseg
303 						containing the doublewrite
304 						buffer */
305 #define TRX_SYS_DOUBLEWRITE_MAGIC	FSEG_HEADER_SIZE
306 						/*!< 4-byte magic number which
307 						shows if we already have
308 						created the doublewrite
309 						buffer */
310 #define TRX_SYS_DOUBLEWRITE_BLOCK1	(4 + FSEG_HEADER_SIZE)
311 						/*!< page number of the
312 						first page in the first
313 						sequence of 64
314 						(= FSP_EXTENT_SIZE) consecutive
315 						pages in the doublewrite
316 						buffer */
317 #define TRX_SYS_DOUBLEWRITE_BLOCK2	(8 + FSEG_HEADER_SIZE)
318 						/*!< page number of the
319 						first page in the second
320 						sequence of 64 consecutive
321 						pages in the doublewrite
322 						buffer */
323 #define TRX_SYS_DOUBLEWRITE_REPEAT	12	/*!< we repeat
324 						TRX_SYS_DOUBLEWRITE_MAGIC,
325 						TRX_SYS_DOUBLEWRITE_BLOCK1,
326 						TRX_SYS_DOUBLEWRITE_BLOCK2
327 						so that if the trx sys
328 						header is half-written
329 						to disk, we still may
330 						be able to recover the
331 						information */
332 /** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
333 we must reset the doublewrite buffer, because starting from 4.1.x the
334 space id of a data page is stored into
335 FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
336 #define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
337 
338 /*-------------------------------------------------------------*/
339 /** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
340 constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
341 /** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
342 constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
343 
344 /** Size of the doublewrite block in pages */
345 #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE	FSP_EXTENT_SIZE
346 /* @} */
347 
348 trx_t* current_trx();
349 
350 struct rw_trx_hash_element_t
351 {
rw_trx_hash_element_trw_trx_hash_element_t352   rw_trx_hash_element_t(): trx(0)
353   {
354     mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
355   }
356 
357 
~rw_trx_hash_element_trw_trx_hash_element_t358   ~rw_trx_hash_element_t()
359   {
360     mutex_free(&mutex);
361   }
362 
363 
364   trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
365   Atomic_counter<trx_id_t> no;
366   trx_t *trx;
367   ib_mutex_t mutex;
368 };
369 
370 
/**
  Wrapper around LF_HASH to store the set of in-memory read-write
  transactions.

  Each entry is a rw_trx_hash_element_t keyed by its id field. The private
  static functions are callbacks that init() and destroy() install into the
  LF_HASH; the public methods are the interface offered to callers.
*/

class rw_trx_hash_t
{
  LF_HASH hash;


  /**
    Constructor callback for lock-free allocator.

    Object is just allocated and is not yet accessible via rw_trx_hash by
    concurrent threads. Object can be reused multiple times before it is freed.
    Every time object is being reused initializer() callback is called.

    @param arg  start of the allocated node; the element is constructed
                LF_HASH_OVERHEAD bytes into it
  */

  static void rw_trx_hash_constructor(uchar *arg)
  {
    new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
  }


  /**
    Destructor callback for lock-free allocator.

    Object is about to be freed and is not accessible via rw_trx_hash by
    concurrent threads.

    @param arg  start of the allocated node; the element to destroy lives
                LF_HASH_OVERHEAD bytes into it
  */

  static void rw_trx_hash_destructor(uchar *arg)
  {
    reinterpret_cast<rw_trx_hash_element_t*>
      (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
  }


  /**
    Destructor callback for lock-free allocator.

    This destructor is used at shutdown (it is installed by destroy()). It
    frees remaining transaction objects with trx_free_at_shutdown() before
    destroying the element itself.

    XA PREPARED transactions may remain if they haven't been committed or
    rolled back. ACTIVE transactions may remain if startup was interrupted or
    server is running in read-only mode or for certain srv_force_recovery
    levels.

    @param arg  start of the allocated node; the element lives
                LF_HASH_OVERHEAD bytes into it
  */

  static void rw_trx_hash_shutdown_destructor(uchar *arg)
  {
    rw_trx_hash_element_t *element=
      reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
    if (trx_t *trx= element->trx)
    {
      /* Debug check: only the leftover states described above are expected */
      ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
            trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
            (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
             (!srv_was_started ||
              srv_read_only_mode ||
              srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
      trx_free_at_shutdown(trx);
    }
    element->~rw_trx_hash_element_t();
  }


  /**
    Initializer callback for lock-free hash.

    Object is not yet accessible via rw_trx_hash by concurrent threads, but is
    about to become such. Object id can be changed only by this callback and
    remains the same until all pins to this object are released.

    Object trx can be changed to 0 by erase() under object mutex protection,
    which indicates it is about to be removed from lock-free hash and become
    not accessible by concurrent threads.

    The callback attaches trx to the element, copies trx->id into the element,
    resets the element's no to TRX_ID_MAX and stores a back pointer to the
    element in trx->rw_trx_hash_element.

    @param element  element being (re)initialised; its trx must be 0
    @param trx      the transaction that insert() passed to lf_hash_insert()
  */

  static void rw_trx_hash_initializer(LF_HASH *,
                                      rw_trx_hash_element_t *element,
                                      trx_t *trx)
  {
    ut_ad(element->trx == 0);
    element->trx= trx;
    element->id= trx->id;
    element->no= TRX_ID_MAX;
    trx->rw_trx_hash_element= element;
  }


  /**
    Gets LF_HASH pins.

    Pins are used to protect object from being destroyed or reused. They are
    stored in the trx object for quick access: they are allocated on the first
    call for a given trx and cached in trx->rw_trx_hash_pins until put_pins()
    releases them.

    This method dereferences trx unconditionally, so trx must not be NULL.
    Callers that have no trx at hand are handled elsewhere: find() and
    iterate() allocate temporary pins when caller_trx is NULL, and the
    iterate()/iterate_no_dups() overloads without a caller_trx parameter
    obtain the transaction from current_trx().

    @param trx  transaction whose pins are wanted
    @return pins of trx (never NULL; allocation failure trips ut_a())
  */

  LF_PINS *get_pins(trx_t *trx)
  {
    if (!trx->rw_trx_hash_pins)
    {
      trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
      ut_a(trx->rw_trx_hash_pins);
    }
    return trx->rw_trx_hash_pins;
  }


  /**
    Argument bundle handed to eliminate_duplicates() by iterate_no_dups():
    the ids already reported plus the caller's real action and argument.
  */
  struct eliminate_duplicates_arg
  {
    trx_ids_t ids;              /* ids of elements already passed to action */
    my_hash_walk_action action; /* caller supplied callback */
    void *argument;             /* caller supplied opaque argument */
    /* Reserves room for size ids up front. */
    eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg):
      action(act), argument(arg) { ids.reserve(size); }
  };


  /**
    Iteration callback that filters out elements whose id was already seen.

    Performs a linear search of arg->ids. An already seen id is skipped by
    returning 0; otherwise the id is remembered and the element is forwarded
    to the caller's action.

    @param element  element visited by the iteration
    @param arg      filter state and the wrapped action
    @return 0 for a duplicate, otherwise whatever the wrapped action returns
  */
  static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
                                      eliminate_duplicates_arg *arg)
  {
    for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
    {
      if (*it == element->id)
        return 0;
    }
    arg->ids.push_back(element->id);
    return arg->action(element, arg->argument);
  }


#ifdef UNIV_DEBUG
  /**
    Debug-only consistency checks for a transaction that is, or is about to
    be, referenced from the hash. Used by find(), insert(), erase() and
    debug_iterator().

    @param trx  transaction to check
  */
  static void validate_element(trx_t *trx)
  {
    ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
    ut_ad(!trx->is_autocommit_non_locking());
    /* trx->state can be anything except TRX_STATE_NOT_STARTED */
    mutex_enter(&trx->mutex);
    ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
          trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
          trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
          trx_state_eq(trx, TRX_STATE_PREPARED));
    mutex_exit(&trx->mutex);
  }


  /** The caller's real action and argument, wrapped by debug_iterator(). */
  struct debug_iterator_arg
  {
    my_hash_walk_action action;
    void *argument;
  };


  /**
    Debug-only iteration callback: validates the element's transaction (if
    any) under the element mutex, then forwards to the wrapped action.

    @param element  element visited by the iteration
    @param arg      the wrapped action and its argument
    @return whatever the wrapped action returns
  */
  static my_bool debug_iterator(rw_trx_hash_element_t *element,
                                debug_iterator_arg *arg)
  {
    mutex_enter(&element->mutex);
    if (element->trx)
      validate_element(element->trx);
    mutex_exit(&element->mutex);
    return arg->action(element, arg->argument);
  }
#endif


public:
  /**
    Initialises the hash and installs the allocator constructor/destructor
    and the element initializer callbacks.

    Elements are sizeof(rw_trx_hash_element_t) bytes; the lookup key is
    sizeof(trx_id_t) bytes long and lf_hash_init() is told to find it at
    offset 0 of the element (rw_trx_hash_element_t::id).
  */
  void init()
  {
    lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
                 sizeof(trx_id_t), 0, &my_charset_bin);
    hash.alloc.constructor= rw_trx_hash_constructor;
    hash.alloc.destructor= rw_trx_hash_destructor;
    hash.initializer=
      reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
  }


  /**
    Destroys the hash. The shutdown destructor is installed first so that
    transactions still attached to elements are freed as well.
  */
  void destroy()
  {
    hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
    lf_hash_destroy(&hash);
  }


  /**
    Releases LF_HASH pins.

    Must be called by thread that owns trx_t object when the latter is being
    "detached" from thread (e.g. released to the pool by trx_t::free()). Can be
    called earlier if thread is expected not to use rw_trx_hash.

    Since pins are not allowed to be transferred to another thread,
    initialisation thread calls this for recovered transactions.

    Does nothing if trx currently holds no pins.

    @param trx  transaction whose cached pins are released
  */

  void put_pins(trx_t *trx)
  {
    if (trx->rw_trx_hash_pins)
    {
      lf_hash_put_pins(trx->rw_trx_hash_pins);
      trx->rw_trx_hash_pins= 0;
    }
  }


  /**
    Finds trx object in lock-free hash with given id.

    Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
    the transaction may get committed before this method returns.

    With do_ref_count == false the caller may dereference returned trx pointer
    only if lock_sys.mutex was acquired before calling find().

    With do_ref_count == true caller may dereference trx even if it is not
    holding lock_sys.mutex. Caller is responsible for calling
    trx->release_reference() when it is done playing with trx. In this mode a
    transaction found in TRX_STATE_COMMITTED_IN_MEMORY is reported as not
    found, and (debug assertion) the caller must not look up its own id.

    Pins are taken from caller_trx when it is given. When caller_trx is NULL,
    pins are allocated only for the duration of the search and released again
    before returning, which is the more expensive path.

    NOTE(review): the following paragraphs are kept from the earlier version
    of this comment and appear to predate the caller_trx parameter - confirm
    and trim:

    Ideally this method should get caller rw_trx_hash_pins along with trx
    object as a parameter, similar to insert() and erase(). However most
    callers lose trx early in their call chains and it is not that easy to pass
    them through.

    So we take more expensive approach: get trx through current_thd()->ha_data.
    Some threads don't have trx attached to THD, and at least server
    initialisation thread, fts_optimize_thread, srv_master_thread,
    dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
    have THD at all. For such cases we allocate pins only for duration of
    search and free them immediately.

    This has negative performance impact and should be fixed eventually (by
    passing caller_trx as a parameter). Still stream of DML is more or less Ok.

    @param caller_trx    transaction of the calling thread used to get pins,
                         or NULL to use temporary pins
    @param trx_id        id to look up; must be nonzero
    @param do_ref_count  whether to take a reference on the returned trx

    @return
      @retval 0 not found
      @retval pointer to trx
  */

  trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count)
  {
    /*
      In MariaDB 10.3, purge will reset DB_TRX_ID to 0
      when the history is lost. Read/write transactions will
      always have a nonzero trx_t::id; there the value 0 is
      reserved for transactions that did not write or lock
      anything yet.

      The caller should already have handled trx_id==0 specially.
    */
    ut_ad(trx_id);
    ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count);

    trx_t *trx= 0;
    LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
    ut_a(pins);

    rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
      (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
                      sizeof(trx_id_t)));
    if (element)
    {
      /*
        The element mutex is acquired before the search pin is dropped;
        erase() clears element->trx under the same mutex.
        NOTE(review): this ordering looks deliberate - do not reorder.
      */
      mutex_enter(&element->mutex);
      lf_hash_search_unpin(pins);
      if ((trx= element->trx)) {
        DBUG_ASSERT(trx_id == trx->id);
        ut_d(validate_element(trx));
        if (do_ref_count)
        {
          /*
            We have an early state check here to avoid committer
            starvation in a wait loop for transaction references,
            when there's a stream of trx_sys.find() calls from other
            threads. The trx->state may change to COMMITTED after
            trx->mutex is released, and it will have to be rechecked
            by the caller after reacquiring the mutex.
          */
          trx_mutex_enter(trx);
          const trx_state_t state= trx->state;
          trx_mutex_exit(trx);
          if (state == TRX_STATE_COMMITTED_IN_MEMORY)
            trx= NULL;
          else
            trx->reference();
        }
      }
      mutex_exit(&element->mutex);
    }
    /* Temporary pins (no caller_trx) are given back right away */
    if (!caller_trx)
      lf_hash_put_pins(pins);
    return trx;
  }


  /**
    Inserts trx to lock-free hash.

    Object becomes accessible via rw_trx_hash. The trx pointer itself is
    passed to lf_hash_insert(); rw_trx_hash_initializer() fills the element
    from it. A failed insert trips ut_a().

    @param trx  transaction to insert; its pins are used for the operation
  */

  void insert(trx_t *trx)
  {
    ut_d(validate_element(trx));
    int res= lf_hash_insert(&hash, get_pins(trx),
                            reinterpret_cast<void*>(trx));
    ut_a(res == 0);
  }


  /**
    Removes trx from lock-free hash.

    Object becomes not accessible via rw_trx_hash. But it still can be pinned
    by concurrent find(), which is supposed to release it immediately after
    it sees object trx is 0.

    The element's trx pointer is cleared under the element mutex before the
    element is deleted from the hash by trx->id. A failed delete trips ut_a().

    @param trx  transaction to remove; its pins are used for the operation
  */

  void erase(trx_t *trx)
  {
    ut_d(validate_element(trx));
    mutex_enter(&trx->rw_trx_hash_element->mutex);
    trx->rw_trx_hash_element->trx= 0;
    mutex_exit(&trx->rw_trx_hash_element->mutex);
    int res= lf_hash_delete(&hash, get_pins(trx),
                            reinterpret_cast<const void*>(&trx->id),
                            sizeof(trx_id_t));
    ut_a(res == 0);
  }


  /**
    Returns the number of elements in the hash.

    The number is exact only if hash is protected against concurrent
    modifications (e.g. single threaded startup or hash is protected
    by some mutex). Otherwise the number may be used as a hint only,
    because it may change even before this method returns.
  */

  uint32_t size() { return uint32_t(lf_hash_size(&hash)); }


  /**
    Iterates the hash.

    @param caller_trx  used to get/set pins; if NULL, temporary pins are
                       allocated for the duration of the iteration
    @param action      called for every element in hash
    @param argument    opaque argument passed to action

    May return the same element multiple times if hash is under contention.
    If caller doesn't like to see the same transaction multiple times, it has
    to call iterate_no_dups() instead.

    May return element with committed transaction. If caller doesn't like to
    see committed transactions, it has to skip those under element mutex:

      mutex_enter(&element->mutex);
      if (trx_t *trx= element->trx)
      {
        // trx is protected against commit in this branch
      }
      mutex_exit(&element->mutex);

    May miss concurrently inserted transactions.

    In debug builds every element is additionally validated by
    debug_iterator() before action is called.

    @return
      @retval 0 iteration completed successfully
      @retval 1 iteration was interrupted (action returned 1)
  */

  int iterate(trx_t *caller_trx, my_hash_walk_action action, void *argument)
  {
    LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
    ut_a(pins);
#ifdef UNIV_DEBUG
    /* Wrap the caller's action so that each element gets validated first */
    debug_iterator_arg debug_arg= { action, argument };
    action= reinterpret_cast<my_hash_walk_action>(debug_iterator);
    argument= &debug_arg;
#endif
    int res= lf_hash_iterate(&hash, pins, action, argument);
    /* Temporary pins (no caller_trx) are given back right away */
    if (!caller_trx)
      lf_hash_put_pins(pins);
    return res;
  }


  /**
    Iterates the hash on behalf of the transaction returned by current_trx().

    @sa iterate(trx_t*, my_hash_walk_action, void*)
  */
  int iterate(my_hash_walk_action action, void *argument)
  {
    return iterate(current_trx(), action, argument);
  }


  /**
    Iterates the hash and eliminates duplicate elements.

    The caller's action is wrapped by eliminate_duplicates(), which remembers
    the ids it has already forwarded; room for size() + 32 ids is reserved
    up front.

    @sa iterate()
  */

  int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action,
                      void *argument)
  {
    eliminate_duplicates_arg arg(size() + 32, action, argument);
    return iterate(caller_trx, reinterpret_cast<my_hash_walk_action>
                   (eliminate_duplicates), &arg);
  }


  /**
    Iterates the hash without duplicates on behalf of the transaction
    returned by current_trx().

    @sa iterate_no_dups(trx_t*, my_hash_walk_action, void*)
  */
  int iterate_no_dups(my_hash_walk_action action, void *argument)
  {
    return iterate_no_dups(current_trx(), action, argument);
  }
};
785 
786 
787 /** The transaction system central memory data structure. */
788 class trx_sys_t
789 {
790   /**
791     The smallest number not yet assigned as a transaction id or transaction
792     number. Accessed and updated with atomic operations.
793   */
794   MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id;
795 
796 
797   /**
798     Solves race conditions between register_rw() and snapshot_ids() as well as
799     race condition between assign_new_trx_no() and snapshot_ids().
800 
801     @sa register_rw()
802     @sa assign_new_trx_no()
803     @sa snapshot_ids()
804   */
805   MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version;
806 
807 
808   bool m_initialised;
809 
810 public:
811   /**
812     TRX_RSEG_HISTORY list length (number of committed transactions to purge)
813   */
814   MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len;
815 
816   /** Mutex protecting trx_list. */
817   MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
818 
819   /** List of all transactions. */
820   MY_ALIGNED(CACHE_LINE_SIZE) trx_ut_list_t trx_list;
821 
822 	MY_ALIGNED(CACHE_LINE_SIZE)
823 	/** Temporary rollback segments */
824 	trx_rseg_t*	temp_rsegs[TRX_SYS_N_RSEGS];
825 
826 	MY_ALIGNED(CACHE_LINE_SIZE)
827 	trx_rseg_t*	rseg_array[TRX_SYS_N_RSEGS];
828 					/*!< Pointer array to rollback
829 					segments; NULL if slot not in use;
830 					created and destroyed in
831 					single-threaded mode; not protected
832 					by any mutex, because it is read-only
833 					during multi-threaded operation */
834 
  /**
    Lock-free hash of in memory read-write transactions.
    Works faster when it is on its own cache line (tested).
  */
839 
840   MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
841 
842 
843 #ifdef WITH_WSREP
844   /** Latest recovered XID during startup */
845   XID recovered_wsrep_xid;
846 #endif
847   /** Latest recovered binlog offset */
848   uint64_t recovered_binlog_offset;
849   /** Latest recovered binlog file name */
850   char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
851   /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
852   lsn_t recovered_binlog_lsn;
853 
854 
855   /**
856     Constructor.
857 
858     Some members may require late initialisation, thus we just mark object as
859     uninitialised. Real initialisation happens in create().
860   */
861 
trx_sys_t()862   trx_sys_t(): m_initialised(false) {}
863 
864 
865   /**
866     Returns the minimum trx id in rw trx list.
867 
868     This is the smallest id for which the trx can possibly be active. (But, you
869     must look at the trx->state to find out if the minimum trx id transaction
870     itself is active, or already committed.)
871 
872     @return the minimum trx id, or m_max_trx_id if the trx list is empty
873   */
874 
get_min_trx_id()875   trx_id_t get_min_trx_id()
876   {
877     trx_id_t id= get_max_trx_id();
878     rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
879                         (get_min_trx_id_callback), &id);
880     return id;
881   }
882 
883 
884   /**
885     Determines the maximum transaction id.
886 
887     @return maximum currently allocated trx id; will be stale after the
888             next call to trx_sys.get_new_trx_id()
889   */
890 
get_max_trx_id()891   trx_id_t get_max_trx_id()
892   {
893     return m_max_trx_id;
894   }
895 
896 
897   /**
898     Allocates a new transaction id.
899     @return new, allocated trx id
900   */
901 
get_new_trx_id()902   trx_id_t get_new_trx_id()
903   {
904     trx_id_t id= get_new_trx_id_no_refresh();
905     refresh_rw_trx_hash_version();
906     return id;
907   }
908 
909 
910   /**
911     Allocates and assigns new transaction serialisation number.
912 
913     There's a gap between m_max_trx_id increment and transaction serialisation
914     number becoming visible through rw_trx_hash. While we're in this gap
915     concurrent thread may come and do MVCC snapshot without seeing allocated
916     but not yet assigned serialisation number. Then at some point purge thread
917     may clone this view. As a result it won't see newly allocated serialisation
918     number and may remove "unnecessary" history data of this transaction from
919     rollback segments.
920 
921     m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
922     to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
923     means that all transaction serialisation numbers up to m_max_trx_id are
924     available through rw_trx_hash.
925 
926     We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
927     that m_rw_trx_hash_version increment happens after
928     trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
929 
930     @param trx transaction
931   */
assign_new_trx_no(trx_t * trx)932   void assign_new_trx_no(trx_t *trx)
933   {
934     trx->no= get_new_trx_id_no_refresh();
935     trx->rw_trx_hash_element->no= trx->no;
936     refresh_rw_trx_hash_version();
937   }
938 
939 
940   /**
941     Takes MVCC snapshot.
942 
    To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements
944     in ids.
945 
946     For details about get_rw_trx_hash_version() != get_max_trx_id() spin
947     @sa register_rw() and @sa assign_new_trx_no().
948 
949     We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
950     that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
951 
952     To optimise snapshot creation rw_trx_hash.iterate() is being used instead
953     of rw_trx_hash.iterate_no_dups(). It means that some transaction
954     identifiers may appear multiple times in ids.
955 
956     @param[in,out] caller_trx used to get access to rw_trx_hash_pins
957     @param[out]    ids        array to store registered transaction identifiers
958     @param[out]    max_trx_id variable to store m_max_trx_id value
    @param[out]    min_trx_no variable to store min(trx->no) value
960   */
961 
snapshot_ids(trx_t * caller_trx,trx_ids_t * ids,trx_id_t * max_trx_id,trx_id_t * min_trx_no)962   void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
963                     trx_id_t *min_trx_no)
964   {
965     ut_ad(!mutex_own(&mutex));
966     snapshot_ids_arg arg(ids);
967 
968     while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
969       ut_delay(1);
970     arg.m_no= arg.m_id;
971 
972     ids->clear();
973     ids->reserve(rw_trx_hash.size() + 32);
974     rw_trx_hash.iterate(caller_trx,
975                         reinterpret_cast<my_hash_walk_action>(copy_one_id),
976                         &arg);
977 
978     *max_trx_id= arg.m_id;
979     *min_trx_no= arg.m_no;
980   }
981 
982 
983   /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
init_max_trx_id(trx_id_t value)984   void init_max_trx_id(trx_id_t value)
985   {
986     m_max_trx_id= value;
987     m_rw_trx_hash_version.store(value, std::memory_order_relaxed);
988   }
989 
990 
is_initialised()991   bool is_initialised() { return m_initialised; }
992 
993 
994   /** Initialise the transaction subsystem. */
995   void create();
996 
997   /** Close the transaction subsystem on shutdown. */
998   void close();
999 
1000   /** @return total number of active (non-prepared) transactions */
1001   ulint any_active_transactions();
1002 
1003 
1004   /**
1005     Registers read-write transaction.
1006 
1007     Transaction becomes visible to MVCC.
1008 
1009     There's a gap between m_max_trx_id increment and transaction becoming
1010     visible through rw_trx_hash. While we're in this gap concurrent thread may
1011     come and do MVCC snapshot. As a result concurrent read view will be able to
1012     observe records owned by this transaction even before it was committed.
1013 
1014     m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
1015     to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
1016     means that all transactions up to m_max_trx_id are available through
1017     rw_trx_hash.
1018 
1019     We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
1020     that m_rw_trx_hash_version increment happens after transaction becomes
1021     visible through rw_trx_hash.
1022   */
1023 
register_rw(trx_t * trx)1024   void register_rw(trx_t *trx)
1025   {
1026     trx->id= get_new_trx_id_no_refresh();
1027     rw_trx_hash.insert(trx);
1028     refresh_rw_trx_hash_version();
1029   }
1030 
1031 
1032   /**
1033     Deregisters read-write transaction.
1034 
1035     Transaction is removed from rw_trx_hash, which releases all implicit locks.
1036     MVCC snapshot won't see this transaction anymore.
1037   */
1038 
deregister_rw(trx_t * trx)1039   void deregister_rw(trx_t *trx)
1040   {
1041     rw_trx_hash.erase(trx);
1042   }
1043 
1044 
is_registered(trx_t * caller_trx,trx_id_t id)1045   bool is_registered(trx_t *caller_trx, trx_id_t id)
1046   {
1047     return id && find(caller_trx, id, false);
1048   }
1049 
1050 
  /**
    Looks up a transaction by id in rw_trx_hash.

    Thin wrapper: all arguments are forwarded unchanged to rw_trx_hash.find().

    @param caller_trx   forwarded to rw_trx_hash.find()
    @param id           transaction id to look up
    @param do_ref_count forwarded to rw_trx_hash.find(); defaults to true
    @return the result of rw_trx_hash.find()

    NOTE(review): presumably the result is NULL when the id is not found, and
    the returned transaction is reference counted when do_ref_count is set -
    confirm against rw_trx_hash_t::find().
  */
  trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true)
  {
    return rw_trx_hash.find(caller_trx, id, do_ref_count);
  }
1055 
1056 
1057   /**
1058     Registers transaction in trx_sys.
1059 
1060     @param trx transaction
1061   */
register_trx(trx_t * trx)1062   void register_trx(trx_t *trx)
1063   {
1064     mutex_enter(&mutex);
1065     UT_LIST_ADD_FIRST(trx_list, trx);
1066     mutex_exit(&mutex);
1067   }
1068 
1069 
1070   /**
1071     Deregisters transaction in trx_sys.
1072 
1073     @param trx transaction
1074   */
deregister_trx(trx_t * trx)1075   void deregister_trx(trx_t *trx)
1076   {
1077     mutex_enter(&mutex);
1078     UT_LIST_REMOVE(trx_list, trx);
1079     mutex_exit(&mutex);
1080   }
1081 
1082 
1083   /**
1084     Clones the oldest view and stores it in view.
1085 
1086     No need to call ReadView::close(). The caller owns the view that is passed
1087     in. This function is called by purge thread to determine whether it should
1088     purge the delete marked record or not.
1089   */
1090   void clone_oldest_view();
1091 
1092 
1093   /** @return the number of active views */
view_count()1094   size_t view_count() const
1095   {
1096     size_t count= 0;
1097 
1098     mutex_enter(&mutex);
1099     for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
1100          trx= UT_LIST_GET_NEXT(trx_list, trx))
1101     {
1102       if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN)
1103         ++count;
1104     }
1105     mutex_exit(&mutex);
1106     return count;
1107   }
1108 
1109 private:
get_min_trx_id_callback(rw_trx_hash_element_t * element,trx_id_t * id)1110   static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
1111                                          trx_id_t *id)
1112   {
1113     if (element->id < *id)
1114     {
1115       mutex_enter(&element->mutex);
1116       /* We don't care about read-only transactions here. */
1117       if (element->trx && element->trx->rsegs.m_redo.rseg)
1118         *id= element->id;
1119       mutex_exit(&element->mutex);
1120     }
1121     return 0;
1122   }
1123 
1124 
1125   struct snapshot_ids_arg
1126   {
snapshot_ids_argsnapshot_ids_arg1127     snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
1128     trx_ids_t *m_ids;
1129     trx_id_t m_id;
1130     trx_id_t m_no;
1131   };
1132 
1133 
copy_one_id(rw_trx_hash_element_t * element,snapshot_ids_arg * arg)1134   static my_bool copy_one_id(rw_trx_hash_element_t *element,
1135                              snapshot_ids_arg *arg)
1136   {
1137     if (element->id < arg->m_id)
1138     {
1139       trx_id_t no= element->no;
1140       arg->m_ids->push_back(element->id);
1141       if (no < arg->m_no)
1142         arg->m_no= no;
1143     }
1144     return 0;
1145   }
1146 
1147 
1148   /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
get_rw_trx_hash_version()1149   trx_id_t get_rw_trx_hash_version()
1150   {
1151     return m_rw_trx_hash_version.load(std::memory_order_acquire);
1152   }
1153 
1154 
1155   /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
refresh_rw_trx_hash_version()1156   void refresh_rw_trx_hash_version()
1157   {
1158     m_rw_trx_hash_version.fetch_add(1, std::memory_order_release);
1159   }
1160 
1161 
1162   /**
1163     Allocates new transaction id without refreshing rw_trx_hash version.
1164 
1165     This method is extracted for exclusive use by register_rw() and
1166     assign_new_trx_no() where new id must be allocated atomically with
1167     payload of these methods from MVCC snapshot point of view.
1168 
1169     @sa get_new_trx_id()
1170     @sa assign_new_trx_no()
1171 
1172     @return new transaction id
1173   */
1174 
get_new_trx_id_no_refresh()1175   trx_id_t get_new_trx_id_no_refresh()
1176   {
1177     return m_max_trx_id++;
1178   }
1179 };
1180 
1181 
1182 /** The transaction system */
1183 extern trx_sys_t trx_sys;
1184 
1185 #endif
1186