1 /* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
2 Copyright (c) 2009, 2013, Monty Program Ab
3 Copyright (C) 2012 Percona Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
24
25
26 #include "tc_log.h"
27
28 #include "log.h" // sql_print_error
29 #include "sql_class.h" // THD
30
31 #include "pfs_file_provider.h"
32 #include "mysql/psi/mysql_file.h"
33
34
commit(THD * thd,bool all)35 TC_LOG::enum_result TC_LOG_DUMMY::commit(THD *thd, bool all)
36 {
37 return ha_commit_low(thd, all) ? RESULT_ABORTED : RESULT_SUCCESS;
38 }
39
40
rollback(THD * thd,bool all)41 int TC_LOG_DUMMY::rollback(THD *thd, bool all)
42 {
43 return ha_rollback_low(thd, all);
44 }
45
46
prepare(THD * thd,bool all)47 int TC_LOG_DUMMY::prepare(THD *thd, bool all)
48 {
49 return ha_prepare_low(thd, all);
50 }
51
52
53 /********* transaction coordinator log for 2pc - mmap() based solution *******/
54
55 /*
  The log consists of a file, mmapped into memory.
  The file is divided into pages of tc_log_page_size bytes.
58 (usable size of the first page is smaller because of log header)
59 there's PAGE control structure for each page
60 each page (or rather PAGE control structure) can be in one of three
61 states - active, syncing, pool.
62 there could be only one page in active or syncing states,
63 but many in pool - pool is fifo queue.
64 usual lifecycle of a page is pool->active->syncing->pool
65 "active" page - is a page where new xid's are logged.
66 the page stays active as long as syncing slot is taken.
67 "syncing" page is being synced to disk. no new xid can be added to it.
68 when the sync is done the page is moved to a pool and an active page
69 becomes "syncing".
70
  The result of such an architecture is a natural "commit grouping" -
  if commits are coming faster than the system can sync, they do not
  stall. Instead, all commits that came since the last sync are
  logged to the same page, and they are all synced with the next -
  one - sync. Thus, though individual commits are delayed, throughput
  is not decreasing.
77
  When an xid is added to an active page, the thread of this xid waits
  on the page's condition until the page is synced. When the syncing slot
  becomes vacant, one of these waiters is awakened to take care of syncing.
  It syncs the page and signals all waiters that the page is synced.
  PAGE::waiters is used to count these waiters, and a page may never
  become active again until waiters==0 (that is, all waiters from the
  previous sync have noticed the sync was completed).
85
86 note, that the page becomes "dirty" and has to be synced only when a
87 new xid is added into it. Removing a xid from a page does not make it
88 dirty - we don't sync removals to disk.
89 */
90
91 ulong tc_log_page_waits= 0;
92
93 #define TC_LOG_HEADER_SIZE (sizeof(tc_log_magic)+1)
94
95 static const char tc_log_magic[]={(char) 254, 0x23, 0x05, 0x74};
96
97 ulong tc_log_max_pages_used=0, tc_log_page_size=0, tc_log_cur_pages_used=0;
98
open(const char * opt_name)99 int TC_LOG_MMAP::open(const char *opt_name)
100 {
101 uint i;
102 bool crashed=FALSE;
103 PAGE *pg;
104
105 assert(total_ha_2pc > 1);
106 assert(opt_name && opt_name[0]);
107
108 tc_log_page_size= my_getpagesize();
109
110 fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
111 if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR, MYF(0))) < 0)
112 {
113 if (my_errno() != ENOENT)
114 goto err;
115 if (using_heuristic_recover())
116 return 1;
117 if ((fd= mysql_file_create(key_file_tclog, logname, CREATE_MODE,
118 O_RDWR, MYF(MY_WME))) < 0)
119 goto err;
120 inited=1;
121 file_length= opt_tc_log_size;
122 if (mysql_file_chsize(fd, file_length, 0, MYF(MY_WME)))
123 goto err;
124 }
125 else
126 {
127 inited= 1;
128 crashed= TRUE;
129 sql_print_information("Recovering after a crash using %s", opt_name);
130 if (tc_heuristic_recover != TC_HEURISTIC_NOT_USED)
131 {
132 sql_print_error("Cannot perform automatic crash recovery when "
133 "--tc-heuristic-recover is used");
134 goto err;
135 }
136 file_length= mysql_file_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME+MY_FAE));
137 if (file_length == MY_FILEPOS_ERROR || file_length % tc_log_page_size)
138 goto err;
139 }
140
141 data= (uchar *)my_mmap(0, (size_t)file_length, PROT_READ|PROT_WRITE,
142 MAP_NOSYNC|MAP_SHARED, fd, 0);
143 if (data == MAP_FAILED)
144 {
145 set_my_errno(errno);
146 goto err;
147 }
148 inited=2;
149
150 npages=(uint)file_length/tc_log_page_size;
151 assert(npages >= 3); // to guarantee non-empty pool
152 if (!(pages=(PAGE *)my_malloc(key_memory_TC_LOG_MMAP_pages,
153 npages*sizeof(PAGE), MYF(MY_WME|MY_ZEROFILL))))
154 goto err;
155 inited=3;
156 for (pg=pages, i=0; i < npages; i++, pg++)
157 {
158 pg->next=pg+1;
159 pg->waiters=0;
160 pg->state=PS_POOL;
161 mysql_cond_init(key_PAGE_cond, &pg->cond);
162 pg->size=pg->free=tc_log_page_size/sizeof(my_xid);
163 pg->start= (my_xid *)(data + i*tc_log_page_size);
164 pg->end= pg->start + pg->size;
165 pg->ptr= pg->start;
166 }
167 pages[0].size=pages[0].free=
168 (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid);
169 pages[0].start=pages[0].end-pages[0].size;
170 pages[npages-1].next=0;
171 inited=4;
172
173 if (crashed && recover())
174 goto err;
175
176 memcpy(data, tc_log_magic, sizeof(tc_log_magic));
177 data[sizeof(tc_log_magic)]= (uchar)total_ha_2pc;
178 my_msync(fd, data, tc_log_page_size, MS_SYNC);
179 inited=5;
180
181 mysql_mutex_init(key_LOCK_tc, &LOCK_tc, MY_MUTEX_INIT_FAST);
182 mysql_cond_init(key_COND_active, &COND_active);
183 mysql_cond_init(key_COND_pool, &COND_pool);
184
185 inited=6;
186
187 syncing= 0;
188 active=pages;
189 pool=pages+1;
190 pool_last_ptr= &pages[npages-1].next;
191
192 return 0;
193
194 err:
195 close();
196 return 1;
197 }
198
199
200 /**
201 Get the total amount of potentially usable slots for XIDs in TC log.
202 */
203
size() const204 uint TC_LOG_MMAP::size() const
205 {
206 return (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid) +
207 (npages - 1) * (tc_log_page_size/sizeof(my_xid));
208 }
209
210
211 /**
212 there is no active page, let's got one from the pool.
213
214 Two strategies here:
215 -# take the first from the pool
216 -# if there're waiters - take the one with the most free space.
217
218 @todo
219 TODO page merging. try to allocate adjacent page first,
220 so that they can be flushed both in one sync
221
222 @returns Pointer to qualifying page or NULL if no page in the
223 pool can be made active.
224 */
225
get_active_from_pool()226 TC_LOG_MMAP::PAGE* TC_LOG_MMAP::get_active_from_pool()
227 {
228 PAGE **best_p= &pool;
229
230 if ((*best_p)->waiters != 0 || (*best_p)->free == 0)
231 {
232 /* if the first page can't be used try second strategy */
233 int best_free=0;
234 PAGE **p= &pool;
235 for (p=&(*p)->next; *p; p=&(*p)->next)
236 {
237 if ((*p)->waiters == 0 && (*p)->free > best_free)
238 {
239 best_free=(*p)->free;
240 best_p=p;
241 }
242 }
243 if (*best_p == NULL || best_free == 0)
244 return NULL;
245 }
246
247 PAGE *new_active= *best_p;
248 if (new_active->free == new_active->size) // we've chosen an empty page
249 {
250 tc_log_cur_pages_used++;
251 set_if_bigger(tc_log_max_pages_used, tc_log_cur_pages_used);
252 }
253
254 *best_p= (*best_p)->next;
255 if (! *best_p)
256 pool_last_ptr= best_p;
257
258 return new_active;
259 }
260
261 /**
262 @todo
263 perhaps, increase log size ?
264 */
overflow()265 void TC_LOG_MMAP::overflow()
266 {
267 /*
268 simple overflow handling - just wait
269 TODO perhaps, increase log size ?
270 let's check the behaviour of tc_log_page_waits first
271 */
272 ulong old_log_page_waits= tc_log_page_waits;
273
274 mysql_cond_wait(&COND_pool, &LOCK_tc);
275
276 if (old_log_page_waits == tc_log_page_waits)
277 {
278 /*
279 When several threads are waiting in overflow() simultaneously
280 we want to increase counter only once and not for each thread.
281 */
282 tc_log_page_waits++;
283 }
284 }
285
286 /**
287 Commit the transaction.
288
289 @note When the TC_LOG inteface was changed, this function was added
290 and uses the functions that were there with the old interface to
291 implement the logic.
292 */
commit(THD * thd,bool all)293 TC_LOG::enum_result TC_LOG_MMAP::commit(THD *thd, bool all)
294 {
295 DBUG_ENTER("TC_LOG_MMAP::commit");
296 ulong cookie= 0;
297 my_xid xid= thd->get_transaction()->xid_state()->get_xid()->get_my_xid();
298
299 if (all && xid)
300 if (!(cookie= log_xid(xid)))
301 DBUG_RETURN(RESULT_ABORTED); // Failed to log the transaction
302
303 /*
304 Acquire a shared lock to block commits until START TRANSACTION WITH
305 CONSISTENT SNAPSHOT completes snapshot creation for all storage engines.
306 */
307 slock();
308 int rc= ha_commit_low(thd, all);
309 sunlock();
310
311 if (rc)
312 DBUG_RETURN(RESULT_INCONSISTENT); // Transaction logged, but not committed
313
314 /* If cookie is non-zero, something was logged */
315 if (cookie)
316 unlog(cookie, xid);
317
318 DBUG_RETURN(RESULT_SUCCESS);
319 }
320
321
rollback(THD * thd,bool all)322 int TC_LOG_MMAP::rollback(THD *thd, bool all)
323 {
324 return ha_rollback_low(thd, all);
325 }
326
327
prepare(THD * thd,bool all)328 int TC_LOG_MMAP::prepare(THD *thd, bool all)
329 {
330 return ha_prepare_low(thd, all);
331 }
332
333
334 /**
335 Record that transaction XID is committed on the persistent storage.
336
337 This function is called in the middle of two-phase commit:
338 First all resources prepare the transaction, then tc_log->log() is called,
339 then all resources commit the transaction, then tc_log->unlog() is called.
340
341 All access to active page is serialized but it's not a problem, as
342 we're assuming that fsync() will be a main bottleneck.
343 That is, parallelizing writes to log pages we'll decrease number of
344 threads waiting for a page, but then all these threads will be waiting
345 for a fsync() anyway
346
347 If tc_log == MYSQL_BIN_LOG then tc_log writes transaction to binlog and
348 records XID in a special Xid_log_event.
349 If tc_log = TC_LOG_MMAP then xid is written in a special memory-mapped
350 log.
351
352 @retval
353 0 - error
354 @retval
355 \# - otherwise, "cookie", a number that will be passed as an argument
356 to unlog() call. tc_log can define it any way it wants,
357 and use for whatever purposes. TC_LOG_MMAP sets it
358 to the position in memory where xid was logged to.
359 */
360
log_xid(my_xid xid)361 ulong TC_LOG_MMAP::log_xid(my_xid xid)
362 {
363 mysql_mutex_lock(&LOCK_tc);
364
365 while (true)
366 {
367 /* If active page is full - just wait... */
368 while (unlikely(active && active->free == 0))
369 mysql_cond_wait(&COND_active, &LOCK_tc);
370
371 /* no active page ? take one from the pool. */
372 if (active == NULL)
373 {
374 active= get_active_from_pool();
375
376 /* There are no pages with free slots? Wait and retry. */
377 if (active == NULL)
378 {
379 overflow();
380 continue;
381 }
382 }
383
384 break;
385 }
386
387 PAGE *p= active;
388 ulong cookie= store_xid_in_empty_slot(xid, p, data);
389 bool err;
390
391 if (syncing)
392 { // somebody's syncing. let's wait
393 err= wait_sync_completion(p);
394 if (p->state != PS_DIRTY) // page was synced
395 {
396 if (p->waiters == 0)
397 mysql_cond_broadcast(&COND_pool); // in case somebody's waiting
398 mysql_mutex_unlock(&LOCK_tc);
399 goto done; // we're done
400 }
401 } // page was not synced! do it now
402 assert(active == p && syncing == NULL);
403 syncing= p; // place is vacant - take it
404 active= NULL; // page is not active anymore
405 mysql_cond_broadcast(&COND_active); // in case somebody's waiting
406 mysql_mutex_unlock(&LOCK_tc);
407 err= sync();
408
409 done:
410 return err ? 0 : cookie;
411 }
412
413
414 /**
415 Write the page data being synchronized to the disk.
416
417 @return
418 @retval false Success
419 @retval true Failure
420 */
sync()421 bool TC_LOG_MMAP::sync()
422 {
423 assert(syncing != active);
424
425 /*
426 sit down and relax - this can take a while...
427 note - no locks are held at this point
428 */
429
430 int err= do_msync_and_fsync(fd, syncing->start,
431 syncing->size*sizeof(my_xid), MS_SYNC);
432
433 mysql_mutex_lock(&LOCK_tc);
434 /* Page is synced. Let's move it to the pool. */
435 *pool_last_ptr= syncing;
436 pool_last_ptr= &(syncing->next);
437 syncing->next= NULL;
438 syncing->state= err ? PS_ERROR : PS_POOL;
439 mysql_cond_broadcast(&COND_pool); // in case somebody's waiting
440
441 /* Wake-up all threads which are waiting for syncing of the same page. */
442 mysql_cond_broadcast(&syncing->cond);
443
444 /* Mark syncing slot as free and wake-up new syncer. */
445 syncing= NULL;
446 if (active)
447 mysql_cond_signal(&active->cond);
448
449 mysql_mutex_unlock(&LOCK_tc);
450 return err != 0;
451 }
452
453 /**
454 erase xid from the page, update page free space counters/pointers.
455 cookie points directly to the memory where xid was logged.
456 */
457
unlog(ulong cookie,my_xid xid)458 void TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
459 {
460 PAGE *p= pages + (cookie / tc_log_page_size);
461 my_xid *x= (my_xid *)(data + cookie);
462
463 assert(*x == xid);
464 assert(x >= p->start && x < p->end);
465 *x= 0;
466
467 mysql_mutex_lock(&LOCK_tc);
468 p->free++;
469 assert(p->free <= p->size);
470 set_if_smaller(p->ptr, x);
471 if (p->free == p->size) // the page is completely empty
472 tc_log_cur_pages_used--;
473 if (p->waiters == 0) // the page is in pool and ready to rock
474 mysql_cond_broadcast(&COND_pool); // ping ... for overflow()
475 mysql_mutex_unlock(&LOCK_tc);
476 }
477
close()478 void TC_LOG_MMAP::close()
479 {
480 uint i;
481 switch (inited) {
482 case 6:
483 mysql_mutex_destroy(&LOCK_tc);
484 mysql_cond_destroy(&COND_pool);
485 // Fall through.
486 case 5:
487 data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
488 // Fall through.
489 case 4:
490 for (i=0; i < npages; i++)
491 {
492 if (pages[i].ptr == 0)
493 break;
494 mysql_cond_destroy(&pages[i].cond);
495 }
496 // Fall through.
497 case 3:
498 my_free(pages);
499 // Fall through.
500 case 2:
501 my_munmap((char*)data, (size_t)file_length);
502 // Fall through.
503 case 1:
504 mysql_file_close(fd, MYF(0));
505 }
506 if (inited>=5) // cannot do in the switch because of Windows
507 mysql_file_delete(key_file_tclog, logname, MYF(MY_WME));
508 inited=0;
509 }
510
recover()511 int TC_LOG_MMAP::recover()
512 {
513 HASH xids;
514 PAGE *p=pages, *end_p=pages+npages;
515
516 if (memcmp(data, tc_log_magic, sizeof(tc_log_magic)))
517 {
518 sql_print_error("Bad magic header in tc log");
519 goto err1;
520 }
521
522 /*
523 the first byte after magic signature is set to current
524 number of storage engines on startup
525 */
526 if (data[sizeof(tc_log_magic)] != total_ha_2pc)
527 {
528 sql_print_error("Recovery failed! You must enable "
529 "exactly %d storage engines that support "
530 "two-phase commit protocol",
531 data[sizeof(tc_log_magic)]);
532 goto err1;
533 }
534
535 if (my_hash_init(&xids, &my_charset_bin, tc_log_page_size/3, 0,
536 sizeof(my_xid), 0, 0, MYF(0),
537 PSI_INSTRUMENT_ME))
538 goto err1;
539
540 for ( ; p < end_p ; p++)
541 {
542 for (my_xid *x=p->start; x < p->end; x++)
543 if (*x && my_hash_insert(&xids, (uchar *)x))
544 goto err2; // OOM
545 }
546
547 if (ha_recover(&xids))
548 goto err2;
549
550 my_hash_free(&xids);
551 memset(data, 0, (size_t)file_length);
552 return 0;
553
554 err2:
555 my_hash_free(&xids);
556 err1:
557 sql_print_error("Crash recovery failed. Either correct the problem "
558 "(if it's, for example, out of memory error) and restart, "
559 "or delete tc log and start mysqld with "
560 "--tc-heuristic-recover={commit|rollback}");
561 return 1;
562 }
563
564 TC_LOG *tc_log;
565 TC_LOG_DUMMY tc_log_dummy;
566 TC_LOG_MMAP tc_log_mmap;
567
using_heuristic_recover()568 bool TC_LOG::using_heuristic_recover()
569 {
570 if (tc_heuristic_recover == TC_HEURISTIC_NOT_USED)
571 return false;
572
573 sql_print_information("Heuristic crash recovery mode");
574 if (ha_recover(0))
575 sql_print_error("Heuristic crash recovery failed");
576 sql_print_information("Please restart mysqld without --tc-heuristic-recover");
577 return true;
578 }
579
580