1 /* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
2    Copyright (c) 2009, 2013, Monty Program Ab
3    Copyright (C) 2012 Percona Inc.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License, version 2.0,
7    as published by the Free Software Foundation.
8 
9    This program is also distributed with certain software (including
10    but not limited to OpenSSL) that is licensed under separate terms,
11    as designated in a particular file or component or in included license
12    documentation.  The authors of MySQL hereby grant you an additional
13    permission to link the program and your derivative works with the
14    separately licensed software that they have included with MySQL.
15 
16    This program is distributed in the hope that it will be useful,
17    but WITHOUT ANY WARRANTY; without even the implied warranty of
18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19    GNU General Public License, version 2.0, for more details.
20 
21    You should have received a copy of the GNU General Public License
22    along with this program; if not, write to the Free Software
23    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
24 
25 
26 #include "tc_log.h"
27 
28 #include "log.h"           // sql_print_error
29 #include "sql_class.h"     // THD
30 
31 #include "pfs_file_provider.h"
32 #include "mysql/psi/mysql_file.h"
33 
34 
commit(THD * thd,bool all)35 TC_LOG::enum_result TC_LOG_DUMMY::commit(THD *thd, bool all)
36 {
37   return ha_commit_low(thd, all) ? RESULT_ABORTED : RESULT_SUCCESS;
38 }
39 
40 
rollback(THD * thd,bool all)41 int TC_LOG_DUMMY::rollback(THD *thd, bool all)
42 {
43   return ha_rollback_low(thd, all);
44 }
45 
46 
prepare(THD * thd,bool all)47 int TC_LOG_DUMMY::prepare(THD *thd, bool all)
48 {
49   return ha_prepare_low(thd, all);
50 }
51 
52 
53 /********* transaction coordinator log for 2pc - mmap() based solution *******/
54 
55 /*
  the log consists of a file, mmapped into memory.
  the file is divided into pages of tc_log_page_size bytes.
58   (usable size of the first page is smaller because of log header)
59   there's PAGE control structure for each page
60   each page (or rather PAGE control structure) can be in one of three
61   states - active, syncing, pool.
62   there could be only one page in active or syncing states,
63   but many in pool - pool is fifo queue.
64   usual lifecycle of a page is pool->active->syncing->pool
65   "active" page - is a page where new xid's are logged.
66   the page stays active as long as syncing slot is taken.
67   "syncing" page is being synced to disk. no new xid can be added to it.
68   when the sync is done the page is moved to a pool and an active page
69   becomes "syncing".
70 
  the result of such an architecture is a natural "commit grouping" -
  If commits are coming faster than the system can sync, they do not
  stall. Instead, all commits that came since the last sync are
  logged to the same page, and they all are synced with the next -
  one - sync. Thus, though individual commits are delayed, throughput
  is not decreasing.
77 
78   when a xid is added to an active page, the thread of this xid waits
  for a page's condition until the page is synced. when syncing slot
  becomes vacant one of these waiters is awakened to take care of syncing.
81   it syncs the page and signals all waiters that the page is synced.
82   PAGE::waiters is used to count these waiters, and a page may never
83   become active again until waiters==0 (that is all waiters from the
84   previous sync have noticed the sync was completed)
85 
86   note, that the page becomes "dirty" and has to be synced only when a
87   new xid is added into it. Removing a xid from a page does not make it
88   dirty - we don't sync removals to disk.
89 */
90 
91 ulong tc_log_page_waits= 0;
92 
93 #define TC_LOG_HEADER_SIZE (sizeof(tc_log_magic)+1)
94 
95 static const char tc_log_magic[]={(char) 254, 0x23, 0x05, 0x74};
96 
97 ulong tc_log_max_pages_used=0, tc_log_page_size=0, tc_log_cur_pages_used=0;
98 
open(const char * opt_name)99 int TC_LOG_MMAP::open(const char *opt_name)
100 {
101   uint i;
102   bool crashed=FALSE;
103   PAGE *pg;
104 
105   assert(total_ha_2pc > 1);
106   assert(opt_name && opt_name[0]);
107 
108   tc_log_page_size= my_getpagesize();
109 
110   fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
111   if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR, MYF(0))) < 0)
112   {
113     if (my_errno() != ENOENT)
114       goto err;
115     if (using_heuristic_recover())
116       return 1;
117     if ((fd= mysql_file_create(key_file_tclog, logname, CREATE_MODE,
118                                O_RDWR, MYF(MY_WME))) < 0)
119       goto err;
120     inited=1;
121     file_length= opt_tc_log_size;
122     if (mysql_file_chsize(fd, file_length, 0, MYF(MY_WME)))
123       goto err;
124   }
125   else
126   {
127     inited= 1;
128     crashed= TRUE;
129     sql_print_information("Recovering after a crash using %s", opt_name);
130     if (tc_heuristic_recover != TC_HEURISTIC_NOT_USED)
131     {
132       sql_print_error("Cannot perform automatic crash recovery when "
133                       "--tc-heuristic-recover is used");
134       goto err;
135     }
136     file_length= mysql_file_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME+MY_FAE));
137     if (file_length == MY_FILEPOS_ERROR || file_length % tc_log_page_size)
138       goto err;
139   }
140 
141   data= (uchar *)my_mmap(0, (size_t)file_length, PROT_READ|PROT_WRITE,
142                         MAP_NOSYNC|MAP_SHARED, fd, 0);
143   if (data == MAP_FAILED)
144   {
145     set_my_errno(errno);
146     goto err;
147   }
148   inited=2;
149 
150   npages=(uint)file_length/tc_log_page_size;
151   assert(npages >= 3);             // to guarantee non-empty pool
152   if (!(pages=(PAGE *)my_malloc(key_memory_TC_LOG_MMAP_pages,
153                                 npages*sizeof(PAGE), MYF(MY_WME|MY_ZEROFILL))))
154     goto err;
155   inited=3;
156   for (pg=pages, i=0; i < npages; i++, pg++)
157   {
158     pg->next=pg+1;
159     pg->waiters=0;
160     pg->state=PS_POOL;
161     mysql_cond_init(key_PAGE_cond, &pg->cond);
162     pg->size=pg->free=tc_log_page_size/sizeof(my_xid);
163     pg->start= (my_xid *)(data + i*tc_log_page_size);
164     pg->end= pg->start + pg->size;
165     pg->ptr= pg->start;
166   }
167   pages[0].size=pages[0].free=
168                 (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid);
169   pages[0].start=pages[0].end-pages[0].size;
170   pages[npages-1].next=0;
171   inited=4;
172 
173   if (crashed && recover())
174       goto err;
175 
176   memcpy(data, tc_log_magic, sizeof(tc_log_magic));
177   data[sizeof(tc_log_magic)]= (uchar)total_ha_2pc;
178   my_msync(fd, data, tc_log_page_size, MS_SYNC);
179   inited=5;
180 
181   mysql_mutex_init(key_LOCK_tc, &LOCK_tc, MY_MUTEX_INIT_FAST);
182   mysql_cond_init(key_COND_active, &COND_active);
183   mysql_cond_init(key_COND_pool, &COND_pool);
184 
185   inited=6;
186 
187   syncing= 0;
188   active=pages;
189   pool=pages+1;
190   pool_last_ptr= &pages[npages-1].next;
191 
192   return 0;
193 
194 err:
195   close();
196   return 1;
197 }
198 
199 
200 /**
201   Get the total amount of potentially usable slots for XIDs in TC log.
202 */
203 
size() const204 uint TC_LOG_MMAP::size() const
205 {
206   return (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid) +
207          (npages - 1) * (tc_log_page_size/sizeof(my_xid));
208 }
209 
210 
211 /**
212   there is no active page, let's got one from the pool.
213 
214   Two strategies here:
215     -# take the first from the pool
216     -# if there're waiters - take the one with the most free space.
217 
218   @todo
219     TODO page merging. try to allocate adjacent page first,
220     so that they can be flushed both in one sync
221 
222   @returns Pointer to qualifying page or NULL if no page in the
223            pool can be made active.
224 */
225 
get_active_from_pool()226 TC_LOG_MMAP::PAGE* TC_LOG_MMAP::get_active_from_pool()
227 {
228   PAGE **best_p= &pool;
229 
230   if ((*best_p)->waiters != 0 || (*best_p)->free == 0)
231   {
232     /* if the first page can't be used try second strategy */
233     int best_free=0;
234     PAGE **p= &pool;
235     for (p=&(*p)->next; *p; p=&(*p)->next)
236     {
237       if ((*p)->waiters == 0 && (*p)->free > best_free)
238       {
239         best_free=(*p)->free;
240         best_p=p;
241       }
242     }
243     if (*best_p == NULL || best_free == 0)
244       return NULL;
245   }
246 
247   PAGE *new_active= *best_p;
248   if (new_active->free == new_active->size) // we've chosen an empty page
249   {
250     tc_log_cur_pages_used++;
251     set_if_bigger(tc_log_max_pages_used, tc_log_cur_pages_used);
252   }
253 
254   *best_p= (*best_p)->next;
255   if (! *best_p)
256     pool_last_ptr= best_p;
257 
258   return new_active;
259 }
260 
261 /**
262   @todo
263   perhaps, increase log size ?
264 */
overflow()265 void TC_LOG_MMAP::overflow()
266 {
267   /*
268     simple overflow handling - just wait
269     TODO perhaps, increase log size ?
270     let's check the behaviour of tc_log_page_waits first
271   */
272   ulong old_log_page_waits= tc_log_page_waits;
273 
274   mysql_cond_wait(&COND_pool, &LOCK_tc);
275 
276   if (old_log_page_waits == tc_log_page_waits)
277   {
278     /*
279       When several threads are waiting in overflow() simultaneously
280       we want to increase counter only once and not for each thread.
281     */
282     tc_log_page_waits++;
283   }
284 }
285 
286 /**
287   Commit the transaction.
288 
289   @note When the TC_LOG inteface was changed, this function was added
290   and uses the functions that were there with the old interface to
291   implement the logic.
292  */
commit(THD * thd,bool all)293 TC_LOG::enum_result TC_LOG_MMAP::commit(THD *thd, bool all)
294 {
295   DBUG_ENTER("TC_LOG_MMAP::commit");
296   ulong cookie= 0;
297   my_xid xid= thd->get_transaction()->xid_state()->get_xid()->get_my_xid();
298 
299   if (all && xid)
300     if (!(cookie= log_xid(xid)))
301       DBUG_RETURN(RESULT_ABORTED);    // Failed to log the transaction
302 
303   /*
304     Acquire a shared lock to block commits until START TRANSACTION WITH
305     CONSISTENT SNAPSHOT completes snapshot creation for all storage engines.
306   */
307   slock();
308   int rc= ha_commit_low(thd, all);
309   sunlock();
310 
311   if (rc)
312     DBUG_RETURN(RESULT_INCONSISTENT); // Transaction logged, but not committed
313 
314   /* If cookie is non-zero, something was logged */
315   if (cookie)
316     unlog(cookie, xid);
317 
318   DBUG_RETURN(RESULT_SUCCESS);
319 }
320 
321 
rollback(THD * thd,bool all)322 int TC_LOG_MMAP::rollback(THD *thd, bool all)
323 {
324   return ha_rollback_low(thd, all);
325 }
326 
327 
prepare(THD * thd,bool all)328 int TC_LOG_MMAP::prepare(THD *thd, bool all)
329 {
330   return ha_prepare_low(thd, all);
331 }
332 
333 
334 /**
335   Record that transaction XID is committed on the persistent storage.
336 
337     This function is called in the middle of two-phase commit:
338     First all resources prepare the transaction, then tc_log->log() is called,
339     then all resources commit the transaction, then tc_log->unlog() is called.
340 
341     All access to active page is serialized but it's not a problem, as
342     we're assuming that fsync() will be a main bottleneck.
343     That is, parallelizing writes to log pages we'll decrease number of
344     threads waiting for a page, but then all these threads will be waiting
345     for a fsync() anyway
346 
347    If tc_log == MYSQL_BIN_LOG then tc_log writes transaction to binlog and
348    records XID in a special Xid_log_event.
349    If tc_log = TC_LOG_MMAP then xid is written in a special memory-mapped
350    log.
351 
352   @retval
353     0  - error
354   @retval
355     \# - otherwise, "cookie", a number that will be passed as an argument
356     to unlog() call. tc_log can define it any way it wants,
357     and use for whatever purposes. TC_LOG_MMAP sets it
358     to the position in memory where xid was logged to.
359 */
360 
log_xid(my_xid xid)361 ulong TC_LOG_MMAP::log_xid(my_xid xid)
362 {
363   mysql_mutex_lock(&LOCK_tc);
364 
365   while (true)
366   {
367     /* If active page is full - just wait... */
368     while (unlikely(active && active->free == 0))
369       mysql_cond_wait(&COND_active, &LOCK_tc);
370 
371     /* no active page ? take one from the pool. */
372     if (active == NULL)
373     {
374       active= get_active_from_pool();
375 
376       /* There are no pages with free slots? Wait and retry. */
377       if (active == NULL)
378       {
379         overflow();
380         continue;
381       }
382     }
383 
384     break;
385   }
386 
387   PAGE *p= active;
388   ulong cookie= store_xid_in_empty_slot(xid, p, data);
389   bool err;
390 
391   if (syncing)
392   {                                          // somebody's syncing. let's wait
393     err= wait_sync_completion(p);
394     if (p->state != PS_DIRTY)                   // page was synced
395     {
396       if (p->waiters == 0)
397         mysql_cond_broadcast(&COND_pool);    // in case somebody's waiting
398       mysql_mutex_unlock(&LOCK_tc);
399       goto done;                             // we're done
400     }
401   }                                          // page was not synced! do it now
402   assert(active == p && syncing == NULL);
403   syncing= p;                                 // place is vacant - take it
404   active= NULL;                                  // page is not active anymore
405   mysql_cond_broadcast(&COND_active);        // in case somebody's waiting
406   mysql_mutex_unlock(&LOCK_tc);
407   err= sync();
408 
409 done:
410   return err ? 0 : cookie;
411 }
412 
413 
414 /**
415   Write the page data being synchronized to the disk.
416 
417   @return
418     @retval false   Success
419     @retval true    Failure
420 */
sync()421 bool TC_LOG_MMAP::sync()
422 {
423   assert(syncing != active);
424 
425   /*
426     sit down and relax - this can take a while...
427     note - no locks are held at this point
428   */
429 
430   int err= do_msync_and_fsync(fd, syncing->start,
431                               syncing->size*sizeof(my_xid), MS_SYNC);
432 
433   mysql_mutex_lock(&LOCK_tc);
434   /* Page is synced. Let's move it to the pool. */
435   *pool_last_ptr= syncing;
436   pool_last_ptr= &(syncing->next);
437   syncing->next= NULL;
438   syncing->state= err ? PS_ERROR : PS_POOL;
439   mysql_cond_broadcast(&COND_pool);          // in case somebody's waiting
440 
441   /* Wake-up all threads which are waiting for syncing of the same page. */
442   mysql_cond_broadcast(&syncing->cond);
443 
444   /* Mark syncing slot as free and wake-up new syncer. */
445   syncing= NULL;
446   if (active)
447     mysql_cond_signal(&active->cond);
448 
449   mysql_mutex_unlock(&LOCK_tc);
450   return err != 0;
451 }
452 
453 /**
454   erase xid from the page, update page free space counters/pointers.
455   cookie points directly to the memory where xid was logged.
456 */
457 
unlog(ulong cookie,my_xid xid)458 void TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
459 {
460   PAGE *p= pages + (cookie / tc_log_page_size);
461   my_xid *x= (my_xid *)(data + cookie);
462 
463   assert(*x == xid);
464   assert(x >= p->start && x < p->end);
465   *x= 0;
466 
467   mysql_mutex_lock(&LOCK_tc);
468   p->free++;
469   assert(p->free <= p->size);
470   set_if_smaller(p->ptr, x);
471   if (p->free == p->size)               // the page is completely empty
472     tc_log_cur_pages_used--;
473   if (p->waiters == 0)                 // the page is in pool and ready to rock
474     mysql_cond_broadcast(&COND_pool);  // ping ... for overflow()
475   mysql_mutex_unlock(&LOCK_tc);
476 }
477 
close()478 void TC_LOG_MMAP::close()
479 {
480   uint i;
481   switch (inited) {
482   case 6:
483     mysql_mutex_destroy(&LOCK_tc);
484     mysql_cond_destroy(&COND_pool);
485     // Fall through.
486   case 5:
487     data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
488     // Fall through.
489   case 4:
490     for (i=0; i < npages; i++)
491     {
492       if (pages[i].ptr == 0)
493         break;
494       mysql_cond_destroy(&pages[i].cond);
495     }
496     // Fall through.
497   case 3:
498     my_free(pages);
499     // Fall through.
500   case 2:
501     my_munmap((char*)data, (size_t)file_length);
502     // Fall through.
503   case 1:
504     mysql_file_close(fd, MYF(0));
505   }
506   if (inited>=5) // cannot do in the switch because of Windows
507     mysql_file_delete(key_file_tclog, logname, MYF(MY_WME));
508   inited=0;
509 }
510 
recover()511 int TC_LOG_MMAP::recover()
512 {
513   HASH xids;
514   PAGE *p=pages, *end_p=pages+npages;
515 
516   if (memcmp(data, tc_log_magic, sizeof(tc_log_magic)))
517   {
518     sql_print_error("Bad magic header in tc log");
519     goto err1;
520   }
521 
522   /*
523     the first byte after magic signature is set to current
524     number of storage engines on startup
525   */
526   if (data[sizeof(tc_log_magic)] != total_ha_2pc)
527   {
528     sql_print_error("Recovery failed! You must enable "
529                     "exactly %d storage engines that support "
530                     "two-phase commit protocol",
531                     data[sizeof(tc_log_magic)]);
532     goto err1;
533   }
534 
535   if (my_hash_init(&xids, &my_charset_bin, tc_log_page_size/3, 0,
536                    sizeof(my_xid), 0, 0, MYF(0),
537                    PSI_INSTRUMENT_ME))
538     goto err1;
539 
540   for ( ; p < end_p ; p++)
541   {
542     for (my_xid *x=p->start; x < p->end; x++)
543       if (*x && my_hash_insert(&xids, (uchar *)x))
544         goto err2; // OOM
545   }
546 
547   if (ha_recover(&xids))
548     goto err2;
549 
550   my_hash_free(&xids);
551   memset(data, 0, (size_t)file_length);
552   return 0;
553 
554 err2:
555   my_hash_free(&xids);
556 err1:
557   sql_print_error("Crash recovery failed. Either correct the problem "
558                   "(if it's, for example, out of memory error) and restart, "
559                   "or delete tc log and start mysqld with "
560                   "--tc-heuristic-recover={commit|rollback}");
561   return 1;
562 }
563 
564 TC_LOG *tc_log;
565 TC_LOG_DUMMY tc_log_dummy;
566 TC_LOG_MMAP  tc_log_mmap;
567 
using_heuristic_recover()568 bool TC_LOG::using_heuristic_recover()
569 {
570   if (tc_heuristic_recover == TC_HEURISTIC_NOT_USED)
571     return false;
572 
573   sql_print_information("Heuristic crash recovery mode");
574   if (ha_recover(0))
575     sql_print_error("Heuristic crash recovery failed");
576   sql_print_information("Please restart mysqld without --tc-heuristic-recover");
577   return true;
578 }
579 
580