1 /* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software Foundation,
21    51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
22 
23 
24 #include "my_global.h"
25 #include "log.h"
26 #include "binlog.h"
27 #include "log_event.h"
28 #include "rpl_filter.h"
29 #include "rpl_rli.h"
30 #include "sql_plugin.h"
31 #include "rpl_handler.h"
32 #include "rpl_info_factory.h"
33 #include "rpl_utility.h"
34 #include "debug_sync.h"
35 #include "global_threads.h"
36 #include "sql_show.h"
37 #include "sql_parse.h"
38 #include "rpl_mi.h"
39 #include <list>
40 #include <string>
41 #include <sstream>
42 #include <my_stacktrace.h>
43 
44 using std::max;
45 using std::min;
46 using std::string;
47 using std::list;
48 
/*
  Expands to the stringified name of flag F followed by a space when any
  bit of F is set in V, and to an empty string otherwise.
*/
#define FLAGSTR(V,F) ((V)&(F)?#F" ":"")

/**
  @defgroup Binary_Log Binary Log
  @{
 */

/*
  my_off_t value with all bits set. binlog_trx_cache_data uses it as the
  "no statement start position saved" marker.
*/
#define MY_OFF_T_UNDEF (~(my_off_t)0UL)

/*
  Constants required for the limit unsafe warnings suppression
 */
//seconds after which the limit unsafe warnings suppression will be activated
#define LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT 50
//number of limit unsafe warnings after which the suppression will be activated
#define LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT 50
//number of attempts Thread_excursion::try_to_attach_to() makes before it
//terminates the server
#define MAX_SESSION_ATTACH_TRIES 10

/*
  State of the limit unsafe warnings suppression. Not referenced in this
  part of the file; presumably maintained by the suppression code further
  down (TODO confirm).
*/
static ulonglong limit_unsafe_suppression_start_time= 0;
static bool unsafe_warning_suppression_is_activated= false;
static int limit_unsafe_warning_count= 0;

/* Handlerton of the binary log; assigned and filled in by binlog_init(). */
static handlerton *binlog_hton;
bool opt_binlog_order_commits= true;

const char *log_bin_index= 0;
const char *log_bin_basename= 0;

/* The global binary log object, constructed with &sync_binlog_period. */
MYSQL_BIN_LOG mysql_bin_log(&sync_binlog_period);

/*
  Forward declarations. Apart from binlog_init() itself and
  binlog_start_trans_and_stmt(), these are the callbacks that binlog_init()
  installs on binlog_hton.
*/
static int binlog_init(void *p);
static int binlog_start_trans_and_stmt(THD *thd, Log_event *start_event);
static int binlog_close_connection(handlerton *hton, THD *thd);
static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv);
static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv);
static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
                                                      THD *thd);
static int binlog_commit(handlerton *hton, THD *thd, bool all);
static int binlog_rollback(handlerton *hton, THD *thd, bool all);
static int binlog_prepare(handlerton *hton, THD *thd, bool all);
89 
90 
91 /**
92   Print system time.
93  */
94 
print_system_time()95 static void print_system_time()
96 {
97 #ifdef __WIN__
98   SYSTEMTIME utc_time;
99   GetSystemTime(&utc_time);
100   const long hrs=  utc_time.wHour;
101   const long mins= utc_time.wMinute;
102   const long secs= utc_time.wSecond;
103 #else
104   /* Using time() instead of my_time() to avoid looping */
105   const time_t curr_time= time(NULL);
106   /* Calculate time of day */
107   const long tmins = curr_time / 60;
108   const long thrs  = tmins / 60;
109   const long hrs   = thrs  % 24;
110   const long mins  = tmins % 60;
111   const long secs  = curr_time % 60;
112 #endif
113   char hrs_buf[3]= "00";
114   char mins_buf[3]= "00";
115   char secs_buf[3]= "00";
116   int base= 10;
117   my_safe_itoa(base, hrs, &hrs_buf[2]);
118   my_safe_itoa(base, mins, &mins_buf[2]);
119   my_safe_itoa(base, secs, &secs_buf[2]);
120 
121   my_safe_printf_stderr("---------- %s:%s:%s UTC - ",
122                         hrs_buf, mins_buf, secs_buf);
123 }
124 
125 
126 /**
127   Helper class to perform a thread excursion.
128 
129   This class is used to temporarily switch to another session (THD
130   structure). It will set up thread specific "globals" correctly
131   so that the POSIX thread looks exactly like the session attached to.
  However, PSI_thread info is not touched as it is required to show
  the actual physical view in PFS instrumentation, i.e., it should
  depict the real thread doing the work instead of the thread it
  switched to.
136 
137   On destruction, the original session (which is supplied to the
138   constructor) will be re-attached automatically. For example, with
139   this code, the value of @c current_thd will be the same before and
140   after execution of the code.
141 
142   @code
143   {
144     Thread_excursion excursion(current_thd);
145     for (int i = 0 ; i < count ; ++i)
146       excursion.attach_to(other_thd[i]);
147   }
148   @endcode
149 
150   @warning The class is not designed to be inherited from.
151  */
152 
153 class Thread_excursion
154 {
155 public:
Thread_excursion(THD * thd)156   Thread_excursion(THD *thd)
157     : m_original_thd(thd)
158   {
159   }
160 
~Thread_excursion()161   ~Thread_excursion() {
162 #ifndef EMBEDDED_LIBRARY
163     if (unlikely(setup_thread_globals(m_original_thd)))
164       DBUG_ASSERT(0);                           // Out of memory?!
165 #endif
166   }
167 
168   /**
169     Try to attach the POSIX thread to a session.
170     - This function attaches the POSIX thread to a session
171     in MAX_SESSION_ATTACH_TRIES tries when encountering
172     'out of memory' error, and terminates the server after
173     failed in MAX_SESSION_ATTACH_TRIES tries.
174 
175     @param[in] thd       The thd of a session
176    */
try_to_attach_to(THD * thd)177   void try_to_attach_to(THD *thd)
178   {
179     int i= 0;
180     /*
181       Attach the POSIX thread to a session in MAX_SESSION_ATTACH_TRIES
182       tries when encountering 'out of memory' error.
183     */
184     while (i < MAX_SESSION_ATTACH_TRIES)
185     {
186       /*
187         Currently attach_to(...) returns ER_OUTOFMEMORY or 0. So
188         we continue to attach the POSIX thread when encountering
189         the ER_OUTOFMEMORY error. Please take care other error
190         returned from attach_to(...) in future.
191       */
192       if (!attach_to(thd))
193       {
194         if (i > 0)
195           sql_print_warning("Server overcomes the temporary 'out of memory' "
196                             "in '%d' tries while attaching to session thread "
197                             "during the group commit phase.\n", i + 1);
198         break;
199       }
200       i++;
201     }
202     /*
203       Terminate the server after failed to attach the POSIX thread
204       to a session in MAX_SESSION_ATTACH_TRIES tries.
205     */
206     if (MAX_SESSION_ATTACH_TRIES == i)
207     {
208       print_system_time();
209       my_safe_printf_stderr("%s", "[Fatal] Out of memory while attaching to "
210                             "session thread during the group commit phase. "
211                             "Data consistency between master and slave can "
212                             "be guaranteed after server restarts.\n");
213       _exit(EXIT_FAILURE);
214     }
215   }
216 
217 private:
218 
219   /**
220     Attach the POSIX thread to a session.
221    */
attach_to(THD * thd)222   int attach_to(THD *thd)
223   {
224 #ifndef EMBEDDED_LIBRARY
225     if (DBUG_EVALUATE_IF("simulate_session_attach_error", 1, 0)
226         || unlikely(setup_thread_globals(thd)))
227     {
228       /*
229         Indirectly uses pthread_setspecific, which can only return
230         ENOMEM or EINVAL. Since store_globals are using correct keys,
231         the only alternative is out of memory.
232       */
233       return ER_OUTOFMEMORY;
234     }
235 #endif /* EMBEDDED_LIBRARY */
236     return 0;
237   }
238 
setup_thread_globals(THD * thd) const239   int setup_thread_globals(THD *thd) const {
240     int error= 0;
241     THD *original_thd= my_pthread_getspecific(THD*, THR_THD);
242     MEM_ROOT* original_mem_root= my_pthread_getspecific(MEM_ROOT*, THR_MALLOC);
243     if ((error= my_pthread_setspecific_ptr(THR_THD, thd)))
244       goto exit0;
245     if ((error= my_pthread_setspecific_ptr(THR_MALLOC, &thd->mem_root)))
246       goto exit1;
247     if ((error= set_mysys_var(thd->mysys_var)))
248       goto exit2;
249     goto exit0;
250 exit2:
251     error= my_pthread_setspecific_ptr(THR_MALLOC,  original_mem_root);
252 exit1:
253     error= my_pthread_setspecific_ptr(THR_THD,  original_thd);
254 exit0:
255     return error;
256   }
257 
258   THD *m_original_thd;
259 };
260 
261 
262 /**
263   Caches for non-transactional and transactional data before writing
264   it to the binary log.
265 
  @todo All the access functions for the flags suggest that the
  encapsulation is not done correctly, so try to move any logic that
  requires access to the flags into the cache.
269 */
class binlog_cache_data
{
public:

  /**
    Set up an empty cache.

    @param trx_cache_arg                  true for the transactional cache,
                                          false for the statement cache.
    @param max_binlog_cache_size_arg      Value stored as the end_of_file
                                          limit of cache_log.
    @param ptr_binlog_cache_use_arg       Status counter incremented when a
                                          non-empty cache is reset.
    @param ptr_binlog_cache_disk_use_arg  Status counter incremented in
                                          addition when cache_log recorded
                                          disk writes.

    NOTE(review): reset() runs before flags.transactional is assigned, and
    the truncate() it calls reads flags.flush_error before reset() clears
    it. This presumably relies on the object's storage being zero-filled
    before construction -- confirm at the allocation site.
  */
  binlog_cache_data(bool trx_cache_arg,
                    my_off_t max_binlog_cache_size_arg,
                    ulong *ptr_binlog_cache_use_arg,
                    ulong *ptr_binlog_cache_disk_use_arg)
  : m_pending(0), saved_max_binlog_cache_size(max_binlog_cache_size_arg),
    ptr_binlog_cache_use(ptr_binlog_cache_use_arg),
    ptr_binlog_cache_disk_use(ptr_binlog_cache_disk_use_arg)
  {
    reset();
    flags.transactional= trx_cache_arg;
    cache_log.end_of_file= saved_max_binlog_cache_size;
  }

  /* Defined out of line. */
  int finalize(THD *thd, Log_event *end_event);
  int flush(THD *thd, my_off_t *bytes, bool *wrote_xid);
  int write_event(THD *thd, Log_event *event);

  /**
    Close the cache file. The cache is expected to be empty by now
    (asserted in debug builds).
  */
  virtual ~binlog_cache_data()
  {
    DBUG_ASSERT(is_binlog_empty());
    close_cached_file(&cache_log);
  }

  /**
    @return true if there is no pending rows event and the position in
            cache_log is 0.
  */
  bool is_binlog_empty() const
  {
    my_off_t pos= my_b_tell(&cache_log);
    DBUG_PRINT("debug", ("%s_cache - pending: 0x%llx, bytes: %llu",
                         (flags.transactional ? "trx" : "stmt"),
                         (ulonglong) pending(), (ulonglong) pos));
    return pending() == NULL && pos == 0;
  }

  /** @return true if the group cache of this cache is empty. */
  bool is_group_cache_empty() const
  {
    return group_cache.is_empty();
  }

#ifndef DBUG_OFF
  /** Debug builds only: @return the value of the finalized flag. */
  bool dbug_is_finalized() const {
    return flags.finalized;
  }
#endif

  /** @return the pending rows event, or NULL if there is none. */
  Rows_log_event *pending() const
  {
    return m_pending;
  }

  /**
    Replace the pending rows event pointer. The previous event, if any,
    is not deleted here.
  */
  void set_pending(Rows_log_event *const pending)
  {
    m_pending= pending;
  }

  /** Raise the incident flag (see Flags::incident). */
  void set_incident(void)
  {
    flags.incident= true;
  }

  /** @return true if the incident flag is set. */
  bool has_incident(void) const
  {
    return flags.incident;
  }

  /**
    Sets the binlog_cache_data::Flags::flush_error flag if there
    is an error while flushing cache to the file.

    For the transactional cache, any error already set on the session is
    additionally replaced by ER_ERROR_ON_WRITE, built from the cache file
    name and the current errno.

    @param thd  The client thread that is executing the transaction.
  */
  void set_flush_error(THD *thd)
  {
    flags.flush_error= true;
    if(is_trx_cache())
    {
      /*
         If the cache is a transactional cache and if the write
         has failed due to ENOSPC, then my_write() would have
         set EE_WRITE error, so clear the error and create an
         equivalent server error.
      */
      if (thd->is_error())
        thd->clear_error();
      char errbuf[MYSYS_STRERROR_SIZE];
      my_error(ER_ERROR_ON_WRITE, MYF(MY_WME), my_filename(cache_log.file),
          errno, my_strerror(errbuf, sizeof(errbuf), errno));
    }
  }

  /** @return true if the flush_error flag is set. */
  bool get_flush_error(void) const
  {
    return flags.flush_error;
  }

  /** @return true if the with_xid flag is set. */
  bool has_xid() const {
    // There should only be an XID event if we are transactional
    DBUG_ASSERT((flags.transactional && flags.with_xid) || !flags.with_xid);
    return flags.with_xid;
  }

  /** @return true for the transactional cache, false for the statement cache. */
  bool is_trx_cache() const
  {
    return flags.transactional;
  }

  /** @return the current position in cache_log, as reported by my_b_tell(). */
  my_off_t get_byte_position() const
  {
    return my_b_tell(&cache_log);
  }

  /**
    Bring the cache back to its empty state: update the usage statistics,
    truncate to position 0 (which also deletes the pending event), shrink
    the backing file if there is one, clear every flag except
    'transactional', zero the disk_writes counter and clear the group cache.
  */
  virtual void reset()
  {
    /*
      Must come first: compute_statistics() only counts caches that are
      still non-empty, so it has to see the content before it is discarded.
    */
    compute_statistics();
    truncate(0);

    /*
      If IOCACHE has a file associated, change its size to 0.
      It is safer to do it here, since we are certain that one
      asked the cache to go to position 0 with truncate.
    */
    if(cache_log.file != -1)
    {
      int error= 0;
      /* A failed resize is only reported; the reset carries on. */
      if((error= my_chsize(cache_log.file, 0, 0, MYF(MY_WME))))
        sql_print_warning("Unable to resize binlog IOCACHE auxilary file");

      DBUG_EXECUTE_IF("show_io_cache_size",
                      {
                        ulong file_size= my_seek(cache_log.file,
                                               0L,MY_SEEK_END,MYF(MY_WME+MY_FAE));
                        sql_print_error("New size:%ld", file_size);
                      });
    }

    /* flags.transactional is deliberately left untouched. */
    flags.incident= false;
    flags.with_xid= false;
    flags.immediate= false;
    flags.finalized= false;
    flags.flush_error= false;
    /*
      The truncate function calls reinit_io_cache that calls my_b_flush_io_cache
      which may increase disk_writes. This breaks the disk_writes use by the
      binary log which aims to compute the ratio between in-memory cache usage
      and disk cache usage. To avoid this undesirable behavior, we reset the
      variable after truncating the cache.
    */
    cache_log.disk_writes= 0;
    group_cache.clear();
    DBUG_ASSERT(is_binlog_empty());
  }

  /*
    Sets the write position to point at the position given. If the
    cache has swapped to a file, it reinitializes it, so that the
    proper data is added to the IO_CACHE buffer. Otherwise, it just
    does a my_b_seek.

    my_b_seek will not work if the cache has swapped, that's why
    we do this workaround.

    @param[IN]  pos the new write position.
    @param[IN]  use_reinit if the position should be reset resorting
                to reset_io_cache (which may issue a flush_io_cache
                inside)

    @return The previous write position.
   */
  my_off_t reset_write_pos(my_off_t pos, bool use_reinit)
  {
    DBUG_ENTER("reset_write_pos");
    DBUG_ASSERT(cache_log.type == WRITE_CACHE);

    /* Sampled before repositioning; this is the value handed back. */
    my_off_t oldpos= get_byte_position();

    if (use_reinit)
      reinit_io_cache(&cache_log, WRITE_CACHE, pos, 0, 0);
    else
      my_b_seek(&cache_log, pos);

    DBUG_RETURN(oldpos);
  }

  /*
    Cache to store data before copying it to the binary log.
  */
  IO_CACHE cache_log;

  /**
    The group cache for this cache.
  */
  Group_cache group_cache;

protected:
  /*
    It truncates the cache to a certain position. This includes deleting the
    pending event.
   */
  void truncate(my_off_t pos)
  {
    DBUG_PRINT("info", ("truncating to position %lu", (ulong) pos));
    remove_pending_event();
    /*
      Whenever there is an error while flushing cache to file,
      the local cache will not be in a normal state and the same
      cache cannot be used without facing an assert.
      So, clear the cache if there is a flush error.
    */
    reinit_io_cache(&cache_log, WRITE_CACHE, pos, 0, get_flush_error());
    /* Re-apply the configured size limit to the cache. */
    cache_log.end_of_file= saved_max_binlog_cache_size;
  }

  /**
     Flush pending event to the cache buffer.

     The pending event is flagged with STMT_END_F and written with
     write_event(); on success the session's binlog table maps are cleared.
     The m_pending pointer itself is left as it is.

     @return 0 on success or when nothing is pending, otherwise the error
             returned by write_event().
   */
  int flush_pending_event(THD *thd) {
    if (m_pending)
    {
      m_pending->set_flags(Rows_log_event::STMT_END_F);
      if (int error= write_event(thd, m_pending))
        return error;
      thd->clear_binlog_table_maps();
    }
    return 0;
  }

  /**
    Remove the pending event.

    Deletes the event and sets the pointer to NULL.

    @return Always 0.
   */
  int remove_pending_event() {
    delete m_pending;
    m_pending= NULL;
    return 0;
  }
  struct Flags {
    /*
      Defines if this is either a trx-cache or stmt-cache, respectively, a
      transactional or non-transactional cache.
    */
    bool transactional:1;

    /*
      This indicates that some events did not get into the cache and most likely
      it is corrupted.
    */
    bool incident:1;

    /*
      This indicates that the cache should be written without BEGIN/END.
    */
    bool immediate:1;

    /*
      This flag indicates that the buffer was finalized and has to be
      flushed to disk.
     */
    bool finalized:1;

    /*
      This indicates that the cache contain an XID event.
     */
    bool with_xid:1;

    /*
      This flag is set to 'true' when there is an error while flushing the
      I/O cache to file.
     */
    bool flush_error:1;
  } flags;

private:
  /*
    Pending binrows event. This event is the event where the rows are currently
    written.
   */
  Rows_log_event *m_pending;

  /**
    This function computes binlog cache and disk usage.

    For a non-empty cache the "cache use" counter is incremented, and the
    "cache disk use" counter as well if cache_log recorded any disk write.
  */
  void compute_statistics()
  {
    if (!is_binlog_empty())
    {
      statistic_increment(*ptr_binlog_cache_use, &LOCK_status);
      if (cache_log.disk_writes != 0)
        statistic_increment(*ptr_binlog_cache_disk_use, &LOCK_status);
    }
  }

  /*
    Stores the values of maximum size of the cache allowed when this cache
    is configured. This corresponds to either
      . max_binlog_cache_size or max_binlog_stmt_cache_size.
  */
  my_off_t saved_max_binlog_cache_size;

  /*
    Stores a pointer to the status variable that keeps track of the in-memory
    cache usage. This corresponds to either
      . binlog_cache_use or binlog_stmt_cache_use.
  */
  ulong *ptr_binlog_cache_use;

  /*
    Stores a pointer to the status variable that keeps track of the disk
    cache usage. This corresponds to either
      . binlog_cache_disk_use or binlog_stmt_cache_disk_use.
  */
  ulong *ptr_binlog_cache_disk_use;

  /* Copying is disabled: declared private and never defined in this class. */
  binlog_cache_data& operator=(const binlog_cache_data& info);
  binlog_cache_data(const binlog_cache_data& info);
};
586 
587 
588 class binlog_stmt_cache_data
589   : public binlog_cache_data
590 {
591 public:
binlog_stmt_cache_data(bool trx_cache_arg,my_off_t max_binlog_cache_size_arg,ulong * ptr_binlog_cache_use_arg,ulong * ptr_binlog_cache_disk_use_arg)592   binlog_stmt_cache_data(bool trx_cache_arg,
593                         my_off_t max_binlog_cache_size_arg,
594                         ulong *ptr_binlog_cache_use_arg,
595                         ulong *ptr_binlog_cache_disk_use_arg)
596     : binlog_cache_data(trx_cache_arg,
597                         max_binlog_cache_size_arg,
598                         ptr_binlog_cache_use_arg,
599                         ptr_binlog_cache_disk_use_arg)
600   {
601   }
602 
603   using binlog_cache_data::finalize;
604 
605   int finalize(THD *thd);
606 };
607 
608 
609 int
finalize(THD * thd)610 binlog_stmt_cache_data::finalize(THD *thd)
611 {
612   if (flags.immediate)
613   {
614     if (int error= finalize(thd, NULL))
615       return error;
616   }
617   else
618   {
619     Query_log_event
620       end_evt(thd, STRING_WITH_LEN("COMMIT"), false, false, true, 0, true);
621     if (int error= finalize(thd, &end_evt))
622       return error;
623   }
624   return 0;
625 }
626 
627 
628 class binlog_trx_cache_data : public binlog_cache_data
629 {
630 public:
binlog_trx_cache_data(bool trx_cache_arg,my_off_t max_binlog_cache_size_arg,ulong * ptr_binlog_cache_use_arg,ulong * ptr_binlog_cache_disk_use_arg)631   binlog_trx_cache_data(bool trx_cache_arg,
632                         my_off_t max_binlog_cache_size_arg,
633                         ulong *ptr_binlog_cache_use_arg,
634                         ulong *ptr_binlog_cache_disk_use_arg)
635   : binlog_cache_data(trx_cache_arg,
636                       max_binlog_cache_size_arg,
637                       ptr_binlog_cache_use_arg,
638                       ptr_binlog_cache_disk_use_arg),
639     m_cannot_rollback(FALSE), before_stmt_pos(MY_OFF_T_UNDEF)
640   {   }
641 
reset()642   void reset()
643   {
644     DBUG_ENTER("reset");
645     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
646     m_cannot_rollback= FALSE;
647     before_stmt_pos= MY_OFF_T_UNDEF;
648     binlog_cache_data::reset();
649     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
650     DBUG_VOID_RETURN;
651   }
652 
cannot_rollback() const653   bool cannot_rollback() const
654   {
655     return m_cannot_rollback;
656   }
657 
set_cannot_rollback()658   void set_cannot_rollback()
659   {
660     m_cannot_rollback= TRUE;
661   }
662 
get_prev_position() const663   my_off_t get_prev_position() const
664   {
665      return before_stmt_pos;
666   }
667 
set_prev_position(my_off_t pos)668   void set_prev_position(my_off_t pos)
669   {
670     DBUG_ENTER("set_prev_position");
671     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
672     before_stmt_pos= pos;
673     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
674     DBUG_VOID_RETURN;
675   }
676 
restore_prev_position()677   void restore_prev_position()
678   {
679     DBUG_ENTER("restore_prev_position");
680     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
681     binlog_cache_data::truncate(before_stmt_pos);
682     before_stmt_pos= MY_OFF_T_UNDEF;
683     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
684     DBUG_VOID_RETURN;
685   }
686 
restore_savepoint(my_off_t pos)687   void restore_savepoint(my_off_t pos)
688   {
689     DBUG_ENTER("restore_savepoint");
690     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
691     binlog_cache_data::truncate(pos);
692     if (pos <= before_stmt_pos)
693       before_stmt_pos= MY_OFF_T_UNDEF;
694     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong) before_stmt_pos));
695     DBUG_VOID_RETURN;
696   }
697 
698   using binlog_cache_data::truncate;
699 
700   int truncate(THD *thd, bool all);
701 
702 private:
703   /*
704     It will be set TRUE if any statement which cannot be rolled back safely
705     is put in trx_cache.
706   */
707   bool m_cannot_rollback;
708 
709   /*
710     Binlog position before the start of the current statement.
711   */
712   my_off_t before_stmt_pos;
713 
714   binlog_trx_cache_data& operator=(const binlog_trx_cache_data& info);
715   binlog_trx_cache_data(const binlog_trx_cache_data& info);
716 };
717 
718 class binlog_cache_mngr {
719 public:
binlog_cache_mngr(my_off_t max_binlog_stmt_cache_size_arg,ulong * ptr_binlog_stmt_cache_use_arg,ulong * ptr_binlog_stmt_cache_disk_use_arg,my_off_t max_binlog_cache_size_arg,ulong * ptr_binlog_cache_use_arg,ulong * ptr_binlog_cache_disk_use_arg)720   binlog_cache_mngr(my_off_t max_binlog_stmt_cache_size_arg,
721                     ulong *ptr_binlog_stmt_cache_use_arg,
722                     ulong *ptr_binlog_stmt_cache_disk_use_arg,
723                     my_off_t max_binlog_cache_size_arg,
724                     ulong *ptr_binlog_cache_use_arg,
725                     ulong *ptr_binlog_cache_disk_use_arg)
726   : stmt_cache(FALSE, max_binlog_stmt_cache_size_arg,
727                ptr_binlog_stmt_cache_use_arg,
728                ptr_binlog_stmt_cache_disk_use_arg),
729     trx_cache(TRUE, max_binlog_cache_size_arg,
730               ptr_binlog_cache_use_arg,
731               ptr_binlog_cache_disk_use_arg)
732   {  }
733 
get_binlog_cache_data(bool is_transactional)734   binlog_cache_data* get_binlog_cache_data(bool is_transactional)
735   {
736     if (is_transactional)
737       return &trx_cache;
738     else
739       return &stmt_cache;
740   }
741 
get_binlog_cache_log(bool is_transactional)742   IO_CACHE* get_binlog_cache_log(bool is_transactional)
743   {
744     return (is_transactional ? &trx_cache.cache_log : &stmt_cache.cache_log);
745   }
746 
747   /**
748     Convenience method to check if both caches are empty.
749    */
is_binlog_empty() const750   bool is_binlog_empty() const {
751     return stmt_cache.is_binlog_empty() && trx_cache.is_binlog_empty();
752   }
753 
754   /*
755     clear stmt_cache and trx_cache if they are not empty
756   */
reset()757   void reset()
758   {
759     if (!stmt_cache.is_binlog_empty())
760       stmt_cache.reset();
761     if (!trx_cache.is_binlog_empty())
762       trx_cache.reset();
763   }
764 
765 #ifndef DBUG_OFF
dbug_any_finalized() const766   bool dbug_any_finalized() const {
767     return stmt_cache.dbug_is_finalized() || trx_cache.dbug_is_finalized();
768   }
769 #endif
770 
771   /*
772     Convenience method to flush both caches to the binary log.
773 
774     @param bytes_written Pointer to variable that will be set to the
775                          number of bytes written for the flush.
776     @param wrote_xid     Pointer to variable that will be set to @c
777                          true if any XID event was written to the
778                          binary log. Otherwise, the variable will not
779                          be touched.
780     @return Error code on error, zero if no error.
781    */
flush(THD * thd,my_off_t * bytes_written,bool * wrote_xid)782   int flush(THD *thd, my_off_t *bytes_written, bool *wrote_xid)
783   {
784     my_off_t stmt_bytes= 0;
785     my_off_t trx_bytes= 0;
786     DBUG_ASSERT(stmt_cache.has_xid() == 0);
787     if (int error= stmt_cache.flush(thd, &stmt_bytes, wrote_xid))
788       return error;
789     if (int error= trx_cache.flush(thd, &trx_bytes, wrote_xid))
790       return error;
791     *bytes_written= stmt_bytes + trx_bytes;
792     return 0;
793   }
794 
795   binlog_stmt_cache_data stmt_cache;
796   binlog_trx_cache_data trx_cache;
797 
798 private:
799 
800   binlog_cache_mngr& operator=(const binlog_cache_mngr& info);
801   binlog_cache_mngr(const binlog_cache_mngr& info);
802 };
803 
804 
thd_get_cache_mngr(const THD * thd)805 static binlog_cache_mngr *thd_get_cache_mngr(const THD *thd)
806 {
807   /*
808     If opt_bin_log is not set, binlog_hton->slot == -1 and hence
809     thd_get_ha_data(thd, hton) segfaults.
810   */
811   DBUG_ASSERT(opt_bin_log);
812   return (binlog_cache_mngr *)thd_get_ha_data(thd, binlog_hton);
813 }
814 
815 
816 /**
817   Checks if the BINLOG_CACHE_SIZE's value is greater than MAX_BINLOG_CACHE_SIZE.
818   If this happens, the BINLOG_CACHE_SIZE is set to MAX_BINLOG_CACHE_SIZE.
819 */
check_binlog_cache_size(THD * thd)820 void check_binlog_cache_size(THD *thd)
821 {
822   if (binlog_cache_size > max_binlog_cache_size)
823   {
824     if (thd)
825     {
826       push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
827                           ER_BINLOG_CACHE_SIZE_GREATER_THAN_MAX,
828                           ER(ER_BINLOG_CACHE_SIZE_GREATER_THAN_MAX),
829                           (ulong) binlog_cache_size,
830                           (ulong) max_binlog_cache_size);
831     }
832     else
833     {
834       sql_print_warning(ER_DEFAULT(ER_BINLOG_CACHE_SIZE_GREATER_THAN_MAX),
835                         (ulong) binlog_cache_size,
836                         (ulong) max_binlog_cache_size);
837     }
838     binlog_cache_size= max_binlog_cache_size;
839   }
840 }
841 
842 /**
843   Checks if the BINLOG_STMT_CACHE_SIZE's value is greater than MAX_BINLOG_STMT_CACHE_SIZE.
844   If this happens, the BINLOG_STMT_CACHE_SIZE is set to MAX_BINLOG_STMT_CACHE_SIZE.
845 */
check_binlog_stmt_cache_size(THD * thd)846 void check_binlog_stmt_cache_size(THD *thd)
847 {
848   if (binlog_stmt_cache_size > max_binlog_stmt_cache_size)
849   {
850     if (thd)
851     {
852       push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
853                           ER_BINLOG_STMT_CACHE_SIZE_GREATER_THAN_MAX,
854                           ER(ER_BINLOG_STMT_CACHE_SIZE_GREATER_THAN_MAX),
855                           (ulong) binlog_stmt_cache_size,
856                           (ulong) max_binlog_stmt_cache_size);
857     }
858     else
859     {
860       sql_print_warning(ER_DEFAULT(ER_BINLOG_STMT_CACHE_SIZE_GREATER_THAN_MAX),
861                         (ulong) binlog_stmt_cache_size,
862                         (ulong) max_binlog_stmt_cache_size);
863     }
864     binlog_stmt_cache_size= max_binlog_stmt_cache_size;
865   }
866 }
867 
/**
  Check whether binlog_hton has been set and has a valid slot.
*/
binlog_enabled()871 bool binlog_enabled()
872 {
873 	return(binlog_hton && binlog_hton->slot != HA_SLOT_UNDEF);
874 }
875 
876  /*
877   Save position of binary log transaction cache.
878 
  SYNOPSIS
880     binlog_trans_log_savepos()
881 
882     thd      The thread to take the binlog data from
883     pos      Pointer to variable where the position will be stored
884 
885   DESCRIPTION
886 
887     Save the current position in the binary log transaction cache into
888     the variable pointed to by 'pos'
889  */
890 
891 static void
binlog_trans_log_savepos(THD * thd,my_off_t * pos)892 binlog_trans_log_savepos(THD *thd, my_off_t *pos)
893 {
894   DBUG_ENTER("binlog_trans_log_savepos");
895   DBUG_ASSERT(pos != NULL);
896   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(thd);
897   DBUG_ASSERT(mysql_bin_log.is_open());
898   *pos= cache_mngr->trx_cache.get_byte_position();
899   DBUG_PRINT("return", ("position: %lu", (ulong) *pos));
900   DBUG_VOID_RETURN;
901 }
902 
903 
904 /*
905   this function is mostly a placeholder.
906   conceptually, binlog initialization (now mostly done in MYSQL_BIN_LOG::open)
907   should be moved here.
908 */
909 
binlog_init(void * p)910 static int binlog_init(void *p)
911 {
912   binlog_hton= (handlerton *)p;
913   binlog_hton->state=opt_bin_log ? SHOW_OPTION_YES : SHOW_OPTION_NO;
914   binlog_hton->db_type=DB_TYPE_BINLOG;
915   binlog_hton->savepoint_offset= sizeof(my_off_t);
916   binlog_hton->close_connection= binlog_close_connection;
917   binlog_hton->savepoint_set= binlog_savepoint_set;
918   binlog_hton->savepoint_rollback= binlog_savepoint_rollback;
919   binlog_hton->savepoint_rollback_can_release_mdl=
920                                      binlog_savepoint_rollback_can_release_mdl;
921   binlog_hton->commit= binlog_commit;
922   binlog_hton->rollback= binlog_rollback;
923   binlog_hton->prepare= binlog_prepare;
924   binlog_hton->flags= HTON_NOT_USER_SELECTABLE | HTON_HIDDEN;
925   return 0;
926 }
927 
binlog_close_connection(handlerton * hton,THD * thd)928 static int binlog_close_connection(handlerton *hton, THD *thd)
929 {
930   DBUG_ENTER("binlog_close_connection");
931   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(thd);
932   DBUG_ASSERT(cache_mngr->is_binlog_empty());
933   DBUG_ASSERT(cache_mngr->trx_cache.is_group_cache_empty() &&
934               cache_mngr->stmt_cache.is_group_cache_empty());
935   DBUG_PRINT("debug", ("Set ha_data slot %d to 0x%llx", binlog_hton->slot, (ulonglong) NULL));
936   thd_set_ha_data(thd, binlog_hton, NULL);
937   cache_mngr->~binlog_cache_mngr();
938   my_free(cache_mngr);
939   DBUG_RETURN(0);
940 }
941 
write_event(THD * thd,Log_event * ev)942 int binlog_cache_data::write_event(THD *thd, Log_event *ev)
943 {
944   DBUG_ENTER("binlog_cache_data::write_event");
945 
946   if (gtid_mode > 0)
947   {
948     Group_cache::enum_add_group_status status=
949       group_cache.add_logged_group(thd, get_byte_position());
950     if (status == Group_cache::ERROR)
951       DBUG_RETURN(1);
952     else if (status == Group_cache::APPEND_NEW_GROUP)
953     {
954       Gtid_log_event gtid_ev(thd, is_trx_cache());
955       if (gtid_ev.write(&cache_log) != 0)
956         DBUG_RETURN(1);
957     }
958   }
959 
960   if (ev != NULL)
961   {
962     DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
963                   {DBUG_SET("+d,simulate_file_write_error");});
964 
965     DBUG_EXECUTE_IF("simulate_tmpdir_partition_full",
966                   {
967                   static int count= -1;
968                   count++;
969                   if(count % 4 == 3 && ev->get_type_code() == WRITE_ROWS_EVENT)
970                     DBUG_SET("+d,simulate_temp_file_write_error");
971                   });
972     if (ev->write(&cache_log) != 0)
973     {
974       DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
975                       {
976                         DBUG_SET("-d,simulate_file_write_error");
977                         DBUG_SET("-d,simulate_disk_full_at_flush_pending");
978                         /*
979                            after +d,simulate_file_write_error the local cache
980                            is in unsane state. Since -d,simulate_file_write_error
981                            revokes the first simulation do_write_cache()
982                            can't be run without facing an assert.
983                            So it's blocked with the following 2nd simulation:
984                         */
985                         DBUG_SET("+d,simulate_do_write_cache_failure");
986                       });
987 
988       DBUG_EXECUTE_IF("simulate_temp_file_write_error",
989                       {
990                         DBUG_SET("-d,simulate_temp_file_write_error");
991                       });
992       /*
993         If the flush has failed due to ENOSPC error, set the
994         flush_error flag.
995       */
996       if (thd->is_error() && my_errno == ENOSPC)
997       {
998         set_flush_error(thd);
999       }
1000       DBUG_RETURN(1);
1001     }
1002     if (ev->get_type_code() == XID_EVENT)
1003       flags.with_xid= true;
1004     if (ev->is_using_immediate_logging())
1005       flags.immediate= true;
1006   }
1007   DBUG_RETURN(0);
1008 }
1009 
1010 
1011 /**
1012   Checks if the given GTID exists in the Group_cache. If not, add it
1013   as an empty group.
1014 
1015   @todo Move this function into the cache class?
1016 
1017   @param thd THD object that owns the Group_cache
1018   @param cache_data binlog_cache_data object for the cache
1019   @param gtid GTID to check
1020 */
write_one_empty_group_to_cache(THD * thd,binlog_cache_data * cache_data,Gtid gtid)1021 static int write_one_empty_group_to_cache(THD *thd,
1022                                           binlog_cache_data *cache_data,
1023                                           Gtid gtid)
1024 {
1025   DBUG_ENTER("write_one_empty_group_to_cache");
1026   Group_cache *group_cache= &cache_data->group_cache;
1027   if (group_cache->contains_gtid(gtid))
1028     DBUG_RETURN(0);
1029   /*
1030     Apparently this code is not being called. We need to
1031     investigate if this is a bug or this code is not
1032     necessary. /Alfranio
1033 
1034     Empty groups are currently being handled in the function
1035     gtid_empty_group_log_and_cleanup().
1036   */
1037   DBUG_ASSERT(0); /*NOTREACHED*/
1038 #ifdef NON_ERROR_GTID
1039   IO_CACHE *cache= &cache_data->cache_log;
1040   Group_cache::enum_add_group_status status= group_cache->add_empty_group(gtid);
1041   if (status == Group_cache::ERROR)
1042     DBUG_RETURN(1);
1043   DBUG_ASSERT(status == Group_cache::APPEND_NEW_GROUP);
1044   Gtid_specification spec= { GTID_GROUP, gtid };
1045   Gtid_log_event gtid_ev(thd, cache_data->is_trx_cache(), &spec);
1046   if (gtid_ev.write(cache) != 0)
1047     DBUG_RETURN(1);
1048 #endif
1049   DBUG_RETURN(0);
1050 }
1051 
1052 /**
1053   Writes all GTIDs that the thread owns to the stmt/trx cache, if the
1054   GTID is not already in the cache.
1055 
1056   @todo Move this function into the cache class?
1057 
1058   @param thd THD object for the thread that owns the cache.
1059   @param cache_data The cache.
1060 */
write_empty_groups_to_cache(THD * thd,binlog_cache_data * cache_data)1061 static int write_empty_groups_to_cache(THD *thd, binlog_cache_data *cache_data)
1062 {
1063   DBUG_ENTER("write_empty_groups_to_cache");
1064   if (thd->owned_gtid.sidno == -1)
1065   {
1066 #ifdef HAVE_GTID_NEXT_LIST
1067     Gtid_set::Gtid_iterator git(&thd->owned_gtid_set);
1068     Gtid gtid= git.get();
1069     while (gtid.sidno != 0)
1070     {
1071       if (write_one_empty_group_to_cache(thd, cache_data, gtid) != 0)
1072         DBUG_RETURN(1);
1073       git.next();
1074       gtid= git.get();
1075     }
1076 #else
1077     DBUG_ASSERT(0);
1078 #endif
1079   }
1080   else if (thd->owned_gtid.sidno > 0)
1081     if (write_one_empty_group_to_cache(thd, cache_data, thd->owned_gtid) != 0)
1082       DBUG_RETURN(1);
1083   DBUG_RETURN(0);
1084 }
1085 
1086 
1087 /**
1088 
1089   @todo Move this function into the cache class?
1090  */
1091 static int
gtid_before_write_cache(THD * thd,binlog_cache_data * cache_data)1092 gtid_before_write_cache(THD* thd, binlog_cache_data* cache_data)
1093 {
1094   DBUG_ENTER("gtid_before_write_cache");
1095   int error= 0;
1096 
1097   DBUG_ASSERT(thd->variables.gtid_next.type != UNDEFINED_GROUP);
1098 
1099   if (gtid_mode == 0)
1100     DBUG_RETURN(0);
1101 
1102   Group_cache* group_cache= &cache_data->group_cache;
1103 
1104   global_sid_lock->rdlock();
1105 
1106   if (thd->variables.gtid_next.type == AUTOMATIC_GROUP)
1107   {
1108     if (group_cache->generate_automatic_gno(thd) !=
1109         RETURN_STATUS_OK)
1110     {
1111       global_sid_lock->unlock();
1112       DBUG_RETURN(1);
1113     }
1114   }
1115   if (write_empty_groups_to_cache(thd, cache_data) != 0)
1116   {
1117     global_sid_lock->unlock();
1118     DBUG_RETURN(1);
1119   }
1120 
1121   global_sid_lock->unlock();
1122 
1123   /*
1124     If an automatic group number was generated, change the first event
1125     into a "real" one.
1126   */
1127   if (thd->variables.gtid_next.type == AUTOMATIC_GROUP)
1128   {
1129     DBUG_ASSERT(group_cache->get_n_groups() == 1);
1130     Cached_group *cached_group= group_cache->get_unsafe_pointer(0);
1131     DBUG_ASSERT(cached_group->spec.type != AUTOMATIC_GROUP);
1132     Gtid_log_event gtid_ev(thd, cache_data->is_trx_cache(),
1133                            &cached_group->spec);
1134     bool using_file= cache_data->cache_log.pos_in_file > 0;
1135 
1136     DBUG_EXECUTE_IF("simulate_tmpdir_partition_full",
1137                   {
1138                   DBUG_SET("+d,simulate_temp_file_write_error");
1139                   });
1140 
1141     my_off_t saved_position= cache_data->reset_write_pos(0, using_file);
1142 
1143     if (!cache_data->cache_log.error)
1144     {
1145       if (gtid_ev.write(&cache_data->cache_log))
1146         goto err;
1147       cache_data->reset_write_pos(saved_position, using_file);
1148     }
1149 
1150     if (cache_data->cache_log.error)
1151       goto err;
1152   }
1153 
1154   DBUG_RETURN(error);
1155 
1156 err:
1157   DBUG_EXECUTE_IF("simulate_tmpdir_partition_full",
1158                 {
1159                 DBUG_SET("-d,simulate_temp_file_write_error");
1160                 });
1161   /*
1162     If the reinit_io_cache has failed, set the flush_error flag.
1163   */
1164   if (cache_data->cache_log.error)
1165   {
1166     cache_data->set_flush_error(thd);
1167   }
1168   DBUG_RETURN(1);
1169 
1170 }
1171 
1172 /**
1173    The function logs an empty group with GTID and performs cleanup.
1174    Its logic wrt GTID is equivalent to one of binlog_commit().
1175    It's called at the end of statement execution in case binlog_commit()
1176    was skipped.
1177    Such cases are due ineffective binlogging incl an empty group
1178    re-execution.
1179 
1180    @param thd   The thread handle
1181 
1182    @return
1183     nonzero if an error pops up.
1184 */
gtid_empty_group_log_and_cleanup(THD * thd)1185 int gtid_empty_group_log_and_cleanup(THD *thd)
1186 {
1187   int ret= 1;
1188   binlog_cache_data* cache_data= NULL;
1189 
1190   DBUG_ENTER("gtid_empty_group_log_and_cleanup");
1191 
1192   Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE,
1193                           FALSE, TRUE, 0, TRUE);
1194   DBUG_ASSERT(!qinfo.is_using_immediate_logging());
1195 
1196   /*
1197     thd->cache_mngr is uninitialized on the first empty transaction.
1198   */
1199   if (thd->binlog_setup_trx_data())
1200     DBUG_RETURN(1);
1201   cache_data= &thd_get_cache_mngr(thd)->trx_cache;
1202   DBUG_PRINT("debug", ("Writing to trx_cache"));
1203   if (cache_data->write_event(thd, &qinfo) ||
1204       gtid_before_write_cache(thd, cache_data))
1205     goto err;
1206 
1207   ret= mysql_bin_log.commit(thd, true);
1208 
1209 err:
1210   DBUG_RETURN(ret);
1211 }
1212 
1213 /**
1214   This function finalizes the cache preparing for commit or rollback.
1215 
1216   The function just writes all the necessary events to the cache but
1217   does not flush the data to the binary log file. That is the role of
1218   the binlog_cache_data::flush function.
1219 
1220   @see binlog_cache_data::flush
1221 
1222   @param thd                The thread whose transaction should be flushed
1223   @param cache_data         Pointer to the cache
1224   @param end_ev             The end event either commit/rollback
1225 
1226   @return
1227     nonzero if an error pops up when flushing the cache.
1228 */
1229 int
finalize(THD * thd,Log_event * end_event)1230 binlog_cache_data::finalize(THD *thd, Log_event *end_event)
1231 {
1232   DBUG_ENTER("binlog_cache_data::finalize");
1233   if (!is_binlog_empty())
1234   {
1235     DBUG_ASSERT(!flags.finalized);
1236     if (int error= flush_pending_event(thd))
1237       DBUG_RETURN(error);
1238     if (int error= write_event(thd, end_event))
1239       DBUG_RETURN(error);
1240     flags.finalized= true;
1241     DBUG_PRINT("debug", ("flags.finalized: %s", YESNO(flags.finalized)));
1242   }
1243   DBUG_RETURN(0);
1244 }
1245 
1246 /**
1247   Flush caches to the binary log.
1248 
1249   If the cache is finalized, the cache will be flushed to the binary
1250   log file. If the cache is not finalized, nothing will be done.
1251 
1252   If flushing fails for any reason, an error will be reported and the
1253   cache will be reset. Flushing can fail in two circumstances:
1254 
1255   - It was not possible to write the cache to the file. In this case,
1256     it does not make sense to keep the cache.
1257 
1258   - The cache was successfully written to disk but post-flush actions
1259     (such as binary log rotation) failed. In this case, the cache is
1260     already written to disk and there is no reason to keep it.
1261 
1262   @see binlog_cache_data::finalize
1263  */
1264 int
flush(THD * thd,my_off_t * bytes_written,bool * wrote_xid)1265 binlog_cache_data::flush(THD *thd, my_off_t *bytes_written, bool *wrote_xid)
1266 {
1267   /*
1268     Doing a commit or a rollback including non-transactional tables,
1269     i.e., ending a transaction where we might write the transaction
1270     cache to the binary log.
1271 
1272     We can always end the statement when ending a transaction since
1273     transactions are not allowed inside stored functions. If they
1274     were, we would have to ensure that we're not ending a statement
1275     inside a stored function.
1276   */
1277   DBUG_ENTER("binlog_cache_data::flush");
1278   DBUG_PRINT("debug", ("flags.finalized: %s", YESNO(flags.finalized)));
1279   int error= 0;
1280   if (flags.finalized)
1281   {
1282     my_off_t bytes_in_cache= my_b_tell(&cache_log);
1283     DBUG_PRINT("debug", ("bytes_in_cache: %llu", bytes_in_cache));
1284     /*
1285       The cache is always reset since subsequent rollbacks of the
1286       transactions might trigger attempts to write to the binary log
1287       if the cache is not reset.
1288      */
1289     if (!(error= gtid_before_write_cache(thd, this)))
1290       error= mysql_bin_log.write_cache(thd, this);
1291     else
1292       thd->commit_error= THD::CE_FLUSH_ERROR;
1293 
1294     if (flags.with_xid && error == 0)
1295       *wrote_xid= true;
1296 
1297     /*
1298       Reset have to be after the if above, since it clears the
1299       with_xid flag
1300     */
1301     reset();
1302     if (bytes_written)
1303       *bytes_written= bytes_in_cache;
1304   }
1305   DBUG_ASSERT(!flags.finalized);
1306   DBUG_RETURN(error);
1307 }
1308 
1309 /**
1310   This function truncates the transactional cache upon committing or rolling
1311   back either a transaction or a statement.
1312 
1313   @param thd        The thread whose transaction should be flushed
1314   @param cache_mngr Pointer to the cache data to be flushed
1315   @param all        @c true means truncate the transaction, otherwise the
1316                     statement must be truncated.
1317 
1318   @return
1319     nonzero if an error pops up when truncating the transactional cache.
1320 */
1321 int
truncate(THD * thd,bool all)1322 binlog_trx_cache_data::truncate(THD *thd, bool all)
1323 {
1324   DBUG_ENTER("binlog_trx_cache_data::truncate");
1325   int error=0;
1326 
1327   DBUG_PRINT("info", ("thd->options={ %s %s}, transaction: %s",
1328                       FLAGSTR(thd->variables.option_bits, OPTION_NOT_AUTOCOMMIT),
1329                       FLAGSTR(thd->variables.option_bits, OPTION_BEGIN),
1330                       all ? "all" : "stmt"));
1331 
1332   remove_pending_event();
1333 
1334   /*
1335     If rolling back an entire transaction or a single statement not
1336     inside a transaction, we reset the transaction cache.
1337   */
1338   if (ending_trans(thd, all))
1339   {
1340     if (has_incident())
1341       error= mysql_bin_log.write_incident(thd, true/*need_lock_log=true*/);
1342     reset();
1343   }
1344   /*
1345     If rolling back a statement in a transaction, we truncate the
1346     transaction cache to remove the statement.
1347   */
1348   else if (get_prev_position() != MY_OFF_T_UNDEF)
1349   {
1350     restore_prev_position();
1351     if (is_binlog_empty())
1352     {
1353       /*
1354         After restoring the previous position, we need to check if
1355         the cache is empty. In such case, the group cache needs to
1356         be cleaned up too because the GTID is removed too from the
1357         cache.
1358 
1359         So if any change happens again, the GTID must be rewritten
1360         and this will not happen if the group cache is not cleaned
1361         up.
1362 
1363         After integrating this with NDB, we need to check if the
1364         current approach is enough or the group cache needs to
1365         explicitly support rollback to savepoints.
1366       */
1367       group_cache.clear();
1368     }
1369   }
1370 
1371   thd->clear_binlog_table_maps();
1372 
1373   DBUG_RETURN(error);
1374 }
1375 
binlog_prepare(handlerton * hton,THD * thd,bool all)1376 static int binlog_prepare(handlerton *hton, THD *thd, bool all)
1377 {
1378   /*
1379     do nothing.
1380     just pretend we can do 2pc, so that MySQL won't
1381     switch to 1pc.
1382     real work will be done in MYSQL_BIN_LOG::commit()
1383   */
1384   return 0;
1385 }
1386 
1387 /**
1388   This function is called once after each statement.
1389 
1390   @todo This function is currently not used any more and will
1391   eventually be eliminated. The real commit job is done in the
1392   MYSQL_BIN_LOG::commit function.
1393 
1394   @see MYSQL_BIN_LOG::commit
1395 
1396   @param hton  The binlog handlerton.
1397   @param thd   The client thread that executes the transaction.
1398   @param all   This is @c true if this is a real transaction commit, and
1399                @false otherwise.
1400 
1401   @see handlerton::commit
1402 */
binlog_commit(handlerton * hton,THD * thd,bool all)1403 static int binlog_commit(handlerton *hton, THD *thd, bool all)
1404 {
1405   DBUG_ENTER("binlog_commit");
1406   /*
1407     Nothing to do (any more) on commit.
1408    */
1409   DBUG_RETURN(0);
1410 }
1411 
1412 /**
1413   This function is called when a transaction or a statement is rolled back.
1414 
1415   @internal It is necessary to execute a rollback here if the
1416   transaction was rolled back because of executing a ROLLBACK TO
1417   SAVEPOINT command, but it is not used for normal rollback since
1418   MYSQL_BIN_LOG::rollback is called in that case.
1419 
1420   @todo Refactor code to introduce a <code>MYSQL_BIN_LOG::rollback(THD
1421   *thd, SAVEPOINT *sv)</code> function in @c TC_LOG and have that
1422   function execute the necessary work to rollback to a savepoint.
1423 
1424   @param hton  The binlog handlerton.
1425   @param thd   The client thread that executes the transaction.
1426   @param all   This is @c true if this is a real transaction rollback, and
1427                @false otherwise.
1428 
1429   @see handlerton::rollback
1430 */
binlog_rollback(handlerton * hton,THD * thd,bool all)1431 static int binlog_rollback(handlerton *hton, THD *thd, bool all)
1432 {
1433   DBUG_ENTER("binlog_rollback");
1434   int error= 0;
1435   if (thd->lex->sql_command == SQLCOM_ROLLBACK_TO_SAVEPOINT)
1436     error= mysql_bin_log.rollback(thd, all);
1437   DBUG_RETURN(error);
1438 }
1439 
1440 
1441 bool
append(THD * first)1442 Stage_manager::Mutex_queue::append(THD *first)
1443 {
1444   DBUG_ENTER("Stage_manager::Mutex_queue::append");
1445   lock();
1446   DBUG_PRINT("enter", ("first: 0x%llx", (ulonglong) first));
1447   DBUG_PRINT("info", ("m_first: 0x%llx, &m_first: 0x%llx, m_last: 0x%llx",
1448                        (ulonglong) m_first, (ulonglong) &m_first,
1449                        (ulonglong) m_last));
1450   bool empty= (m_first == NULL);
1451   *m_last= first;
1452   DBUG_PRINT("info", ("m_first: 0x%llx, &m_first: 0x%llx, m_last: 0x%llx",
1453                        (ulonglong) m_first, (ulonglong) &m_first,
1454                        (ulonglong) m_last));
1455   /*
1456     Go to the last THD instance of the list. We expect lists to be
1457     moderately short. If they are not, we need to track the end of
1458     the queue as well.
1459   */
1460   while (first->next_to_commit)
1461     first= first->next_to_commit;
1462   m_last= &first->next_to_commit;
1463   DBUG_PRINT("info", ("m_first: 0x%llx, &m_first: 0x%llx, m_last: 0x%llx",
1464                         (ulonglong) m_first, (ulonglong) &m_first,
1465                         (ulonglong) m_last));
1466   DBUG_ASSERT(m_first || m_last == &m_first);
1467   DBUG_PRINT("return", ("empty: %s", YESNO(empty)));
1468   unlock();
1469   DBUG_RETURN(empty);
1470 }
1471 
1472 
1473 std::pair<bool, THD*>
pop_front()1474 Stage_manager::Mutex_queue::pop_front()
1475 {
1476   DBUG_ENTER("Stage_manager::Mutex_queue::pop_front");
1477   lock();
1478   THD *result= m_first;
1479   bool more= true;
1480   /*
1481     We do not set next_to_commit to NULL here since this is only used
1482     in the flush stage. We will have to call fetch_queue last here,
1483     and will then "cut" the linked list by setting the end of that
1484     queue to NULL.
1485   */
1486   if (result)
1487     m_first= result->next_to_commit;
1488   if (m_first == NULL)
1489   {
1490     more= false;
1491     m_last = &m_first;
1492   }
1493   DBUG_ASSERT(m_first || m_last == &m_first);
1494   unlock();
1495   DBUG_PRINT("return", ("result: 0x%llx, more: %s",
1496                         (ulonglong) result, YESNO(more)));
1497   DBUG_RETURN(std::make_pair(more, result));
1498 }
1499 
1500 
1501 bool
enroll_for(StageID stage,THD * thd,mysql_mutex_t * stage_mutex)1502 Stage_manager::enroll_for(StageID stage, THD *thd, mysql_mutex_t *stage_mutex)
1503 {
1504   // If the queue was empty: we're the leader for this batch
1505   DBUG_PRINT("debug", ("Enqueue 0x%llx to queue for stage %d",
1506                        (ulonglong) thd, stage));
1507   bool leader= m_queue[stage].append(thd);
1508 
1509   /*
1510     The stage mutex can be NULL if we are enrolling for the first
1511     stage.
1512   */
1513   if (stage_mutex)
1514     mysql_mutex_unlock(stage_mutex);
1515 
1516   /*
1517     If the queue was not empty, we're a follower and wait for the
1518     leader to process the queue. If we were holding a mutex, we have
1519     to release it before going to sleep.
1520   */
1521   if (!leader)
1522   {
1523     mysql_mutex_lock(&m_lock_done);
1524 #ifndef DBUG_OFF
1525     /*
1526       Leader can be awaiting all-clear to preempt follower's execution.
1527       With setting the status the follower ensures it won't execute anything
1528       including thread-specific code.
1529     */
1530     thd->transaction.flags.ready_preempt= 1;
1531     if (leader_await_preempt_status)
1532       mysql_cond_signal(&m_cond_preempt);
1533 #endif
1534     while (thd->transaction.flags.pending)
1535       mysql_cond_wait(&m_cond_done, &m_lock_done);
1536     mysql_mutex_unlock(&m_lock_done);
1537   }
1538   return leader;
1539 }
1540 
1541 
fetch_and_empty()1542 THD *Stage_manager::Mutex_queue::fetch_and_empty()
1543 {
1544   DBUG_ENTER("Stage_manager::Mutex_queue::fetch_and_empty");
1545   lock();
1546   DBUG_PRINT("enter", ("m_first: 0x%llx, &m_first: 0x%llx, m_last: 0x%llx",
1547                        (ulonglong) m_first, (ulonglong) &m_first,
1548                        (ulonglong) m_last));
1549   THD *result= m_first;
1550   m_first= NULL;
1551   m_last= &m_first;
1552   DBUG_PRINT("info", ("m_first: 0x%llx, &m_first: 0x%llx, m_last: 0x%llx",
1553                        (ulonglong) m_first, (ulonglong) &m_first,
1554                        (ulonglong) m_last));
1555   DBUG_ASSERT(m_first || m_last == &m_first);
1556   DBUG_PRINT("return", ("result: 0x%llx", (ulonglong) result));
1557   unlock();
1558   DBUG_RETURN(result);
1559 }
1560 
1561 #ifndef DBUG_OFF
clear_preempt_status(THD * head)1562 void Stage_manager::clear_preempt_status(THD *head)
1563 {
1564   DBUG_ASSERT(head);
1565 
1566   mysql_mutex_lock(&m_lock_done);
1567   while(!head->transaction.flags.ready_preempt)
1568   {
1569     leader_await_preempt_status= true;
1570     mysql_cond_wait(&m_cond_preempt, &m_lock_done);
1571   }
1572   leader_await_preempt_status= false;
1573   mysql_mutex_unlock(&m_lock_done);
1574 }
1575 #endif
1576 
1577 /**
1578   Write a rollback record of the transaction to the binary log.
1579 
1580   For binary log group commit, the rollback is separated into three
1581   parts:
1582 
1583   1. First part consists of filling the necessary caches and
1584      finalizing them (if they need to be finalized). After a cache is
1585      finalized, nothing can be added to the cache.
1586 
1587   2. Second part execute an ordered flush and commit. This will be
1588      done using the group commit functionality in @c ordered_commit.
1589 
1590      Since we roll back the transaction early, we call @c
1591      ordered_commit with the @c skip_commit flag set. The @c
1592      ha_commit_low call inside @c ordered_commit will then not be
1593      called.
1594 
1595   3. Third part checks any errors resulting from the flush and handles
1596      them appropriately.
1597 
1598   @see MYSQL_BIN_LOG::ordered_commit
1599   @see ha_commit_low
1600   @see ha_rollback_low
1601 
1602   @param thd Session to commit
1603   @param all This is @c true if this is a real transaction rollback, and
1604              @false otherwise.
1605 
1606   @return Error code, or zero if there were no error.
1607  */
1608 
rollback(THD * thd,bool all)1609 int MYSQL_BIN_LOG::rollback(THD *thd, bool all)
1610 {
1611   int error= 0;
1612   bool stuff_logged= false;
1613 
1614   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(thd);
1615   DBUG_ENTER("MYSQL_BIN_LOG::rollback(THD *thd, bool all)");
1616   DBUG_PRINT("enter", ("all: %s, cache_mngr: 0x%llx, thd->is_error: %s",
1617                        YESNO(all), (ulonglong) cache_mngr, YESNO(thd->is_error())));
1618 
1619   /*
1620     We roll back the transaction in the engines early since this will
1621     release locks and allow other transactions to start executing.
1622 
1623     If we are executing a ROLLBACK TO SAVEPOINT, we should only clear
1624     the caches since this function is called as part of the engine
1625     rollback.
1626    */
1627   if (thd->lex->sql_command != SQLCOM_ROLLBACK_TO_SAVEPOINT)
1628     if ((error= ha_rollback_low(thd, all)))
1629       goto end;
1630 
1631   /*
1632     If there is no cache manager, or if there is nothing in the
1633     caches, there are no caches to roll back, so we're trivially done.
1634    */
1635   if (cache_mngr == NULL || cache_mngr->is_binlog_empty())
1636     goto end;
1637 
1638   DBUG_PRINT("debug",
1639              ("all.cannot_safely_rollback(): %s, trx_cache_empty: %s",
1640               YESNO(thd->transaction.all.cannot_safely_rollback()),
1641               YESNO(cache_mngr->trx_cache.is_binlog_empty())));
1642   DBUG_PRINT("debug",
1643              ("stmt.cannot_safely_rollback(): %s, stmt_cache_empty: %s",
1644               YESNO(thd->transaction.stmt.cannot_safely_rollback()),
1645               YESNO(cache_mngr->stmt_cache.is_binlog_empty())));
1646 
1647   /*
1648     If an incident event is set we do not flush the content of the statement
1649     cache because it may be corrupted.
1650   */
1651   if (cache_mngr->stmt_cache.has_incident())
1652   {
1653     error= write_incident(thd, true/*need_lock_log=true*/);
1654     cache_mngr->stmt_cache.reset();
1655   }
1656   else if (!cache_mngr->stmt_cache.is_binlog_empty())
1657   {
1658     if ((error= cache_mngr->stmt_cache.finalize(thd)))
1659       goto end;
1660     stuff_logged= true;
1661   }
1662 
1663   if (ending_trans(thd, all))
1664   {
1665     if (trans_cannot_safely_rollback(thd))
1666     {
1667       /*
1668         If the transaction is being rolled back and contains changes that
1669         cannot be rolled back, the trx-cache's content is flushed.
1670       */
1671       Query_log_event
1672         end_evt(thd, STRING_WITH_LEN("ROLLBACK"), true, false, true, 0, true);
1673       error= cache_mngr->trx_cache.finalize(thd, &end_evt);
1674       stuff_logged= true;
1675     }
1676     else
1677     {
1678       /*
1679         If the transaction is being rolled back and its changes can be
1680         rolled back, the trx-cache's content is truncated.
1681       */
1682       error= cache_mngr->trx_cache.truncate(thd, all);
1683     }
1684   }
1685   else
1686   {
1687     /*
1688       If a statement is being rolled back, it is necessary to know
1689       exactly why a statement may not be safely rolled back as in
1690       some specific situations the trx-cache can be truncated.
1691 
1692       If a temporary table is created or dropped, the trx-cache is not
1693       truncated. Note that if the stmt-cache is used, there is nothing
1694       to truncate in the trx-cache.
1695 
1696       If a non-transactional table is updated and the binlog format is
1697       statement, the trx-cache is not truncated. The trx-cache is used
1698       when the direct option is off and a transactional table has been
1699       updated before the current statement in the context of the
1700       current transaction. Note that if the stmt-cache is used there is
1701       nothing to truncate in the trx-cache.
1702 
1703       If other binlog formats are used, updates to non-transactional
1704       tables are written to the stmt-cache and trx-cache can be safely
1705       truncated, if necessary.
1706     */
1707     if (thd->transaction.stmt.has_dropped_temp_table() ||
1708         thd->transaction.stmt.has_created_temp_table() ||
1709         (thd->transaction.stmt.has_modified_non_trans_table() &&
1710         thd->variables.binlog_format == BINLOG_FORMAT_STMT))
1711     {
1712       /*
1713         If the statement is being rolled back and dropped or created a
1714         temporary table or modified a non-transactional table and the
1715         statement-based replication is in use, the statement's changes
1716         in the trx-cache are preserved.
1717       */
1718       cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
1719     }
1720     else
1721     {
1722       /*
1723         Otherwise, the statement's changes in the trx-cache are
1724         truncated.
1725       */
1726       error= cache_mngr->trx_cache.truncate(thd, all);
1727     }
1728   }
1729 
1730   DBUG_PRINT("debug", ("error: %d", error));
1731   if (error == 0 && stuff_logged)
1732     error= ordered_commit(thd, all, /* skip_commit */ true);
1733 
1734   if (check_write_error(thd))
1735   {
1736     /*
1737       "all == true" means that a "rollback statement" triggered the error and
1738       this function was called. However, this must not happen as a rollback
1739       is written directly to the binary log. And in auto-commit mode, a single
1740       statement that is rolled back has the flag all == false.
1741     */
1742     DBUG_ASSERT(!all);
1743     /*
1744       We reach this point if the effect of a statement did not properly get into
1745       a cache and need to be rolled back.
1746     */
1747     error |= cache_mngr->trx_cache.truncate(thd, all);
1748   }
1749 
1750 end:
1751   /*
1752     When a statement errors out on auto-commit mode it is rollback
1753     implicitly, so the same should happen to its GTID.
1754   */
1755   if (!thd->in_active_multi_stmt_transaction())
1756     gtid_rollback(thd);
1757 
1758   DBUG_PRINT("return", ("error: %d", error));
1759   DBUG_RETURN(error);
1760 }
1761 
1762 /**
1763   @note
1764   How do we handle this (unlikely but legal) case:
1765   @verbatim
1766     [transaction] + [update to non-trans table] + [rollback to savepoint] ?
1767   @endverbatim
1768   The problem occurs when a savepoint is before the update to the
1769   non-transactional table. Then when there's a rollback to the savepoint, if we
1770   simply truncate the binlog cache, we lose the part of the binlog cache where
1771   the update is. If we want to not lose it, we need to write the SAVEPOINT
1772   command and the ROLLBACK TO SAVEPOINT command to the binlog cache. The latter
1773   is easy: it's just write at the end of the binlog cache, but the former
1774   should be *inserted* to the place where the user called SAVEPOINT. The
1775   solution is that when the user calls SAVEPOINT, we write it to the binlog
1776   cache (so no need to later insert it). As transactions are never intermixed
1777   in the binary log (i.e. they are serialized), we won't have conflicts with
1778   savepoint names when using mysqlbinlog or in the slave SQL thread.
1779   Then when ROLLBACK TO SAVEPOINT is called, if we updated some
1780   non-transactional table, we don't truncate the binlog cache but instead write
1781   ROLLBACK TO SAVEPOINT to it; otherwise we truncate the binlog cache (which
1782   will chop the SAVEPOINT command from the binlog cache, which is good as in
1783   that case there is no need to have it in the binlog).
1784 */
1785 
binlog_savepoint_set(handlerton * hton,THD * thd,void * sv)1786 static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv)
1787 {
1788   DBUG_ENTER("binlog_savepoint_set");
1789   int error= 1;
1790 
1791   String log_query;
1792   if (log_query.append(STRING_WITH_LEN("SAVEPOINT ")))
1793     DBUG_RETURN(error);
1794   else
1795     append_identifier(thd, &log_query, thd->lex->ident.str,
1796                       thd->lex->ident.length);
1797 
1798   int errcode= query_error_code(thd, thd->killed == THD::NOT_KILLED);
1799   Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(),
1800                         TRUE, FALSE, TRUE, errcode);
1801   /*
1802     We cannot record the position before writing the statement
1803     because a rollback to a savepoint (.e.g. consider it "S") would
1804     prevent the savepoint statement (i.e. "SAVEPOINT S") from being
1805     written to the binary log despite the fact that the server could
1806     still issue other rollback statements to the same savepoint (i.e.
1807     "S").
1808     Given that the savepoint is valid until the server releases it,
1809     ie, until the transaction commits or it is released explicitly,
1810     we need to log it anyway so that we don't have "ROLLBACK TO S"
1811     or "RELEASE S" without the preceding "SAVEPOINT S" in the binary
1812     log.
1813   */
1814   if (!(error= mysql_bin_log.write_event(&qinfo)))
1815     binlog_trans_log_savepos(thd, (my_off_t*) sv);
1816 
1817   DBUG_RETURN(error);
1818 }
1819 
binlog_savepoint_rollback(handlerton * hton,THD * thd,void * sv)1820 static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv)
1821 {
1822   DBUG_ENTER("binlog_savepoint_rollback");
1823   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(thd);
1824   my_off_t pos= *(my_off_t*) sv;
1825   DBUG_ASSERT(pos != ~(my_off_t) 0);
1826 
1827   /*
1828     Write ROLLBACK TO SAVEPOINT to the binlog cache if we have updated some
1829     non-transactional table. Otherwise, truncate the binlog cache starting
1830     from the SAVEPOINT command.
1831   */
1832   if (trans_cannot_safely_rollback(thd))
1833   {
1834     String log_query;
1835     if (log_query.append(STRING_WITH_LEN("ROLLBACK TO ")))
1836       DBUG_RETURN(1);
1837     else
1838     {
1839       /*
1840         Before writing identifier to the binlog, make sure to
1841         quote the identifier properly so as to prevent any SQL
1842         injection on the slave.
1843       */
1844       append_identifier(thd, &log_query, thd->lex->ident.str,
1845                         thd->lex->ident.length);
1846     }
1847 
1848     int errcode= query_error_code(thd, thd->killed == THD::NOT_KILLED);
1849     Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(),
1850                           TRUE, FALSE, TRUE, errcode);
1851     DBUG_RETURN(mysql_bin_log.write_event(&qinfo));
1852   }
1853   // Otherwise, we truncate the cache
1854   cache_mngr->trx_cache.restore_savepoint(pos);
1855   /*
1856     When a SAVEPOINT is executed inside a stored function/trigger we force the
1857     pending event to be flushed with a STMT_END_F flag and clear the table maps
1858     as well to ensure that following DMLs will have a clean state to start
1859     with. ROLLBACK inside a stored routine has to finalize possibly existing
1860     current row-based pending event with cleaning up table maps. That ensures
1861     that following DMLs will have a clean state to start with.
1862    */
1863   if (thd->in_sub_stmt)
1864     thd->clear_binlog_table_maps();
1865   if (cache_mngr->trx_cache.is_binlog_empty())
1866     cache_mngr->trx_cache.group_cache.clear();
1867   DBUG_RETURN(0);
1868 }
1869 
1870 /**
1871   Check whether binlog state allows to safely release MDL locks after
1872   rollback to savepoint.
1873 
1874   @param hton  The binlog handlerton.
1875   @param thd   The client thread that executes the transaction.
1876 
1877   @return true  - It is safe to release MDL locks.
1878           false - If it is not.
1879 */
binlog_savepoint_rollback_can_release_mdl(handlerton * hton,THD * thd)1880 static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
1881                                                       THD *thd)
1882 {
1883   DBUG_ENTER("binlog_savepoint_rollback_can_release_mdl");
1884   /*
1885     If we have not updated any non-transactional tables rollback
1886     to savepoint will simply truncate binlog cache starting from
1887     SAVEPOINT command. So it should be safe to release MDL acquired
1888     after SAVEPOINT command in this case.
1889   */
1890   DBUG_RETURN(!trans_cannot_safely_rollback(thd));
1891 }
1892 
1893 #ifdef HAVE_REPLICATION
1894 
1895 /*
1896   Adjust the position pointer in the binary log file for all running slaves
1897 
1898   SYNOPSIS
1899     adjust_linfo_offsets()
1900     purge_offset	Number of bytes removed from start of log index file
1901 
1902   NOTES
1903     - This is called when doing a PURGE when we delete lines from the
1904       index log file
1905 
1906   REQUIREMENTS
1907     - Before calling this function, we have to ensure that no threads are
1908       using any binary log file before purge_offset.a
1909 
1910   TODO
1911     - Inform the slave threads that they should sync the position
1912       in the binary log file with flush_relay_log_info.
1913       Now they sync is done for next read.
1914 */
1915 
adjust_linfo_offsets(my_off_t purge_offset)1916 static void adjust_linfo_offsets(my_off_t purge_offset)
1917 {
1918   mysql_mutex_lock(&LOCK_thread_count);
1919 
1920   Thread_iterator it= global_thread_list_begin();
1921   Thread_iterator end= global_thread_list_end();
1922   for (; it != end; ++it)
1923   {
1924     LOG_INFO* linfo;
1925     if ((linfo = (*it)->current_linfo))
1926     {
1927       mysql_mutex_lock(&linfo->lock);
1928       /*
1929 	Index file offset can be less that purge offset only if
1930 	we just started reading the index file. In that case
1931 	we have nothing to adjust
1932       */
1933       if (linfo->index_file_offset < purge_offset)
1934 	linfo->fatal = (linfo->index_file_offset != 0);
1935       else
1936 	linfo->index_file_offset -= purge_offset;
1937       mysql_mutex_unlock(&linfo->lock);
1938     }
1939   }
1940   mysql_mutex_unlock(&LOCK_thread_count);
1941 }
1942 
1943 
log_in_use(const char * log_name)1944 static int log_in_use(const char* log_name)
1945 {
1946   size_t log_name_len = strlen(log_name) + 1;
1947   int thread_count=0;
1948 #ifndef DBUG_OFF
1949   if (current_thd)
1950     DEBUG_SYNC(current_thd,"purge_logs_after_lock_index_before_thread_count");
1951 #endif
1952   mysql_mutex_lock(&LOCK_thread_count);
1953 
1954   Thread_iterator it= global_thread_list_begin();
1955   Thread_iterator end= global_thread_list_end();
1956   for (; it != end; ++it)
1957   {
1958     LOG_INFO* linfo;
1959     if ((linfo = (*it)->current_linfo))
1960     {
1961       mysql_mutex_lock(&linfo->lock);
1962       if(!strncmp(log_name, linfo->log_file_name, log_name_len))
1963       {
1964         thread_count++;
1965         sql_print_warning("file %s was not purged because it was being read"
1966                           "by thread number %llu", log_name,
1967                           (ulonglong)(*it)->thread_id);
1968       }
1969       mysql_mutex_unlock(&linfo->lock);
1970     }
1971   }
1972 
1973   mysql_mutex_unlock(&LOCK_thread_count);
1974   return thread_count;
1975 }
1976 
purge_error_message(THD * thd,int res)1977 static bool purge_error_message(THD* thd, int res)
1978 {
1979   uint errcode;
1980 
1981   if ((errcode= purge_log_get_error_code(res)) != 0)
1982   {
1983     my_message(errcode, ER(errcode), MYF(0));
1984     return TRUE;
1985   }
1986   my_ok(thd);
1987   return FALSE;
1988 }
1989 
1990 #endif /* HAVE_REPLICATION */
1991 
check_binlog_magic(IO_CACHE * log,const char ** errmsg)1992 int check_binlog_magic(IO_CACHE* log, const char** errmsg)
1993 {
1994   char magic[4];
1995   DBUG_ASSERT(my_b_tell(log) == 0);
1996 
1997   if (my_b_read(log, (uchar*) magic, sizeof(magic)))
1998   {
1999     *errmsg = "I/O error reading the header from the binary log";
2000     sql_print_error("%s, errno=%d, io cache code=%d", *errmsg, my_errno,
2001 		    log->error);
2002     return 1;
2003   }
2004   if (memcmp(magic, BINLOG_MAGIC, sizeof(magic)))
2005   {
2006     *errmsg = "Binlog has bad magic number;  It's not a binary log file that can be used by this version of MySQL";
2007     return 1;
2008   }
2009   return 0;
2010 }
2011 
2012 
open_binlog_file(IO_CACHE * log,const char * log_file_name,const char ** errmsg)2013 File open_binlog_file(IO_CACHE *log, const char *log_file_name, const char **errmsg)
2014 {
2015   File file;
2016   DBUG_ENTER("open_binlog_file");
2017 
2018   if ((file= mysql_file_open(key_file_binlog,
2019                              log_file_name, O_RDONLY | O_BINARY | O_SHARE,
2020                              MYF(MY_WME))) < 0)
2021   {
2022     sql_print_error("Failed to open log (file '%s', errno %d)",
2023                     log_file_name, my_errno);
2024     *errmsg = "Could not open log file";
2025     goto err;
2026   }
2027   if (init_io_cache(log, file, IO_SIZE*2, READ_CACHE, 0, 0,
2028                     MYF(MY_WME|MY_DONT_CHECK_FILESIZE)))
2029   {
2030     sql_print_error("Failed to create a cache on log (file '%s')",
2031                     log_file_name);
2032     *errmsg = "Could not open log file";
2033     goto err;
2034   }
2035   if (check_binlog_magic(log,errmsg))
2036     goto err;
2037   DBUG_RETURN(file);
2038 
2039 err:
2040   if (file >= 0)
2041   {
2042     mysql_file_close(file, MYF(0));
2043     end_io_cache(log);
2044   }
2045   DBUG_RETURN(-1);
2046 }
2047 
2048 /**
2049   This function checks if a transactional table was updated by the
2050   current transaction.
2051 
2052   @param thd The client thread that executed the current statement.
2053   @return
2054     @c true if a transactional table was updated, @c false otherwise.
2055 */
2056 bool
trans_has_updated_trans_table(const THD * thd)2057 trans_has_updated_trans_table(const THD* thd)
2058 {
2059   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(thd);
2060 
2061   return (cache_mngr ? !cache_mngr->trx_cache.is_binlog_empty() : 0);
2062 }
2063 
2064 /**
2065   This function checks if a transactional table was updated by the
2066   current statement.
2067 
2068   @param ha_list Registered storage engine handler list.
2069   @return
2070     @c true if a transactional table was updated, @c false otherwise.
2071 */
2072 bool
stmt_has_updated_trans_table(Ha_trx_info * ha_list)2073 stmt_has_updated_trans_table(Ha_trx_info* ha_list)
2074 {
2075   Ha_trx_info *ha_info;
2076 
2077   for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
2078   {
2079     if (ha_info->is_trx_read_write() && ha_info->ht() != binlog_hton)
2080       return (TRUE);
2081   }
2082   return (FALSE);
2083 }
2084 
2085 /**
2086   This function checks if a transaction, either a multi-statement
2087   or a single statement transaction is about to commit or not.
2088 
2089   @param thd The client thread that executed the current statement.
2090   @param all Committing a transaction (i.e. TRUE) or a statement
2091              (i.e. FALSE).
2092   @return
2093     @c true if committing a transaction, otherwise @c false.
2094 */
ending_trans(THD * thd,const bool all)2095 bool ending_trans(THD* thd, const bool all)
2096 {
2097   return (all || ending_single_stmt_trans(thd, all));
2098 }
2099 
2100 /**
2101   This function checks if a single statement transaction is about
2102   to commit or not.
2103 
2104   @param thd The client thread that executed the current statement.
2105   @param all Committing a transaction (i.e. TRUE) or a statement
2106              (i.e. FALSE).
2107   @return
2108     @c true if committing a single statement transaction, otherwise
2109     @c false.
2110 */
ending_single_stmt_trans(THD * thd,const bool all)2111 bool ending_single_stmt_trans(THD* thd, const bool all)
2112 {
2113   return (!all && !thd->in_multi_stmt_transaction_mode());
2114 }
2115 
2116 /**
2117   This function checks if a transaction cannot be rolled back safely.
2118 
2119   @param thd The client thread that executed the current statement.
2120   @return
2121     @c true if cannot be safely rolled back, @c false otherwise.
2122 */
trans_cannot_safely_rollback(const THD * thd)2123 bool trans_cannot_safely_rollback(const THD* thd)
2124 {
2125   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(thd);
2126 
2127   return cache_mngr->trx_cache.cannot_rollback();
2128 }
2129 
2130 /**
2131   This function checks if current statement cannot be rollded back safely.
2132 
2133   @param thd The client thread that executed the current statement.
2134   @return
2135     @c true if cannot be safely rolled back, @c false otherwise.
2136 */
stmt_cannot_safely_rollback(const THD * thd)2137 bool stmt_cannot_safely_rollback(const THD* thd)
2138 {
2139   return thd->transaction.stmt.cannot_safely_rollback();
2140 }
2141 
2142 #ifndef EMBEDDED_LIBRARY
2143 /**
2144   Execute a PURGE BINARY LOGS TO <log> command.
2145 
2146   @param thd Pointer to THD object for the client thread executing the
2147   statement.
2148 
2149   @param to_log Name of the last log to purge.
2150 
2151   @retval FALSE success
2152   @retval TRUE failure
2153 */
purge_master_logs(THD * thd,const char * to_log)2154 bool purge_master_logs(THD* thd, const char* to_log)
2155 {
2156   char search_file_name[FN_REFLEN];
2157   if (!mysql_bin_log.is_open())
2158   {
2159     my_ok(thd);
2160     return FALSE;
2161   }
2162 
2163   mysql_bin_log.make_log_name(search_file_name, to_log);
2164   return purge_error_message(thd,
2165                              mysql_bin_log.purge_logs(search_file_name, false,
2166                                                       true/*need_lock_index=true*/,
2167                                                       true/*need_update_threads=true*/,
2168                                                       NULL, false));
2169 }
2170 
2171 
2172 /**
2173   Execute a PURGE BINARY LOGS BEFORE <date> command.
2174 
2175   @param thd Pointer to THD object for the client thread executing the
2176   statement.
2177 
2178   @param purge_time Date before which logs should be purged.
2179 
2180   @retval FALSE success
2181   @retval TRUE failure
2182 */
purge_master_logs_before_date(THD * thd,time_t purge_time)2183 bool purge_master_logs_before_date(THD* thd, time_t purge_time)
2184 {
2185   if (!mysql_bin_log.is_open())
2186   {
2187     my_ok(thd);
2188     return 0;
2189   }
2190   return purge_error_message(thd,
2191                              mysql_bin_log.purge_logs_before_date(purge_time,
2192                                                                   false));
2193 }
2194 #endif /* EMBEDDED_LIBRARY */
2195 
2196 /*
2197   Helper function to get the error code of the query to be binlogged.
2198  */
query_error_code(THD * thd,bool not_killed)2199 int query_error_code(THD *thd, bool not_killed)
2200 {
2201   int error;
2202 
2203   if (not_killed || (thd->killed == THD::KILL_BAD_DATA))
2204   {
2205     error= thd->is_error() ? thd->get_stmt_da()->sql_errno() : 0;
2206 
2207     /* thd->get_stmt_da()->sql_errno() might be ER_SERVER_SHUTDOWN or
2208        ER_QUERY_INTERRUPTED, So here we need to make sure that error
2209        is not set to these errors when specified not_killed by the
2210        caller.
2211     */
2212     if (error == ER_SERVER_SHUTDOWN || error == ER_QUERY_INTERRUPTED)
2213       error= 0;
2214   }
2215   else
2216   {
2217     /* killed status for DELAYED INSERT thread should never be used */
2218     DBUG_ASSERT(!(thd->system_thread & SYSTEM_THREAD_DELAYED_INSERT));
2219     error= thd->killed_errno();
2220   }
2221 
2222   return error;
2223 }
2224 
2225 
2226 /**
2227   Copy content of 'from' file from offset to 'to' file.
2228 
2229   - We do the copy outside of the IO_CACHE as the cache
2230   buffers would just make things slower and more complicated.
2231   In most cases the copy loop should only do one read.
2232 
2233   @param from          File to copy.
2234   @param to            File to copy to.
2235   @param offset        Offset in 'from' file.
2236 
2237 
2238   @retval
2239     0    ok
2240   @retval
2241     -1    error
2242 */
copy_file(IO_CACHE * from,IO_CACHE * to,my_off_t offset)2243 static bool copy_file(IO_CACHE *from, IO_CACHE *to, my_off_t offset)
2244 {
2245   int bytes_read;
2246   uchar io_buf[IO_SIZE*2];
2247   DBUG_ENTER("copy_file");
2248 
2249   mysql_file_seek(from->file, offset, MY_SEEK_SET, MYF(0));
2250   while(TRUE)
2251   {
2252     if ((bytes_read= (int) mysql_file_read(from->file, io_buf, sizeof(io_buf),
2253                                            MYF(MY_WME)))
2254         < 0)
2255       goto err;
2256     if (DBUG_EVALUATE_IF("fault_injection_copy_part_file", 1, 0))
2257       bytes_read= bytes_read/2;
2258     if (!bytes_read)
2259       break;                                    // end of file
2260     if (mysql_file_write(to->file, io_buf, bytes_read, MYF(MY_WME | MY_NABP)))
2261       goto err;
2262   }
2263 
2264   DBUG_RETURN(0);
2265 
2266 err:
2267   DBUG_RETURN(1);
2268 }
2269 
2270 
2271 #ifdef HAVE_REPLICATION
2272 /**
2273    Load data's io cache specific hook to be executed
2274    before a chunk of data is being read into the cache's buffer
2275    The fuction instantianates and writes into the binlog
2276    replication events along LOAD DATA processing.
2277 
2278    @param file  pointer to io-cache
2279    @retval 0 success
2280    @retval 1 failure
2281 */
log_loaded_block(IO_CACHE * file)2282 int log_loaded_block(IO_CACHE* file)
2283 {
2284   DBUG_ENTER("log_loaded_block");
2285   LOAD_FILE_INFO *lf_info;
2286   uint block_len;
2287   /* buffer contains position where we started last read */
2288   uchar* buffer= (uchar*) my_b_get_buffer_start(file);
2289   uint max_event_size= current_thd->variables.max_allowed_packet;
2290   lf_info= (LOAD_FILE_INFO*) file->arg;
2291   if (lf_info->thd->is_current_stmt_binlog_format_row())
2292     DBUG_RETURN(0);
2293   if (lf_info->last_pos_in_file != HA_POS_ERROR &&
2294       lf_info->last_pos_in_file >= my_b_get_pos_in_file(file))
2295     DBUG_RETURN(0);
2296 
2297   for (block_len= (uint) (my_b_get_bytes_in_buffer(file)); block_len > 0;
2298        buffer += min(block_len, max_event_size),
2299        block_len -= min(block_len, max_event_size))
2300   {
2301     lf_info->last_pos_in_file= my_b_get_pos_in_file(file);
2302     if (lf_info->wrote_create_file)
2303     {
2304       Append_block_log_event a(lf_info->thd, lf_info->thd->db, buffer,
2305                                min(block_len, max_event_size),
2306                                lf_info->log_delayed);
2307       if (mysql_bin_log.write_event(&a))
2308         DBUG_RETURN(1);
2309     }
2310     else
2311     {
2312       Begin_load_query_log_event b(lf_info->thd, lf_info->thd->db,
2313                                    buffer,
2314                                    min(block_len, max_event_size),
2315                                    lf_info->log_delayed);
2316       if (mysql_bin_log.write_event(&b))
2317         DBUG_RETURN(1);
2318       lf_info->wrote_create_file= 1;
2319     }
2320   }
2321   DBUG_RETURN(0);
2322 }
2323 
/*
  Helper function for SHOW BINLOG/RELAYLOG EVENTS

  Locates the requested log file (or the first one in the index when no
  name was given), opens it, and sends the events found from the
  requested position on to the client, honoring the OFFSET/LIMIT values
  of the statement.

  While the file is being read, thd->current_linfo points at a LOG_INFO
  that lives on this function's stack; it is reset before returning.
  thd->variables.max_allowed_packet is temporarily raised and restored on
  exit.

  On failure the error is reported to the client with
  ER_ERROR_WHEN_EXECUTING_COMMAND; otherwise an EOF is sent. If the log
  is not open, no events are sent and the call succeeds.

  Returns FALSE on success, TRUE on failure.
*/
bool show_binlog_events(THD *thd, MYSQL_BIN_LOG *binary_log)
{
  Protocol *protocol= thd->protocol;
  List<Item> field_list;
  const char *errmsg = 0;
  bool ret = TRUE;                      /* pessimistic; cleared on success */
  IO_CACHE log;
  File file = -1;                       /* -1: nothing to close at 'err' */
  int old_max_allowed_packet= thd->variables.max_allowed_packet;
  LOG_INFO linfo;

  DBUG_ENTER("show_binlog_events");

  DBUG_ASSERT(thd->lex->sql_command == SQLCOM_SHOW_BINLOG_EVENTS ||
              thd->lex->sql_command == SQLCOM_SHOW_RELAYLOG_EVENTS);

  Format_description_log_event *description_event= new
    Format_description_log_event(3); /* MySQL 4.0 by default */

  if (binary_log->is_open())
  {
    LEX_MASTER_INFO *lex_mi= &thd->lex->mi;
    SELECT_LEX_UNIT *unit= &thd->lex->unit;
    ha_rows event_count, limit_start, limit_end;
    /* Positions below the header size are bumped up to the first event. */
    my_off_t pos = max<my_off_t>(BIN_LOG_HEADER_SIZE, lex_mi->pos); // user-friendly
    char search_file_name[FN_REFLEN], *name;
    const char *log_file_name = lex_mi->log_file_name;
    mysql_mutex_t *log_lock = binary_log->get_log_lock();
    Log_event* ev;

    /* Translate the statement's LIMIT clause into event counters. */
    unit->set_limit(thd->lex->current_select);
    limit_start= unit->offset_limit_cnt;
    limit_end= unit->select_limit_cnt;

    name= search_file_name;
    if (log_file_name)
      binary_log->make_log_name(search_file_name, log_file_name);
    else
      name=0;					// Find first log

    linfo.index_file_offset = 0;

    if (binary_log->find_log_pos(&linfo, name, true/*need_lock_index=true*/))
    {
      errmsg = "Could not find target log";
      goto err;
    }

    /*
      Publish linfo through thd->current_linfo under LOCK_thread_count, the
      same lock that the thread-list walkers in this file hold while they
      inspect current_linfo.
    */
    mysql_mutex_lock(&LOCK_thread_count);
    thd->current_linfo = &linfo;
    mysql_mutex_unlock(&LOCK_thread_count);

    if ((file=open_binlog_file(&log, linfo.log_file_name, &errmsg)) < 0)
      goto err;

    my_off_t end_pos;
    /*
      Acquire LOCK_log only for the duration to calculate the
      log's end position. LOCK_log should be acquired even while
      we are checking whether the log is active log or not.
    */
    mysql_mutex_lock(log_lock);
    if (binary_log->is_active(linfo.log_file_name))
    {
      LOG_INFO li;
      binary_log->get_current_log(&li, false /*LOCK_log is already acquired*/);
      end_pos= li.pos;
    }
    else
    {
      end_pos= my_b_filelength(&log);
    }
    mysql_mutex_unlock(log_lock);

    /*
      to account binlog event header size
    */
    thd->variables.max_allowed_packet += MAX_LOG_EVENT_HEADER;

    DEBUG_SYNC(thd, "after_show_binlog_event_found_file");

    /*
      open_binlog_file() sought to position 4.
      Read the first event in case it's a Format_description_log_event, to
      know the format. If there's no such event, we are 3.23 or 4.x. This
      code, like before, can't read 3.23 binlogs.
      This code will fail on a mixed relay log (one which has Format_desc then
      Rotate then Format_desc).
    */
    ev= Log_event::read_log_event(&log, (mysql_mutex_t*)0, description_event,
                                   opt_master_verify_checksum);
    if (ev)
    {
      if (ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
      {
        /* The file's own description replaces the default one. */
        delete description_event;
        description_event= (Format_description_log_event*) ev;
      }
      else
        delete ev;
    }

    /* Continue reading from the position the user asked for. */
    my_b_seek(&log, pos);

    if (!description_event->is_valid())
    {
      errmsg="Invalid Format_description event; could be out of memory";
      goto err;
    }

    /*
      Read events one by one. Events before limit_start are read but not
      sent. The loop stops when limit_end events have been seen, when the
      end position computed above is reached, or when no event can be read.
    */
    for (event_count = 0;
         (ev = Log_event::read_log_event(&log, (mysql_mutex_t*) 0,
                                         description_event,
                                         opt_master_verify_checksum)); )
    {
      DEBUG_SYNC(thd, "wait_in_show_binlog_events_loop");
      if (ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
        description_event->checksum_alg= ev->checksum_alg;

      if (event_count >= limit_start &&
	  ev->net_send(protocol, linfo.log_file_name, pos))
      {
	errmsg = "Net error";
	delete ev;
	goto err;
      }

      pos = my_b_tell(&log);
      delete ev;

      if (++event_count >= limit_end || pos >= end_pos)
	break;
    }

    /* The loop ended early and the cache reports an error. */
    if (event_count < limit_end && log.error)
    {
      errmsg = "Wrong offset or I/O error";
      goto err;
    }

  }
  // Check that linfo is still on the function scope.
  DEBUG_SYNC(thd, "after_show_binlog_events");

  ret= FALSE;

err:
  /* Common exit path, reached on success as well as on error. */
  delete description_event;
  if (file >= 0)
  {
    end_io_cache(&log);
    mysql_file_close(file, MYF(MY_WME));
  }

  if (errmsg)
    my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0),
             "SHOW BINLOG EVENTS", errmsg);
  else
    my_eof(thd);

  /* linfo goes out of scope on return, so unpublish it first. */
  mysql_mutex_lock(&LOCK_thread_count);
  thd->current_linfo = 0;
  mysql_mutex_unlock(&LOCK_thread_count);
  thd->variables.max_allowed_packet= old_max_allowed_packet;
  DBUG_RETURN(ret);
}
2491 
2492 /**
2493   Execute a SHOW BINLOG EVENTS statement.
2494 
2495   @param thd Pointer to THD object for the client thread executing the
2496   statement.
2497 
2498   @retval FALSE success
2499   @retval TRUE failure
2500 */
mysql_show_binlog_events(THD * thd)2501 bool mysql_show_binlog_events(THD* thd)
2502 {
2503   Protocol *protocol= thd->protocol;
2504   List<Item> field_list;
2505   DBUG_ENTER("mysql_show_binlog_events");
2506 
2507   DBUG_ASSERT(thd->lex->sql_command == SQLCOM_SHOW_BINLOG_EVENTS);
2508 
2509   Log_event::init_show_field_list(&field_list);
2510   if (protocol->send_result_set_metadata(&field_list,
2511                             Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
2512     DBUG_RETURN(TRUE);
2513 
2514   /*
2515     Wait for handlers to insert any pending information
2516     into the binlog.  For e.g. ndb which updates the binlog asynchronously
2517     this is needed so that the uses sees all its own commands in the binlog
2518   */
2519   ha_binlog_wait(thd);
2520 
2521   DBUG_RETURN(show_binlog_events(thd, &mysql_bin_log));
2522 }
2523 
2524 #endif /* HAVE_REPLICATION */
2525 
2526 
MYSQL_BIN_LOG(uint * sync_period)2527 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
2528   :bytes_written(0), file_id(1), open_count(1),
2529    sync_period_ptr(sync_period), sync_counter(0),
2530    m_prep_xids(0),
2531    is_relay_log(0), signal_cnt(0),
2532    checksum_alg_reset(BINLOG_CHECKSUM_ALG_UNDEF),
2533    relay_log_checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
2534    previous_gtid_set(0)
2535 {
2536   /*
2537     We don't want to initialize locks here as such initialization depends on
2538     safe_mutex (when using safe_mutex) which depends on MY_INIT(), which is
2539     called only in main(). Doing initialization here would make it happen
2540     before main().
2541   */
2542   index_file_name[0] = 0;
2543   memset(&index_file, 0, sizeof(index_file));
2544   memset(&purge_index_file, 0, sizeof(purge_index_file));
2545   memset(&crash_safe_index_file, 0, sizeof(crash_safe_index_file));
2546 }
2547 
2548 
2549 /* this is called only once */
2550 
cleanup()2551 void MYSQL_BIN_LOG::cleanup()
2552 {
2553   DBUG_ENTER("cleanup");
2554   if (inited)
2555   {
2556     inited= 0;
2557     close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT, true /*need_lock_log=true*/,
2558           true /*need_lock_index=true*/);
2559     mysql_mutex_destroy(&LOCK_log);
2560     mysql_mutex_destroy(&LOCK_index);
2561     mysql_mutex_destroy(&LOCK_commit);
2562     mysql_mutex_destroy(&LOCK_sync);
2563     mysql_mutex_destroy(&LOCK_xids);
2564     mysql_cond_destroy(&update_cond);
2565     my_atomic_rwlock_destroy(&m_prep_xids_lock);
2566     mysql_cond_destroy(&m_prep_xids_cond);
2567     stage_manager.deinit();
2568   }
2569   DBUG_VOID_RETURN;
2570 }
2571 
2572 
init_pthread_objects()2573 void MYSQL_BIN_LOG::init_pthread_objects()
2574 {
2575   MYSQL_LOG::init_pthread_objects();
2576   mysql_mutex_init(m_key_LOCK_index, &LOCK_index, MY_MUTEX_INIT_SLOW);
2577   mysql_mutex_init(m_key_LOCK_commit, &LOCK_commit, MY_MUTEX_INIT_FAST);
2578   mysql_mutex_init(m_key_LOCK_sync, &LOCK_sync, MY_MUTEX_INIT_FAST);
2579   mysql_mutex_init(m_key_LOCK_xids, &LOCK_xids, MY_MUTEX_INIT_FAST);
2580   mysql_cond_init(m_key_update_cond, &update_cond, 0);
2581   my_atomic_rwlock_init(&m_prep_xids_lock);
2582   mysql_cond_init(m_key_prep_xids_cond, &m_prep_xids_cond, NULL);
2583   stage_manager.init(
2584 #ifdef HAVE_PSI_INTERFACE
2585                    m_key_LOCK_flush_queue,
2586                    m_key_LOCK_sync_queue,
2587                    m_key_LOCK_commit_queue,
2588                    m_key_LOCK_done, m_key_COND_done
2589 #endif
2590                    );
2591 }
2592 
open_index_file(const char * index_file_name_arg,const char * log_name,bool need_lock_index)2593 bool MYSQL_BIN_LOG::open_index_file(const char *index_file_name_arg,
2594                                     const char *log_name, bool need_lock_index)
2595 {
2596   bool error= false;
2597   File index_file_nr= -1;
2598 
2599   if (need_lock_index)
2600     mysql_mutex_lock(&LOCK_index);
2601   else
2602     mysql_mutex_assert_owner(&LOCK_index);
2603 
2604   /*
2605     First open of this class instance
2606     Create an index file that will hold all file names uses for logging.
2607     Add new entries to the end of it.
2608   */
2609   myf opt= MY_UNPACK_FILENAME;
2610 
2611   if (my_b_inited(&index_file))
2612     goto end;
2613 
2614   if (!index_file_name_arg)
2615   {
2616     index_file_name_arg= log_name;    // Use same basename for index file
2617     opt= MY_UNPACK_FILENAME | MY_REPLACE_EXT;
2618   }
2619   fn_format(index_file_name, index_file_name_arg, mysql_data_home,
2620             ".index", opt);
2621 
2622   if (set_crash_safe_index_file_name(index_file_name_arg))
2623   {
2624     sql_print_error("MYSQL_BIN_LOG::set_crash_safe_index_file_name failed.");
2625     error= true;
2626     goto end;
2627   }
2628 
2629   /*
2630     We need move crash_safe_index_file to index_file if the index_file
2631     does not exist and crash_safe_index_file exists when mysqld server
2632     restarts.
2633   */
2634   if (my_access(index_file_name, F_OK) &&
2635       !my_access(crash_safe_index_file_name, F_OK) &&
2636       my_rename(crash_safe_index_file_name, index_file_name, MYF(MY_WME)))
2637   {
2638     sql_print_error("MYSQL_BIN_LOG::open_index_file failed to "
2639                     "move crash_safe_index_file to index file.");
2640     error= true;
2641     goto end;
2642   }
2643 
2644   if ((index_file_nr= mysql_file_open(m_key_file_log_index,
2645                                       index_file_name,
2646                                       O_RDWR | O_CREAT | O_BINARY,
2647                                       MYF(MY_WME))) < 0 ||
2648        mysql_file_sync(index_file_nr, MYF(MY_WME)) ||
2649        init_io_cache(&index_file, index_file_nr,
2650                      IO_SIZE, READ_CACHE,
2651                      mysql_file_seek(index_file_nr, 0L, MY_SEEK_END, MYF(0)),
2652                                      0, MYF(MY_WME | MY_WAIT_IF_FULL)) ||
2653       DBUG_EVALUATE_IF("fault_injection_openning_index", 1, 0))
2654   {
2655     /*
2656       TODO: all operations creating/deleting the index file or a log, should
2657       call my_sync_dir() or my_sync_dir_by_file() to be durable.
2658       TODO: file creation should be done with mysql_file_create()
2659       not mysql_file_open().
2660     */
2661     if (index_file_nr >= 0)
2662       mysql_file_close(index_file_nr, MYF(0));
2663     error= true;
2664     goto end;
2665   }
2666 
2667 #ifdef HAVE_REPLICATION
  /*
    Sync the index by purging any binary log file that is not registered.
    In other words, either purge binary log files that were removed from
    the index but not purged from the file system due to a crash, or purge
    any binary log file that was created but not registered in the index
    due to a crash.
  */
2675 
2676   if (set_purge_index_file_name(index_file_name_arg) ||
2677       open_purge_index_file(FALSE) ||
2678       purge_index_entry(NULL, NULL, false) ||
2679       close_purge_index_file() ||
2680       DBUG_EVALUATE_IF("fault_injection_recovering_index", 1, 0))
2681   {
2682     sql_print_error("MYSQL_BIN_LOG::open_index_file failed to sync the index "
2683                     "file.");
2684     error= TRUE;
2685     goto end;
2686   }
2687 #endif
2688 end:
2689   if (need_lock_index)
2690     mysql_mutex_unlock(&LOCK_index);
2691   return error;
2692 }
2693 
2694 
/**
  Reads GTIDs from the given binlog file.

  The file is opened with open_binlog_file() and events are read one
  by one, starting right after the binlog magic header, until the
  requested information has been gathered, an error is detected, or
  no further event can be read.

  @param filename File to read from.
  @param all_gtids If not NULL, then the GTIDs from the
  Previous_gtids_log_event and from all Gtid_log_events are stored in
  this object.
  @param prev_gtids If not NULL, then the GTIDs from the
  Previous_gtids_log_events are stored in this object.
  @param first_gtid If not NULL, then the first GTID information from the
  file will be stored in this object.
  @param last_gtid If not NULL, then the last GTID information from the
  file will be stored in this object.
  @param sid_map The sid_map object to use in the rpl_sidno generation
  of the Gtid_log_event. If lock is needed in the sid_map, the caller
  must hold it.
  @param verify_checksum Set to true to verify event checksums.

  @retval GOT_GTIDS The file was successfully read and it contains
  both Gtid_log_events and Previous_gtids_log_events.
  @retval GOT_PREVIOUS_GTIDS The file was successfully read and it
  contains Previous_gtids_log_events but no Gtid_log_events.
  @retval NO_GTIDS The file was successfully read and it does not
  contain GTID events.
  @retval ERROR Out of memory, or the file contains GTID events
  when GTID_MODE = OFF, or the file is malformed (e.g., contains
  Gtid_log_events but no Previous_gtids_log_event).
  NOTE(review): for a Previous_gtids_log_event found while
  gtid_mode == 0 the code calls my_error() and assigns ERROR, but that
  value is overwritten by GOT_PREVIOUS_GTIDS in the next statement, so
  this case does not currently produce an ERROR return -- confirm which
  behavior is intended.
  @retval TRUNCATED The file was truncated before the end of the
  first Previous_gtids_log_event.
  NOTE(review): in the code below TRUNCATED is returned only when
  open_binlog_file() fails.
*/
enum enum_read_gtids_from_binlog_status
{ GOT_GTIDS, GOT_PREVIOUS_GTIDS, NO_GTIDS, ERROR, TRUNCATED };
static enum_read_gtids_from_binlog_status
read_gtids_from_binlog(const char *filename, Gtid_set *all_gtids,
                       Gtid_set *prev_gtids, Gtid *first_gtid,
                       Gtid *last_gtid,
                       Sid_map* sid_map,
                       bool verify_checksum)
{
  DBUG_ENTER("read_gtids_from_binlog");
  DBUG_PRINT("info", ("Opening file %s", filename));

  /*
    Create a Format_description_log_event that is used to read the
    first event of the log.
    fd_ev_p always points at the description event currently used for
    decoding: initially the stack object fd_ev, later whichever
    Format_description_log_event was last read from the file.
  */
  Format_description_log_event fd_ev(BINLOG_VERSION), *fd_ev_p= &fd_ev;
  if (!fd_ev.is_valid())
    DBUG_RETURN(ERROR);

  File file;
  IO_CACHE log;

  /*
    We assert here that both all_gtids and prev_gtids, if specified,
    uses the same sid_map as the one passed as a parameter. This is just
    to ensure that, if the sid_map needed some lock and was locked by
    the caller, the lock applies to all the GTID sets this function is
    dealing with.
  */
#ifndef DBUG_OFF
  if (all_gtids)
    DBUG_ASSERT(all_gtids->get_sid_map() == sid_map);
  if (prev_gtids)
    DBUG_ASSERT(prev_gtids->get_sid_map() == sid_map);
#endif

  const char *errmsg= NULL;
  if ((file= open_binlog_file(&log, filename, &errmsg)) < 0)
  {
    sql_print_error("%s", errmsg);
    /*
      We need to revisit the recovery procedure for relay log
      files. Currently, it is called after this routine.
      /Alfranio
    */
    // A file that cannot be opened is reported to the caller as TRUNCATED.
    DBUG_RETURN(TRUNCATED);
  }

  /*
    Seek for Previous_gtids_log_event and Gtid_log_event events to
    gather information what has been processed so far.
  */
  my_b_seek(&log, BIN_LOG_HEADER_SIZE);
  Log_event *ev= NULL;
  enum_read_gtids_from_binlog_status ret= NO_GTIDS;
  bool done= false;
  bool seen_first_gtid= false;
  // Read until a case below sets 'done' or no further event can be read.
  while (!done &&
         (ev= Log_event::read_log_event(&log, 0, fd_ev_p, verify_checksum)) !=
         NULL)
  {
    DBUG_PRINT("info", ("Read event of type %s", ev->get_type_str()));
    switch (ev->get_type_code())
    {
    case FORMAT_DESCRIPTION_EVENT:
      /*
        The event just read becomes the current description event. The
        previous one is freed unless it is the stack object fd_ev.
      */
      if (fd_ev_p != &fd_ev)
        delete fd_ev_p;
      fd_ev_p= (Format_description_log_event *)ev;
      break;
    case ROTATE_EVENT:
      // do nothing; just accept this event and go to next
      break;
    case PREVIOUS_GTIDS_LOG_EVENT:
    {
      if (gtid_mode == 0)
      {
        my_error(ER_FOUND_GTID_EVENT_WHEN_GTID_MODE_IS_OFF, MYF(0));
        /*
          NOTE(review): this assignment is overwritten by the
          unconditional assignment right after this block, so only the
          my_error() call has an effect here.
        */
        ret= ERROR;
      }
      ret= GOT_PREVIOUS_GTIDS;
      // add events to sets
      Previous_gtids_log_event *prev_gtids_ev=
        (Previous_gtids_log_event *)ev;
      if (all_gtids != NULL && prev_gtids_ev->add_to_set(all_gtids) != 0)
        ret= ERROR, done= true;
      else if (prev_gtids != NULL && prev_gtids_ev->add_to_set(prev_gtids) != 0)
        ret= ERROR, done= true;
#ifndef DBUG_OFF
      // The string returned by get_str() is released with my_free() below.
      char* prev_buffer= prev_gtids_ev->get_str(NULL, NULL);
      DBUG_PRINT("info", ("Got Previous_gtids from file '%s': Gtid_set='%s'.",
                          filename, prev_buffer));
      my_free(prev_buffer);
#endif
      break;
    }
    case GTID_LOG_EVENT:
    {
      DBUG_EXECUTE_IF("inject_fault_bug16502579", {
                      DBUG_PRINT("debug", ("GTID_LOG_EVENT found. Injected ret=NO_GTIDS."));
                      ret=NO_GTIDS;
                      });
      /*
        The first Gtid_log_event is only valid if a
        Previous_gtids_log_event was seen before it.
      */
      if (ret != GOT_GTIDS)
      {
        if (ret != GOT_PREVIOUS_GTIDS)
        {
          /*
            Since this routine is run on startup, there may not be a
            THD instance. Therefore, ER(X) cannot be used.
           */
          const char* msg_fmt= (current_thd != NULL) ?
                               ER(ER_BINLOG_LOGICAL_CORRUPTION) :
                               ER_DEFAULT(ER_BINLOG_LOGICAL_CORRUPTION);
          my_printf_error(ER_BINLOG_LOGICAL_CORRUPTION,
                          msg_fmt, MYF(0),
                          filename,
                          "The first global transaction identifier was read, but "
                          "no other information regarding identifiers existing "
                          "on the previous log files was found.");
          ret= ERROR, done= true;
          break;
        }
        else
          ret= GOT_GTIDS;
      }
      /*
        When all_gtids, first_gtid and last_gtid are all NULL,
        we just check if the binary log contains at least one Gtid_log_event,
        so that we can distinguish the return values GOT_GTID and
        GOT_PREVIOUS_GTIDS. We don't need to read anything else from the
        binary log.
        If all_gtids or last_gtid is requested (i.e., NOT NULL), we should
        continue to read all gtids.
        If just first_gtid was requested, we will be done after storing this
        Gtid_log_event info on it.
      */
      if (all_gtids == NULL && first_gtid == NULL && last_gtid == NULL)
      {
        ret= GOT_GTIDS, done= true;
      }
      else
      {
        Gtid_log_event *gtid_ev= (Gtid_log_event *)ev;
        rpl_sidno sidno= gtid_ev->get_sidno(sid_map);
        // A negative sidno is treated as an error.
        if (sidno < 0)
          ret= ERROR, done= true;
        else
        {
          if (all_gtids)
          {
            if (all_gtids->ensure_sidno(sidno) != RETURN_STATUS_OK)
              ret= ERROR, done= true;
            else if (all_gtids->_add_gtid(sidno, gtid_ev->get_gno()) !=
                     RETURN_STATUS_OK)
              ret= ERROR, done= true;
            DBUG_PRINT("info", ("Got Gtid from file '%s': Gtid(%d, %lld).",
                                filename, sidno, gtid_ev->get_gno()));
          }

          /* If the first GTID was requested, stores it */
          if (first_gtid && !seen_first_gtid)
          {
            first_gtid->set(sidno, gtid_ev->get_gno());
            seen_first_gtid= true;
            /* If the first_gtid was the only thing requested, we are done */
            if (all_gtids == NULL && last_gtid == NULL)
              ret= GOT_GTIDS, done= true;
          }

          // Overwritten on every Gtid_log_event, so the last one read wins.
          if (last_gtid)
            last_gtid->set(sidno, gtid_ev->get_gno());
        }
      }
      break;
    }
    case ANONYMOUS_GTID_LOG_EVENT:
    default:
      // if we found any other event type without finding a
      // previous_gtids_log_event, then the rest of this binlog
      // cannot contain gtids
      if (ret != GOT_GTIDS && ret != GOT_PREVIOUS_GTIDS)
        done= true;
      break;
    }
    /*
      Free the event unless it has just become the current description
      event (see the FORMAT_DESCRIPTION_EVENT case).
    */
    if (ev != fd_ev_p)
      delete ev;
    DBUG_PRINT("info", ("done=%d", done));
  }

  if (log.error < 0)
  {
    // This is not a fatal error; the log may just be truncated.

    // @todo but what other errors could happen? IO error?
    sql_print_warning("Error reading GTIDs from binary log: %d", log.error);
  }

  // Release a description event that was read from the file, if any.
  if (fd_ev_p != &fd_ev)
  {
    delete fd_ev_p;
    fd_ev_p= &fd_ev;
  }

  mysql_file_close(file, MYF(MY_WME));
  end_io_cache(&log);

  DBUG_PRINT("info", ("returning %d", ret));
  DBUG_RETURN(ret);
}
2934 
find_first_log_not_in_gtid_set(char * binlog_file_name,const Gtid_set * gtid_set,Gtid * first_gtid,const char ** errmsg)2935 bool MYSQL_BIN_LOG::find_first_log_not_in_gtid_set(char *binlog_file_name,
2936                                                    const Gtid_set *gtid_set,
2937                                                    Gtid *first_gtid,
2938                                                    const char **errmsg)
2939 {
2940   DBUG_ENTER("MYSQL_BIN_LOG::gtid_read_start_binlog");
2941   /*
2942     Gather the set of files to be accessed.
2943   */
2944   list<string> filename_list;
2945   LOG_INFO linfo;
2946   int error;
2947 
2948   list<string>::reverse_iterator rit;
2949   Gtid_set previous_gtid_set(gtid_set->get_sid_map());
2950 
2951   mysql_mutex_lock(&LOCK_index);
2952   for (error= find_log_pos(&linfo, NULL, false/*need_lock_index=false*/);
2953        !error; error= find_next_log(&linfo, false/*need_lock_index=false*/))
2954   {
2955     DBUG_PRINT("info", ("read log filename '%s'", linfo.log_file_name));
2956     filename_list.push_back(string(linfo.log_file_name));
2957   }
2958   mysql_mutex_unlock(&LOCK_index);
2959   if (error != LOG_INFO_EOF)
2960   {
2961     *errmsg= "Failed to read the binary log index file while "
2962       "looking for the oldest binary log that contains any GTID "
2963       "that is not in the given gtid set";
2964     error= -1;
2965     goto end;
2966   }
2967 
2968   if (filename_list.empty())
2969   {
2970     *errmsg= "Could not find first log file name in binary log index file "
2971       "while looking for the oldest binary log that contains any GTID "
2972       "that is not in the given gtid set";
2973     error= -2;
2974     goto end;
2975   }
2976 
2977   /*
2978     Iterate over all the binary logs in reverse order, and read only
2979     the Previous_gtids_log_event, to find the first one, that is the
2980     subset of the given gtid set. Since every binary log begins with
2981     a Previous_gtids_log_event, that contains all GTIDs in all
2982     previous binary logs.
2983     We also ask for the first GTID in the binary log to know if we
2984     should send the FD event with the "created" field cleared or not.
2985   */
2986   DBUG_PRINT("info", ("Iterating backwards through binary logs, and reading "
2987                       "only the Previous_gtids_log_event, to find the first "
2988                       "one, that is the subset of the given gtid set."));
2989   rit= filename_list.rbegin();
2990   error= 0;
2991   while (rit != filename_list.rend())
2992   {
2993     previous_gtid_set.clear();
2994     const char *filename= rit->c_str();
2995     DBUG_PRINT("info", ("Read Previous_gtids_log_event from filename='%s'",
2996                         filename));
2997     switch (read_gtids_from_binlog(filename, NULL, &previous_gtid_set,
2998                                    first_gtid, NULL/* last_gtid */,
2999                                    previous_gtid_set.get_sid_map(),
3000                                    opt_master_verify_checksum))
3001     {
3002     case ERROR:
3003       *errmsg= "Error reading header of binary log while looking for "
3004         "the oldest binary log that contains any GTID that is not in "
3005         "the given gtid set";
3006       error= -3;
3007       goto end;
3008     case NO_GTIDS:
3009       *errmsg= "Found old binary log without GTIDs while looking for "
3010         "the oldest binary log that contains any GTID that is not in "
3011         "the given gtid set";
3012       error= -4;
3013       goto end;
3014     case GOT_GTIDS:
3015     case GOT_PREVIOUS_GTIDS:
3016       if (previous_gtid_set.is_subset(gtid_set))
3017       {
3018         strcpy(binlog_file_name, filename);
3019         /*
3020           Verify that the selected binlog is not the first binlog,
3021         */
3022         DBUG_EXECUTE_IF("slave_reconnect_with_gtid_set_executed",
3023                         DBUG_ASSERT(strcmp(filename_list.begin()->c_str(),
3024                                            binlog_file_name) != 0););
3025         goto end;
3026       }
3027     case TRUNCATED:
3028       break;
3029     }
3030 
3031     rit++;
3032   }
3033 
3034   if (rit == filename_list.rend())
3035   {
3036     report_missing_gtids(&previous_gtid_set, gtid_set, errmsg);
3037     error= -5;
3038   }
3039 
3040 end:
3041   if (error)
3042     DBUG_PRINT("error", ("'%s'", *errmsg));
3043   filename_list.clear();
3044   DBUG_PRINT("info", ("returning %d", error));
3045   DBUG_RETURN(error != 0 ? true : false);
3046 }
3047 
init_gtid_sets(Gtid_set * all_gtids,Gtid_set * lost_gtids,Gtid * last_gtid,bool verify_checksum,bool need_lock,bool is_server_starting)3048 bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
3049                                    Gtid *last_gtid, bool verify_checksum,
3050                                    bool need_lock, bool is_server_starting)
3051 {
3052   DBUG_ENTER("MYSQL_BIN_LOG::init_gtid_sets");
3053   DBUG_PRINT("info", ("lost_gtids=%p; so we are recovering a %s log",
3054                       lost_gtids, lost_gtids == NULL ? "relay" : "binary"));
3055 
3056   /*
3057     Acquires the necessary locks to ensure that logs are not either
3058     removed or updated when we are reading from it.
3059   */
3060   if (need_lock)
3061   {
3062     // We don't need LOCK_log if we are only going to read the initial
3063     // Prevoius_gtids_log_event and ignore the Gtid_log_events.
3064     if (all_gtids != NULL)
3065       mysql_mutex_lock(&LOCK_log);
3066     mysql_mutex_lock(&LOCK_index);
3067     global_sid_lock->wrlock();
3068   }
3069   else
3070   {
3071     if (all_gtids != NULL)
3072       mysql_mutex_assert_owner(&LOCK_log);
3073     mysql_mutex_assert_owner(&LOCK_index);
3074     global_sid_lock->assert_some_wrlock();
3075   }
3076 
3077   // Gather the set of files to be accessed.
3078   list<string> filename_list;
3079   LOG_INFO linfo;
3080   int error;
3081 
3082   list<string>::iterator it;
3083   list<string>::reverse_iterator rit;
3084   bool reached_first_file= false;
3085 
3086   /* Initialize the sid_map to be used in read_gtids_from_binlog */
3087   Sid_map *sid_map= NULL;
3088   if (all_gtids)
3089     sid_map= all_gtids->get_sid_map();
3090   else if (lost_gtids)
3091     sid_map= lost_gtids->get_sid_map();
3092 
3093   for (error= find_log_pos(&linfo, NULL, false/*need_lock_index=false*/); !error;
3094        error= find_next_log(&linfo, false/*need_lock_index=false*/))
3095   {
3096     DBUG_PRINT("info", ("read log filename '%s'", linfo.log_file_name));
3097     filename_list.push_back(string(linfo.log_file_name));
3098   }
3099   if (error != LOG_INFO_EOF)
3100   {
3101     DBUG_PRINT("error", ("Error reading binlog index"));
3102     goto end;
3103   }
3104   /*
3105     On server starting, one new empty binlog file is created and
3106     its file name is put into index file before initializing
3107     GLOBAL.GTID_EXECUTED AND GLOBAL.GTID_PURGED, it is not the
3108     last binlog file before the server restarts, so we remove
3109     its file name from filename_list.
3110   */
3111   if (is_server_starting && !is_relay_log && !filename_list.empty())
3112     filename_list.pop_back();
3113 
3114   error= 0;
3115 
3116   if (all_gtids != NULL)
3117   {
3118     DBUG_PRINT("info", ("Iterating backwards through binary logs, looking for the last binary log that contains a Previous_gtids_log_event."));
3119     // Iterate over all files in reverse order until we find one that
3120     // contains a Previous_gtids_log_event.
3121     rit= filename_list.rbegin();
3122     bool got_gtids= false;
3123     reached_first_file= (rit == filename_list.rend());
3124     DBUG_PRINT("info", ("filename='%s' reached_first_file=%d",
3125                         rit->c_str(), reached_first_file));
3126     while ((!got_gtids || (last_gtid && last_gtid->empty()))
3127            && !reached_first_file)
3128     {
3129       const char *filename= rit->c_str();
3130       rit++;
3131       reached_first_file= (rit == filename_list.rend());
3132       DBUG_PRINT("info", ("filename='%s' got_gtids=%d reached_first_file=%d",
3133                           filename, got_gtids, reached_first_file));
3134       switch (read_gtids_from_binlog(filename, got_gtids ? NULL : all_gtids,
3135                                      reached_first_file ? lost_gtids : NULL,
3136                                      NULL/* first_gtid */, last_gtid,
3137                                      sid_map, verify_checksum))
3138       {
3139         case ERROR:
3140         {
3141           error= 1;
3142           goto end;
3143         }
3144         case GOT_GTIDS:
3145         case GOT_PREVIOUS_GTIDS:
3146         {
3147           got_gtids= true;
3148           break;
3149         }
3150         case NO_GTIDS:
3151         {
3152           /*
3153             If the binlog_gtid_simple_recovery is enabled, and the
3154             last binary log does not contain any GTID event, do not
3155             read any more binary logs, GLOBAL.GTID_EXECUTED and
3156             GLOBAL.GTID_PURGED should be empty in the case. Otherwise,
3157             initialize GTID_EXECUTED as usual.
3158           */
3159           if (binlog_gtid_simple_recovery && !is_relay_log)
3160           {
3161             DBUG_ASSERT(all_gtids->is_empty() && lost_gtids->is_empty());
3162             goto end;
3163           }
3164           /*FALLTHROUGH*/
3165         }
3166         case TRUNCATED:
3167         {
3168           break;
3169         }
3170       }
3171     }
3172   }
3173   if (lost_gtids != NULL && !reached_first_file)
3174   {
3175     DBUG_PRINT("info", ("Iterating forwards through binary logs, looking for the first binary log that contains a Previous_gtids_log_event."));
3176     for (it= filename_list.begin(); it != filename_list.end(); it++)
3177     {
3178       const char *filename= it->c_str();
3179       DBUG_PRINT("info", ("filename='%s'", filename));
3180       switch (read_gtids_from_binlog(filename, NULL, lost_gtids,
3181                                      NULL/* first_gtid */, NULL/* last_gtid */,
3182                                      sid_map, verify_checksum))
3183       {
3184         case ERROR:
3185         {
3186           error= 1;
3187           /*FALLTHROUGH*/
3188         }
3189         case GOT_GTIDS:
3190         {
3191           goto end;
3192         }
3193         case NO_GTIDS:
3194         {
3195           /*
3196             If the binlog_gtid_simple_recovery is enabled, and the
3197             first binary log does not contain any GTID event, do not
3198             read any more binary logs, GLOBAL.GTID_PURGED should be
3199             empty in the case.
3200           */
3201           if (binlog_gtid_simple_recovery && !is_relay_log)
3202           {
3203             DBUG_ASSERT(lost_gtids->is_empty());
3204             goto end;
3205           }
3206           /*FALLTHROUGH*/
3207         }
3208         case GOT_PREVIOUS_GTIDS:
3209         case TRUNCATED:
3210         {
3211           break;
3212         }
3213       }
3214     }
3215   }
3216 end:
3217   if (all_gtids)
3218     all_gtids->dbug_print("all_gtids");
3219   if (lost_gtids)
3220     lost_gtids->dbug_print("lost_gtids");
3221   if (need_lock)
3222   {
3223     global_sid_lock->unlock();
3224     mysql_mutex_unlock(&LOCK_index);
3225     if (all_gtids != NULL)
3226       mysql_mutex_unlock(&LOCK_log);
3227   }
3228   filename_list.clear();
3229   DBUG_PRINT("info", ("returning %d", error));
3230   DBUG_RETURN(error != 0 ? true : false);
3231 }
3232 
3233 
/**
  Open a (new) binlog file.

  - Generate the log file name, register it in the purge index file
  (when built with HAVE_REPLICATION), open the log file, write the
  header events to it, and add the file name to the index file if the
  log file was empty when opened.
  - When calling this while the file is in use, you must hold locks
  on LOCK_log and LOCK_index.

  @param log_name Passed to init_and_set_log_file_name() and
  MYSQL_LOG::open().
  @param new_name Passed to init_and_set_log_file_name() and
  MYSQL_LOG::open(); if not NULL it is also the name reported in the
  error message on failure.
  @param io_cache_type_arg Passed to init_and_set_log_file_name() and
  MYSQL_LOG::open().
  @param max_size_arg Stored in max_size.
  @param null_created_arg Copied into the dont_set_created member of the
  Format_description_log_event that is written.
  @param need_lock_log Forwarded to close() on the error path.
  @param need_lock_index Forwarded to add_log_to_index(), and to
  purge_index_entry() and close() on the error path.
  @param need_sid_lock If true, the global_sid_lock write lock is taken
  while the Previous_gtids_log_event is built; otherwise it is asserted
  to be held.
  @param extra_description_event If not NULL and its binlog_version is
  at least 4, it is also written to the log (see the comment in the
  body).

  @retval
    0   ok
  @retval
    1   error
*/

bool MYSQL_BIN_LOG::open_binlog(const char *log_name,
                                const char *new_name,
                                enum cache_type io_cache_type_arg,
                                ulong max_size_arg,
                                bool null_created_arg,
                                bool need_lock_log,
                                bool need_lock_index,
                                bool need_sid_lock,
                                Format_description_log_event *extra_description_event)
{

  // lock_index must be acquired *before* sid_lock.
  // Hence a caller that asks for LOCK_index to be taken here must also
  // let this function take the sid lock.
  DBUG_ASSERT(need_sid_lock || !need_lock_index);
  DBUG_ENTER("MYSQL_BIN_LOG::open_binlog(const char *, ...)");
  DBUG_PRINT("enter",("name: %s", log_name));

  if (init_and_set_log_file_name(log_name, new_name, LOG_BIN,
                                 io_cache_type_arg))
  {
    sql_print_error("MYSQL_BIN_LOG::open failed to generate new file name.");
    DBUG_RETURN(1);
  }

#ifdef HAVE_REPLICATION
  /*
    Register the new log file name in the purge index file and sync it
    before the log file itself is opened.
  */
  if (open_purge_index_file(TRUE) ||
      register_create_index_entry(log_file_name) ||
      sync_purge_index_file() ||
      DBUG_EVALUATE_IF("fault_injection_registering_index", 1, 0))
  {
    /**
      @todo: although this was introduced to appease valgrind
      when injecting emulated faults using fault_injection_registering_index
      it may be good to consider what actually happens when
      open_purge_index_file succeeds but register or sync fails.

      Perhaps we might need the code below in MYSQL_LOG_BIN::cleanup
      for "real life" purposes as well?
    */
    DBUG_EXECUTE_IF("fault_injection_registering_index", {
      if (my_b_inited(&purge_index_file))
      {
        end_io_cache(&purge_index_file);
        my_close(purge_index_file.file, MYF(0));
      }
    });

    sql_print_error("MYSQL_BIN_LOG::open failed to sync the index file.");
    DBUG_RETURN(1);
  }
  DBUG_EXECUTE_IF("crash_create_non_critical_before_update_index", DBUG_SUICIDE(););
#endif

  write_error= 0;

  /* open the main log file */
  if (MYSQL_LOG::open(
#ifdef HAVE_PSI_INTERFACE
                      m_key_file_log,
#endif
                      log_name, LOG_BIN, new_name, io_cache_type_arg))
  {
#ifdef HAVE_REPLICATION
    close_purge_index_file();
#endif
    DBUG_RETURN(1);                            /* all warnings issued */
  }

  max_size= max_size_arg;

  open_count++;

  /* Set below only when the log file was empty when opened. */
  bool write_file_name_to_index_file=0;

  /* This must be before goto err. */
  Format_description_log_event s(BINLOG_VERSION);

  if (!my_b_filelength(&log_file))
  {
    /*
      The binary log file was empty (probably newly created)
      This is the normal case and happens when the user doesn't specify
      an extension for the binary log files.
      In this case we write a standard header to it.
    */
    if (my_b_safe_write(&log_file, (uchar*) BINLOG_MAGIC,
                        BIN_LOG_HEADER_SIZE))
      goto err;
    bytes_written+= BIN_LOG_HEADER_SIZE;
    write_file_name_to_index_file= 1;
  }

  /*
    don't set LOG_EVENT_BINLOG_IN_USE_F for SEQ_READ_APPEND io_cache
    as we won't be able to reset it later
  */
  if (io_cache_type == WRITE_CACHE)
    s.flags |= LOG_EVENT_BINLOG_IN_USE_F;
  /*
    Choose the checksum algorithm of the description event. For a relay
    log the chosen value is also stored back into relay_log_checksum_alg.
  */
  s.checksum_alg= is_relay_log ?
    /* relay-log */
    /* inherit master's A descriptor if one has been received */
    (relay_log_checksum_alg=
     (relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF) ?
     relay_log_checksum_alg :
     /* otherwise use slave's local preference of RL events verification */
     (opt_slave_sql_verify_checksum == 0) ?
     (uint8) BINLOG_CHECKSUM_ALG_OFF : binlog_checksum_options):
    /* binlog */
    binlog_checksum_options;
  DBUG_ASSERT(s.checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
  if (!s.is_valid())
    goto err;
  s.dont_set_created= null_created_arg;
  /* Set LOG_EVENT_RELAY_LOG_F flag for relay log's FD */
  if (is_relay_log)
    s.set_relay_log_event();
  if (s.write(&log_file))
    goto err;
  bytes_written+= s.data_written;
  /*
    We need to revisit this code and improve it.
    See further comments in the mysqld.
    /Alfranio
  */
  if (current_thd && gtid_mode > 0)
  {
    /*
      The sid lock is held (taken here or by the caller) only while the
      Previous_gtids_log_event is constructed from previous_gtid_set; it
      is released before the event is written to the log.
    */
    if (need_sid_lock)
      global_sid_lock->wrlock();
    else
      global_sid_lock->assert_some_wrlock();
    Previous_gtids_log_event prev_gtids_ev(previous_gtid_set);
    if (is_relay_log)
      prev_gtids_ev.set_relay_log_event();
    if (need_sid_lock)
      global_sid_lock->unlock();
    prev_gtids_ev.checksum_alg= s.checksum_alg;
    if (prev_gtids_ev.write(&log_file))
      goto err;
    bytes_written+= prev_gtids_ev.data_written;
  }
  if (extra_description_event &&
      extra_description_event->binlog_version>=4)
  {
    /*
      This is a relay log written to by the I/O slave thread.
      Write the event so that others can later know the format of this relay
      log.
      Note that this event is very close to the original event from the
      master (it has binlog version of the master, event types of the
      master), so this is suitable to parse the next relay log's event. It
      has been produced by
      Format_description_log_event::Format_description_log_event(char* buf,).
      Why don't we want to write the mi_description_event if this
      event is for format<4 (3.23 or 4.x): this is because in that case, the
      mi_description_event describes the data received from the
      master, but not the data written to the relay log (*conversion*),
      which is in format 4 (slave's).
    */
    /*
      Set 'created' to 0, so that in next relay logs this event does not
      trigger cleaning actions on the slave in
      Format_description_log_event::apply_event_impl().
    */
    extra_description_event->created= 0;
    /* Don't set log_pos in event header */
    extra_description_event->set_artificial_event();

    if (extra_description_event->write(&log_file))
      goto err;
    bytes_written+= extra_description_event->data_written;
  }
  /* Flush and sync the header events before touching the index file. */
  if (flush_io_cache(&log_file) ||
      mysql_file_sync(log_file.file, MYF(MY_WME)))
    goto err;

  if (write_file_name_to_index_file)
  {
#ifdef HAVE_REPLICATION
    DBUG_EXECUTE_IF("crash_create_critical_before_update_index", DBUG_SUICIDE(););
#endif

    DBUG_ASSERT(my_b_inited(&index_file) != 0);

    /*
      The new log file name is appended into crash safe index file after
      all the content of index file is copied into the crash safe index
      file. Then move the crash safe index file to index file.
    */
    DBUG_EXECUTE_IF("simulate_disk_full_on_open_binlog",
                    {DBUG_SET("+d,simulate_no_free_space_error");});
    if (DBUG_EVALUATE_IF("fault_injection_updating_index", 1, 0) ||
        add_log_to_index((uchar*) log_file_name, strlen(log_file_name),
                         need_lock_index))
    {
      DBUG_EXECUTE_IF("simulate_disk_full_on_open_binlog",
                      {
                        DBUG_SET("-d,simulate_file_write_error");
                        DBUG_SET("-d,simulate_no_free_space_error");
                        DBUG_SET("-d,simulate_disk_full_on_open_binlog");
                      });
      goto err;
    }

#ifdef HAVE_REPLICATION
    DBUG_EXECUTE_IF("crash_create_after_update_index", DBUG_SUICIDE(););
#endif
  }

  log_state= LOG_OPENED;

#ifdef HAVE_REPLICATION
  close_purge_index_file();
#endif

  DBUG_RETURN(0);

err:
#ifdef HAVE_REPLICATION
  /*
    Process the entries registered in the purge index file (if it is
    open), then close it.
  */
  if (is_inited_purge_index_file())
    purge_index_entry(NULL, NULL, need_lock_index);
  close_purge_index_file();
#endif

  /*
    Depending on binlog_error_action, either abort the server or report
    the error and close the log together with its index.
  */
  if (binlog_error_action == ABORT_SERVER)
  {
    exec_binlog_error_action_abort("Either disk is full or file system is read "
                                   "only while opening the binlog. Aborting the"
                                   " server.");
  }
  else
  {
    sql_print_error("Could not use %s for logging (error %d). "
                    "Turning logging off for the whole duration of the MySQL "
                    "server process. To turn it on again: fix the cause, "
                    "shutdown the MySQL server and restart it.",
                    (new_name) ? new_name : name, errno);
    close(LOG_CLOSE_INDEX, need_lock_log, need_lock_index);
  }
  DBUG_RETURN(1);
}
3487 
3488 
3489 /**
3490   Move crash safe index file to index file.
3491 
3492   @param need_lock_index If true, LOCK_index will be acquired;
3493   otherwise it should already be held.
3494 
3495   @retval 0 ok
3496   @retval -1 error
3497 */
move_crash_safe_index_file_to_index_file(bool need_lock_index)3498 int MYSQL_BIN_LOG::move_crash_safe_index_file_to_index_file(bool need_lock_index)
3499 {
3500   int error= 0;
3501   File fd= -1;
3502   DBUG_ENTER("MYSQL_BIN_LOG::move_crash_safe_index_file_to_index_file");
3503   int failure_trials= MYSQL_BIN_LOG::MAX_RETRIES_FOR_DELETE_RENAME_FAILURE;
3504   bool file_rename_status= false, file_delete_status= false;
3505   THD *thd= current_thd;
3506 
3507   if (need_lock_index)
3508     mysql_mutex_lock(&LOCK_index);
3509   else
3510     mysql_mutex_assert_owner(&LOCK_index);
3511 
3512   if (my_b_inited(&index_file))
3513   {
3514     end_io_cache(&index_file);
3515     if (mysql_file_close(index_file.file, MYF(0)) < 0)
3516     {
3517       error= -1;
3518       sql_print_error("While rebuilding index file %s: "
3519                       "Failed to close the index file.", index_file_name);
3520       /*
3521         Delete Crash safe index file here and recover the binlog.index
3522         state(index_file io_cache) from old binlog.index content.
3523        */
3524       mysql_file_delete(key_file_binlog_index, crash_safe_index_file_name,
3525                         MYF(0));
3526 
3527       goto recoverable_err;
3528     }
3529 
3530     /*
3531       Sometimes an outsider can lock index files for temporary viewing
3532       purpose. For eg: MEB locks binlog.index/relaylog.index to view
3533       the content of the file. During that small period of time, deletion
3534       of the file is not possible on some platforms(Eg: Windows)
3535       Server should retry the delete operation for few times instead of panicking
3536       immediately.
3537     */
3538     while ((file_delete_status == false) && (failure_trials > 0))
3539     {
3540       if (DBUG_EVALUATE_IF("force_index_file_delete_failure", 1, 0)) break;
3541 
3542       DBUG_EXECUTE_IF("simulate_index_file_delete_failure",
3543                   {
3544                     /* This simulation causes the delete to fail */
3545                     static char first_char= index_file_name[0];
3546                     index_file_name[0]= 0;
3547                     sql_print_information("Retrying delete");
3548                     if (failure_trials == 1)
3549                       index_file_name[0]= first_char;
3550                   };);
3551       file_delete_status = !(mysql_file_delete(key_file_binlog_index,
3552                                                index_file_name, MYF(MY_WME)));
3553       --failure_trials;
3554       if (!file_delete_status)
3555       {
3556         my_sleep(1000);
3557         /* Clear the error before retrying. */
3558         if (failure_trials > 0)
3559           thd->clear_error();
3560       }
3561     }
3562 
3563     if (!file_delete_status)
3564     {
3565       error= -1;
3566       sql_print_error("While rebuilding index file %s: "
3567                       "Failed to delete the existing index file. It could be "
3568                       "that file is being used by some other process.",
3569                       index_file_name);
3570       /*
3571         Delete Crash safe file index file here and recover the binlog.index
3572         state(index_file io_cache) from old binlog.index content.
3573        */
3574       mysql_file_delete(key_file_binlog_index, crash_safe_index_file_name,
3575                         MYF(0));
3576 
3577       goto recoverable_err;
3578     }
3579   }
3580 
3581   DBUG_EXECUTE_IF("crash_create_before_rename_index_file", DBUG_SUICIDE(););
3582   /*
3583     Sometimes an outsider can lock index files for temporary viewing
3584     purpose. For eg: MEB locks binlog.index/relaylog.index to view
3585     the content of the file. During that small period of time, rename
3586     of the file is not possible on some platforms(Eg: Windows)
3587     Server should retry the rename operation for few times instead of panicking
3588     immediately.
3589   */
3590   failure_trials = MYSQL_BIN_LOG::MAX_RETRIES_FOR_DELETE_RENAME_FAILURE;
3591   while ((file_rename_status == false) && (failure_trials > 0))
3592   {
3593     DBUG_EXECUTE_IF("simulate_crash_safe_index_file_rename_failure",
3594                 {
3595                   /* This simulation causes the rename to fail */
3596                   static char first_char= index_file_name[0];
3597                   index_file_name[0]= 0;
3598                   sql_print_information("Retrying rename");
3599                   if (failure_trials == 1)
3600                     index_file_name[0]= first_char;
3601                 };);
3602     file_rename_status =
3603         !(my_rename(crash_safe_index_file_name, index_file_name, MYF(MY_WME)));
3604     --failure_trials;
3605     if (!file_rename_status)
3606     {
3607       my_sleep(1000);
3608       /* Clear the error before retrying. */
3609       if (failure_trials > 0)
3610         thd->clear_error();
3611     }
3612   }
3613   if (!file_rename_status)
3614   {
3615     error= -1;
3616     sql_print_error("While rebuilding index file %s: "
3617                     "Failed to rename the new index file to the existing "
3618                     "index file.", index_file_name);
3619     goto fatal_err;
3620   }
3621   DBUG_EXECUTE_IF("crash_create_after_rename_index_file", DBUG_SUICIDE(););
3622 
3623 recoverable_err:
3624   if ((fd= mysql_file_open(key_file_binlog_index,
3625                            index_file_name,
3626                            O_RDWR | O_CREAT | O_BINARY,
3627                            MYF(MY_WME))) < 0 ||
3628            mysql_file_sync(fd, MYF(MY_WME)) ||
3629            init_io_cache(&index_file, fd, IO_SIZE, READ_CACHE,
3630                          mysql_file_seek(fd, 0L, MY_SEEK_END, MYF(0)),
3631                                          0, MYF(MY_WME | MY_WAIT_IF_FULL)))
3632   {
3633     sql_print_error("After rebuilding the index file %s: "
3634                     "Failed to open the index file.", index_file_name);
3635     goto fatal_err;
3636   }
3637 
3638   if (need_lock_index)
3639     mysql_mutex_unlock(&LOCK_index);
3640   DBUG_RETURN(error);
3641 
3642 fatal_err:
3643   /*
3644     This situation is very very rare to happen (unless there is some serious
3645     memory related issues like OOM) and should be treated as fatal error.
3646     Hence it is better to bring down the server without respecting
3647     'binlog_error_action' value here.
3648   */
3649   exec_binlog_error_action_abort("MySQL server failed to update the "
3650                                  "binlog.index file's content properly. "
3651                                  "It might not be in sync with available "
3652                                  "binlogs and the binlog.index file state is in "
3653                                  "unrecoverable state. Aborting the server.");
3654   /*
3655     Server is aborted in the above function.
3656     This is dead code to make compiler happy.
3657    */
3658   DBUG_RETURN(error);
3659 }
3660 
3661 
3662 /**
3663   Append log file name to index file.
3664 
3665   - To make crash safe, we copy all the content of index file
3666   to crash safe index file firstly and then append the log
3667   file name to the crash safe index file. Finally move the
3668   crash safe index file to index file.
3669 
3670   @retval
3671     0   ok
3672   @retval
3673     -1   error
3674 */
add_log_to_index(uchar * log_name,int log_name_len,bool need_lock_index)3675 int MYSQL_BIN_LOG::add_log_to_index(uchar* log_name,
3676                                     int log_name_len, bool need_lock_index)
3677 {
3678   DBUG_ENTER("MYSQL_BIN_LOG::add_log_to_index");
3679 
3680   if (open_crash_safe_index_file())
3681   {
3682     sql_print_error("MYSQL_BIN_LOG::add_log_to_index failed to "
3683                     "open the crash safe index file.");
3684     goto err;
3685   }
3686 
3687   if (copy_file(&index_file, &crash_safe_index_file, 0))
3688   {
3689     sql_print_error("MYSQL_BIN_LOG::add_log_to_index failed to "
3690                     "copy index file to crash safe index file.");
3691     goto err;
3692   }
3693 
3694   if (my_b_write(&crash_safe_index_file, log_name, log_name_len) ||
3695       my_b_write(&crash_safe_index_file, (uchar*) "\n", 1) ||
3696       flush_io_cache(&crash_safe_index_file) ||
3697       mysql_file_sync(crash_safe_index_file.file, MYF(MY_WME)))
3698   {
3699     sql_print_error("MYSQL_BIN_LOG::add_log_to_index failed to "
3700                     "append log file name: %s, to crash "
3701                     "safe index file.", log_name);
3702     goto err;
3703   }
3704 
3705   if (close_crash_safe_index_file())
3706   {
3707     sql_print_error("MYSQL_BIN_LOG::add_log_to_index failed to "
3708                     "close the crash safe index file.");
3709     goto err;
3710   }
3711 
3712   if (move_crash_safe_index_file_to_index_file(need_lock_index))
3713   {
3714     sql_print_error("MYSQL_BIN_LOG::add_log_to_index failed to "
3715                     "move crash safe index file to index file.");
3716     goto err;
3717   }
3718 
3719   DBUG_RETURN(0);
3720 
3721 err:
3722   DBUG_RETURN(-1);
3723 }
3724 
get_current_log(LOG_INFO * linfo,bool need_lock_log)3725 int MYSQL_BIN_LOG::get_current_log(LOG_INFO* linfo, bool need_lock_log/*true*/)
3726 {
3727   if (need_lock_log)
3728     mysql_mutex_lock(&LOCK_log);
3729   int ret = raw_get_current_log(linfo);
3730   if (need_lock_log)
3731     mysql_mutex_unlock(&LOCK_log);
3732   return ret;
3733 }
3734 
raw_get_current_log(LOG_INFO * linfo)3735 int MYSQL_BIN_LOG::raw_get_current_log(LOG_INFO* linfo)
3736 {
3737   strmake(linfo->log_file_name, log_file_name, sizeof(linfo->log_file_name)-1);
3738   linfo->pos = my_b_safe_tell(&log_file);
3739   return 0;
3740 }
3741 
check_write_error(THD * thd)3742 bool MYSQL_BIN_LOG::check_write_error(THD *thd)
3743 {
3744   DBUG_ENTER("MYSQL_BIN_LOG::check_write_error");
3745 
3746   bool checked= FALSE;
3747 
3748   if (!thd->is_error())
3749     DBUG_RETURN(checked);
3750 
3751   switch (thd->get_stmt_da()->sql_errno())
3752   {
3753     case ER_TRANS_CACHE_FULL:
3754     case ER_STMT_CACHE_FULL:
3755     case ER_ERROR_ON_WRITE:
3756     case ER_BINLOG_LOGGING_IMPOSSIBLE:
3757       checked= TRUE;
3758     break;
3759   }
3760   DBUG_PRINT("return", ("checked: %s", YESNO(checked)));
3761   DBUG_RETURN(checked);
3762 }
3763 
set_write_error(THD * thd,bool is_transactional)3764 void MYSQL_BIN_LOG::set_write_error(THD *thd, bool is_transactional)
3765 {
3766   DBUG_ENTER("MYSQL_BIN_LOG::set_write_error");
3767 
3768   write_error= 1;
3769 
3770   if (check_write_error(thd))
3771     DBUG_VOID_RETURN;
3772 
3773   if (my_errno == EFBIG)
3774   {
3775     if (is_transactional)
3776     {
3777       my_message(ER_TRANS_CACHE_FULL, ER(ER_TRANS_CACHE_FULL), MYF(MY_WME));
3778     }
3779     else
3780     {
3781       my_message(ER_STMT_CACHE_FULL, ER(ER_STMT_CACHE_FULL), MYF(MY_WME));
3782     }
3783   }
3784   else
3785   {
3786     char errbuf[MYSYS_STRERROR_SIZE];
3787     my_error(ER_ERROR_ON_WRITE, MYF(MY_WME), name,
3788              errno, my_strerror(errbuf, sizeof(errbuf), errno));
3789   }
3790 
3791   DBUG_VOID_RETURN;
3792 }
3793 
3794 /**
3795   Find the position in the log-index-file for the given log name.
3796 
3797   @param[out] linfo The found log file name will be stored here, along
3798   with the byte offset of the next log file name in the index file.
3799   @param log_name Filename to find in the index file, or NULL if we
3800   want to read the first entry.
3801   @param need_lock_index If false, this function acquires LOCK_index;
3802   otherwise the lock should already be held by the caller.
3803 
3804   @note
3805     On systems without the truncate function the file will end with one or
3806     more empty lines.  These will be ignored when reading the file.
3807 
3808   @retval
3809     0			ok
3810   @retval
3811     LOG_INFO_EOF	        End of log-index-file found
3812   @retval
3813     LOG_INFO_IO		Got IO error while reading file
3814 */
3815 
find_log_pos(LOG_INFO * linfo,const char * log_name,bool need_lock_index)3816 int MYSQL_BIN_LOG::find_log_pos(LOG_INFO *linfo, const char *log_name,
3817                                 bool need_lock_index)
3818 {
3819   int error= 0;
3820   char *full_fname= linfo->log_file_name;
3821   char full_log_name[FN_REFLEN], fname[FN_REFLEN];
3822   uint log_name_len= 0, fname_len= 0;
3823   DBUG_ENTER("find_log_pos");
3824   full_log_name[0]= full_fname[0]= 0;
3825 
3826   /*
3827     Mutex needed because we need to make sure the file pointer does not
3828     move from under our feet
3829   */
3830   if (need_lock_index)
3831     mysql_mutex_lock(&LOCK_index);
3832   else
3833     mysql_mutex_assert_owner(&LOCK_index);
3834 
3835   if (!my_b_inited(&index_file))
3836   {
3837       error= LOG_INFO_IO;
3838       goto end;
3839   }
3840 
3841   // extend relative paths for log_name to be searched
3842   if (log_name)
3843   {
3844     if(normalize_binlog_name(full_log_name, log_name, is_relay_log))
3845     {
3846       error= LOG_INFO_EOF;
3847       goto end;
3848     }
3849   }
3850 
3851   log_name_len= log_name ? (uint) strlen(full_log_name) : 0;
3852   DBUG_PRINT("enter", ("log_name: %s, full_log_name: %s",
3853                        log_name ? log_name : "NULL", full_log_name));
3854 
3855   /* As the file is flushed, we can't get an error here */
3856   my_b_seek(&index_file, (my_off_t) 0);
3857 
3858   for (;;)
3859   {
3860     uint length;
3861     my_off_t offset= my_b_tell(&index_file);
3862 
3863     DBUG_EXECUTE_IF("simulate_find_log_pos_error",
3864                     error=  LOG_INFO_EOF; break;);
3865     /* If we get 0 or 1 characters, this is the end of the file */
3866     if ((length= my_b_gets(&index_file, fname, FN_REFLEN)) <= 1)
3867     {
3868       /* Did not find the given entry; Return not found or error */
3869       error= !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
3870       break;
3871     }
3872 
3873     // extend relative paths and match against full path
3874     if (normalize_binlog_name(full_fname, fname, is_relay_log))
3875     {
3876       error= LOG_INFO_EOF;
3877       break;
3878     }
3879     fname_len= (uint) strlen(full_fname);
3880 
3881     // if the log entry matches, null string matching anything
3882     if (!log_name ||
3883        (log_name_len == fname_len-1 && full_fname[log_name_len] == '\n' &&
3884         !strncmp(full_fname, full_log_name, log_name_len)))
3885     {
3886       DBUG_PRINT("info", ("Found log file entry"));
3887       full_fname[fname_len-1]= 0;                      // remove last \n
3888       linfo->index_file_start_offset= offset;
3889       linfo->index_file_offset = my_b_tell(&index_file);
3890       break;
3891     }
3892     linfo->entry_index++;
3893   }
3894 
3895 end:
3896   if (need_lock_index)
3897     mysql_mutex_unlock(&LOCK_index);
3898   DBUG_RETURN(error);
3899 }
3900 
3901 
3902 /**
3903   Find the position in the log-index-file for the given log name.
3904 
3905   @param[out] linfo The filename will be stored here, along with the
3906   byte offset of the next filename in the index file.
3907 
3908   @param need_lock_index If true, LOCK_index will be acquired;
3909   otherwise it should already be held by the caller.
3910 
3911   @note
3912     - Before calling this function, one has to call find_log_pos()
3913     to set up 'linfo'
3914     - Mutex needed because we need to make sure the file pointer does not move
3915     from under our feet
3916 
3917   @retval 0 ok
3918   @retval LOG_INFO_EOF End of log-index-file found
3919   @retval LOG_INFO_IO Got IO error while reading file
3920 */
find_next_log(LOG_INFO * linfo,bool need_lock_index)3921 int MYSQL_BIN_LOG::find_next_log(LOG_INFO* linfo, bool need_lock_index)
3922 {
3923   int error= 0;
3924   uint length;
3925   char fname[FN_REFLEN];
3926   char *full_fname= linfo->log_file_name;
3927 
3928   if (need_lock_index)
3929     mysql_mutex_lock(&LOCK_index);
3930   else
3931     mysql_mutex_assert_owner(&LOCK_index);
3932 
3933   if (!my_b_inited(&index_file))
3934   {
3935       error= LOG_INFO_IO;
3936       goto err;
3937   }
3938   /* As the file is flushed, we can't get an error here */
3939   my_b_seek(&index_file, linfo->index_file_offset);
3940 
3941   linfo->index_file_start_offset= linfo->index_file_offset;
3942   if ((length=my_b_gets(&index_file, fname, FN_REFLEN)) <= 1)
3943   {
3944     error = !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
3945     goto err;
3946   }
3947 
3948   if (fname[0] != 0)
3949   {
3950     if(normalize_binlog_name(full_fname, fname, is_relay_log))
3951     {
3952       error= LOG_INFO_EOF;
3953       goto err;
3954     }
3955     length= strlen(full_fname);
3956   }
3957 
3958   full_fname[length-1]= 0;                     // kill \n
3959   linfo->index_file_offset= my_b_tell(&index_file);
3960 
3961 err:
3962   if (need_lock_index)
3963     mysql_mutex_unlock(&LOCK_index);
3964   return error;
3965 }
3966 
3967 
3968 /**
3969   Removes files, as part of a RESET MASTER or RESET SLAVE statement,
3970   by deleting all logs refered to in the index file. Then, it starts
3971   writing to a new log file.
3972 
3973   The new index file will only contain this file.
3974 
3975   @param thd Thread
3976 
3977   @note
3978     If not called from slave thread, write start event to new log
3979 
3980   @retval
3981     0	ok
3982   @retval
3983     1   error
3984 */
reset_logs(THD * thd)3985 bool MYSQL_BIN_LOG::reset_logs(THD* thd)
3986 {
3987   LOG_INFO linfo;
3988   bool error=0;
3989   int err;
3990   const char* save_name;
3991   DBUG_ENTER("reset_logs");
3992 
3993   /*
3994     Flush logs for storage engines, so that the last transaction
3995     is fsynced inside storage engines.
3996   */
3997   if (ha_flush_logs(NULL))
3998     DBUG_RETURN(1);
3999 
4000   ha_reset_logs(thd);
4001 
4002   /*
4003     We need to get both locks to be sure that no one is trying to
4004     write to the index log file.
4005   */
4006   mysql_mutex_lock(&LOCK_log);
4007   mysql_mutex_lock(&LOCK_index);
4008 
4009   /*
4010     The following mutex is needed to ensure that no threads call
4011     'delete thd' as we would then risk missing a 'rollback' from this
4012     thread. If the transaction involved MyISAM tables, it should go
4013     into binlog even on rollback.
4014   */
4015   mysql_mutex_lock(&LOCK_thread_count);
4016 
4017   global_sid_lock->wrlock();
4018 
4019   /* Save variables so that we can reopen the log */
4020   save_name=name;
4021   name=0;					// Protect against free
4022   close(LOG_CLOSE_TO_BE_OPENED, false/*need_lock_log=false*/,
4023         false/*need_lock_index=false*/);
4024 
4025   /*
4026     First delete all old log files and then update the index file.
4027     As we first delete the log files and do not use sort of logging,
4028     a crash may lead to an inconsistent state where the index has
4029     references to non-existent files.
4030 
4031     We need to invert the steps and use the purge_index_file methods
4032     in order to make the operation safe.
4033   */
4034 
4035   if ((err= find_log_pos(&linfo, NullS, false/*need_lock_index=false*/)) != 0)
4036   {
4037     uint errcode= purge_log_get_error_code(err);
4038     sql_print_error("Failed to locate old binlog or relay log files");
4039     my_message(errcode, ER(errcode), MYF(0));
4040     error= 1;
4041     goto err;
4042   }
4043 
4044   for (;;)
4045   {
4046     if ((error= my_delete_allow_opened(linfo.log_file_name, MYF(0))) != 0)
4047     {
4048       if (my_errno == ENOENT)
4049       {
4050         push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN,
4051                             ER_LOG_PURGE_NO_FILE, ER(ER_LOG_PURGE_NO_FILE),
4052                             linfo.log_file_name);
4053         sql_print_information("Failed to delete file '%s'",
4054                               linfo.log_file_name);
4055         my_errno= 0;
4056         error= 0;
4057       }
4058       else
4059       {
4060         push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN,
4061                             ER_BINLOG_PURGE_FATAL_ERR,
4062                             "a problem with deleting %s; "
4063                             "consider examining correspondence "
4064                             "of your binlog index file "
4065                             "to the actual binlog files",
4066                             linfo.log_file_name);
4067         error= 1;
4068         goto err;
4069       }
4070     }
4071     if (find_next_log(&linfo, false/*need_lock_index=false*/))
4072       break;
4073   }
4074 
4075   /* Start logging with a new file */
4076   close(LOG_CLOSE_INDEX | LOG_CLOSE_TO_BE_OPENED,
4077         false/*need_lock_log=false*/,
4078         false/*need_lock_index=false*/);
4079   if ((error= my_delete_allow_opened(index_file_name, MYF(0))))	// Reset (open will update)
4080   {
4081     if (my_errno == ENOENT)
4082     {
4083       push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN,
4084                           ER_LOG_PURGE_NO_FILE, ER(ER_LOG_PURGE_NO_FILE),
4085                           index_file_name);
4086       sql_print_information("Failed to delete file '%s'",
4087                             index_file_name);
4088       my_errno= 0;
4089       error= 0;
4090     }
4091     else
4092     {
4093       push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN,
4094                           ER_BINLOG_PURGE_FATAL_ERR,
4095                           "a problem with deleting %s; "
4096                           "consider examining correspondence "
4097                           "of your binlog index file "
4098                           "to the actual binlog files",
4099                           index_file_name);
4100       error= 1;
4101       goto err;
4102     }
4103   }
4104 
4105 #ifdef HAVE_REPLICATION
4106   if (is_relay_log)
4107   {
4108     DBUG_ASSERT(active_mi != NULL);
4109     DBUG_ASSERT(active_mi->rli != NULL);
4110     (const_cast<Gtid_set *>(active_mi->rli->get_gtid_set()))->clear();
4111   }
4112   else
4113   {
4114     gtid_state->clear();
4115     // don't clear global_sid_map because it's used by the relay log too
4116     if (gtid_state->init() != 0)
4117       goto err;
4118   }
4119 #endif
4120 
4121   if (!open_index_file(index_file_name, 0, false/*need_lock_index=false*/))
4122     if ((error= open_binlog(save_name, 0, io_cache_type,
4123                             max_size, false,
4124                             false/*need_lock_log=false*/,
4125                             false/*need_lock_index=false*/,
4126                             false/*need_sid_lock=false*/,
4127                             NULL)))
4128       goto err;
4129   my_free((void *) save_name);
4130 
4131 err:
4132   if (error == 1)
4133     name= const_cast<char*>(save_name);
4134   global_sid_lock->unlock();
4135   mysql_mutex_unlock(&LOCK_thread_count);
4136   mysql_mutex_unlock(&LOCK_index);
4137   mysql_mutex_unlock(&LOCK_log);
4138   DBUG_RETURN(error);
4139 }
4140 
4141 
4142 /**
4143   Set the name of crash safe index file.
4144 
4145   @retval
4146     0   ok
4147   @retval
4148     1   error
4149 */
set_crash_safe_index_file_name(const char * base_file_name)4150 int MYSQL_BIN_LOG::set_crash_safe_index_file_name(const char *base_file_name)
4151 {
4152   int error= 0;
4153   DBUG_ENTER("MYSQL_BIN_LOG::set_crash_safe_index_file_name");
4154   if (fn_format(crash_safe_index_file_name, base_file_name, mysql_data_home,
4155                 ".index_crash_safe", MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH |
4156                                          MY_REPLACE_EXT)) == NULL)
4157   {
4158     error= 1;
4159     sql_print_error("MYSQL_BIN_LOG::set_crash_safe_index_file_name failed "
4160                     "to set file name.");
4161   }
4162   DBUG_RETURN(error);
4163 }
4164 
4165 
4166 /**
4167   Open a (new) crash safe index file.
4168 
4169   @note
4170     The crash safe index file is a special file
4171     used for guaranteeing index file crash safe.
4172   @retval
4173     0   ok
4174   @retval
4175     1   error
4176 */
open_crash_safe_index_file()4177 int MYSQL_BIN_LOG::open_crash_safe_index_file()
4178 {
4179   int error= 0;
4180   File file= -1;
4181 
4182   DBUG_ENTER("MYSQL_BIN_LOG::open_crash_safe_index_file");
4183 
4184   if (!my_b_inited(&crash_safe_index_file))
4185   {
4186     if ((file= my_open(crash_safe_index_file_name, O_RDWR | O_CREAT | O_BINARY,
4187                        MYF(MY_WME | ME_WAITTANG))) < 0  ||
4188         init_io_cache(&crash_safe_index_file, file, IO_SIZE, WRITE_CACHE,
4189                       0, 0, MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
4190     {
4191       error= 1;
4192       sql_print_error("MYSQL_BIN_LOG::open_crash_safe_index_file failed "
4193                       "to open temporary index file.");
4194     }
4195   }
4196   DBUG_RETURN(error);
4197 }
4198 
4199 
4200 /**
4201   Close the crash safe index file.
4202 
4203   @note
4204     The crash safe file is just closed, is not deleted.
4205     Because it is moved to index file later on.
4206   @retval
4207     0   ok
4208   @retval
4209     1   error
4210 */
close_crash_safe_index_file()4211 int MYSQL_BIN_LOG::close_crash_safe_index_file()
4212 {
4213   int error= 0;
4214 
4215   DBUG_ENTER("MYSQL_BIN_LOG::close_crash_safe_index_file");
4216 
4217   if (my_b_inited(&crash_safe_index_file))
4218   {
4219     end_io_cache(&crash_safe_index_file);
4220     error= my_close(crash_safe_index_file.file, MYF(0));
4221   }
4222   memset(&crash_safe_index_file, 0, sizeof(crash_safe_index_file));
4223 
4224   DBUG_RETURN(error);
4225 }
4226 
4227 
4228 /**
4229   Delete relay log files prior to rli->group_relay_log_name
4230   (i.e. all logs which are not involved in a non-finished group
4231   (transaction)), remove them from the index file and start on next
4232   relay log.
4233 
4234   IMPLEMENTATION
4235 
4236   - You must hold rli->data_lock before calling this function, since
4237     it writes group_relay_log_pos and similar fields of
4238     Relay_log_info.
4239   - Protects index file with LOCK_index
4240   - Delete relevant relay log files
4241   - Copy all file names after these ones to the front of the index file
  - If the OS has truncate, truncate the file, else fill it with '\n'
4243   - Read the next file name from the index file and store in rli->linfo
4244 
4245   @param rli	       Relay log information
4246   @param included     If false, all relay logs that are strictly before
4247                       rli->group_relay_log_name are deleted ; if true, the
4248                       latter is deleted too (i.e. all relay logs
4249                       read by the SQL slave thread are deleted).
4250 
4251   @note
4252     - This is only called from the slave SQL thread when it has read
4253     all commands from a relay log and want to switch to a new relay log.
4254     - When this happens, we can be in an active transaction as
4255     a transaction can span over two relay logs
4256     (although it is always written as a single block to the master's binary
4257     log, hence cannot span over two master's binary logs).
4258 
4259   @retval
4260     0			ok
4261   @retval
4262     LOG_INFO_EOF	        End of log-index-file found
4263   @retval
4264     LOG_INFO_SEEK	Could not allocate IO cache
4265   @retval
4266     LOG_INFO_IO		Got IO error while reading file
4267 */
4268 
4269 #ifdef HAVE_REPLICATION
4270 
purge_first_log(Relay_log_info * rli,bool included)4271 int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
4272 {
4273   int error;
4274   char *to_purge_if_included= NULL;
4275   DBUG_ENTER("purge_first_log");
4276 
4277   DBUG_ASSERT(current_thd->system_thread == SYSTEM_THREAD_SLAVE_SQL);
4278   DBUG_ASSERT(is_relay_log);
4279   DBUG_ASSERT(is_open());
4280   DBUG_ASSERT(rli->slave_running == 1);
4281   DBUG_ASSERT(!strcmp(rli->linfo.log_file_name,rli->get_event_relay_log_name()));
4282 
4283   mysql_mutex_assert_owner(&rli->data_lock);
4284 
4285   mysql_mutex_lock(&LOCK_index);
4286   to_purge_if_included= my_strdup(rli->get_group_relay_log_name(), MYF(0));
4287 
4288   /*
4289     Read the next log file name from the index file and pass it back to
4290     the caller.
4291   */
4292   if((error=find_log_pos(&rli->linfo, rli->get_event_relay_log_name(),
4293                          false/*need_lock_index=false*/)) ||
4294      (error=find_next_log(&rli->linfo, false/*need_lock_index=false*/)))
4295   {
4296     char buff[22];
4297     sql_print_error("next log error: %d  offset: %s  log: %s included: %d",
4298                     error,
4299                     llstr(rli->linfo.index_file_offset,buff),
4300                     rli->get_event_relay_log_name(),
4301                     included);
4302     goto err;
4303   }
4304 
4305   /*
4306     Reset rli's coordinates to the current log.
4307   */
4308   rli->set_event_relay_log_pos(BIN_LOG_HEADER_SIZE);
4309   rli->set_event_relay_log_name(rli->linfo.log_file_name);
4310 
4311   /*
4312     If we removed the rli->group_relay_log_name file,
4313     we must update the rli->group* coordinates, otherwise do not touch it as the
4314     group's execution is not finished (e.g. COMMIT not executed)
4315   */
4316   if (included)
4317   {
4318     rli->set_group_relay_log_pos(BIN_LOG_HEADER_SIZE);
4319     rli->set_group_relay_log_name(rli->linfo.log_file_name);
4320     rli->notify_group_relay_log_name_update();
4321   }
4322   /*
4323     Store where we are in the new file for the execution thread.
4324     If we are in the middle of a group), then we should not store
4325     the position in the repository, instead in that case set a flag
4326     to true which indicates that a 'forced flush' is postponed due
4327     to transaction split across the relaylogs.
4328   */
4329   if (!rli->is_in_group())
4330     rli->flush_info(TRUE);
4331   else
4332     rli->force_flush_postponed_due_to_split_trans= true;
4333 
4334   DBUG_EXECUTE_IF("crash_before_purge_logs", DBUG_SUICIDE(););
4335 
4336   mysql_mutex_lock(&rli->log_space_lock);
4337   rli->relay_log.purge_logs(to_purge_if_included, included,
4338                             false/*need_lock_index=false*/,
4339                             false/*need_update_threads=false*/,
4340                             &rli->log_space_total, true);
4341   // Tell the I/O thread to take the relay_log_space_limit into account
4342   rli->ignore_log_space_limit= 0;
4343   mysql_mutex_unlock(&rli->log_space_lock);
4344 
4345   /*
4346     Ok to broadcast after the critical region as there is no risk of
4347     the mutex being destroyed by this thread later - this helps save
4348     context switches
4349   */
4350   mysql_cond_broadcast(&rli->log_space_cond);
4351 
4352   /*
4353    * Need to update the log pos because purge logs has been called
4354    * after fetching initially the log pos at the begining of the method.
4355    */
4356   if((error=find_log_pos(&rli->linfo, rli->get_event_relay_log_name(),
4357                          false/*need_lock_index=false*/)))
4358   {
4359     char buff[22];
4360     sql_print_error("next log error: %d  offset: %s  log: %s included: %d",
4361                     error,
4362                     llstr(rli->linfo.index_file_offset,buff),
4363                     rli->get_group_relay_log_name(),
4364                     included);
4365     goto err;
4366   }
4367 
4368   /* If included was passed, rli->linfo should be the first entry. */
4369   DBUG_ASSERT(!included || rli->linfo.index_file_start_offset == 0);
4370 
4371 err:
4372   my_free(to_purge_if_included);
4373   mysql_mutex_unlock(&LOCK_index);
4374   DBUG_RETURN(error);
4375 }
4376 
4377 
4378 /**
4379   Remove logs from index file.
4380 
  - To be crash safe, we first copy the content of the index file,
  starting at the index_file_start_offset recorded in log_info, to
  the crash safe index file, and then move the crash safe index
  file to the index file.

  @param log_info               Its index_file_start_offset is the offset
                                in the index file from which the content
                                is copied to the crash safe index file.
4389 
4390   @param need_update_threads    If we want to update the log coordinates
4391                                 of all threads. False for relay logs,
4392                                 true otherwise.
4393 
4394   @retval
4395     0    ok
4396   @retval
4397     LOG_INFO_IO    Got IO error while reading/writing file
4398 */
remove_logs_from_index(LOG_INFO * log_info,bool need_update_threads)4399 int MYSQL_BIN_LOG::remove_logs_from_index(LOG_INFO* log_info, bool need_update_threads)
4400 {
4401   if (open_crash_safe_index_file())
4402   {
4403     sql_print_error("MYSQL_BIN_LOG::remove_logs_from_index failed to "
4404                     "open the crash safe index file.");
4405     goto err;
4406   }
4407 
4408   if (copy_file(&index_file, &crash_safe_index_file,
4409                 log_info->index_file_start_offset))
4410   {
4411     sql_print_error("MYSQL_BIN_LOG::remove_logs_from_index failed to "
4412                     "copy index file to crash safe index file.");
4413     goto err;
4414   }
4415 
4416   if (close_crash_safe_index_file())
4417   {
4418     sql_print_error("MYSQL_BIN_LOG::remove_logs_from_index failed to "
4419                     "close the crash safe index file.");
4420     goto err;
4421   }
4422   DBUG_EXECUTE_IF("fault_injection_copy_part_file", DBUG_SUICIDE(););
4423 
4424   if (move_crash_safe_index_file_to_index_file(false/*need_lock_index=false*/))
4425   {
4426     sql_print_error("MYSQL_BIN_LOG::remove_logs_from_index failed to "
4427                     "move crash safe index file to index file.");
4428     goto err;
4429   }
4430 
4431   // now update offsets in index file for running threads
4432   if (need_update_threads)
4433     adjust_linfo_offsets(log_info->index_file_start_offset);
4434   return 0;
4435 
4436 err:
4437   return LOG_INFO_IO;
4438 }
4439 
4440 /**
4441   Remove all logs before the given log from disk and from the index file.
4442 
4443   @param to_log	      Delete all log file name before this file.
4444   @param included            If true, to_log is deleted too.
  @param need_lock_index     If true, LOCK_index is acquired by this
                             function; otherwise the caller must own it.
  @param need_update_threads If we want to update the log coordinates of
                             all threads. False for relay logs, true otherwise.
  @param decrease_log_space  If not null, decrement this variable by
                             the amount of log space freed
4450   @param auto_purge          True if this is an automatic purge.
4451 
4452   @note
4453     If any of the logs before the deleted one is in use,
4454     only purge logs up to this one.
4455 
4456   @retval
4457     0			ok
4458   @retval
4459     LOG_INFO_EOF		to_log not found
4460     LOG_INFO_EMFILE             too many files opened
4461     LOG_INFO_FATAL              if any other than ENOENT error from
4462                                 mysql_file_stat() or mysql_file_delete()
4463 */
4464 
purge_logs(const char * to_log,bool included,bool need_lock_index,bool need_update_threads,ulonglong * decrease_log_space,bool auto_purge)4465 int MYSQL_BIN_LOG::purge_logs(const char *to_log,
4466                               bool included,
4467                               bool need_lock_index,
4468                               bool need_update_threads,
4469                               ulonglong *decrease_log_space,
4470                               bool auto_purge)
4471 {
4472   int error= 0, no_of_log_files_to_purge= 0, no_of_log_files_purged= 0;
4473   int no_of_threads_locking_log= 0;
4474   bool exit_loop= 0;
4475   LOG_INFO log_info;
4476   THD *thd= current_thd;
4477   DBUG_ENTER("purge_logs");
4478   DBUG_PRINT("info",("to_log= %s",to_log));
4479 
4480   if (need_lock_index)
4481     mysql_mutex_lock(&LOCK_index);
4482   else
4483     mysql_mutex_assert_owner(&LOCK_index);
4484   if ((error=find_log_pos(&log_info, to_log, false/*need_lock_index=false*/)))
4485   {
4486     sql_print_error("MYSQL_BIN_LOG::purge_logs was called with file %s not "
4487                     "listed in the index.", to_log);
4488     goto err;
4489   }
4490 
4491   no_of_log_files_to_purge= log_info.entry_index;
4492 
4493   if ((error= open_purge_index_file(TRUE)))
4494   {
4495     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to sync the index file.");
4496     goto err;
4497   }
4498 
4499   /*
4500     File name exists in index file; delete until we find this file
4501     or a file that is used.
4502   */
4503   if ((error=find_log_pos(&log_info, NullS, false/*need_lock_index=false*/)))
4504     goto err;
4505 
4506   while ((strcmp(to_log,log_info.log_file_name) || (exit_loop=included)))
4507   {
4508     if(is_active(log_info.log_file_name))
4509     {
4510       if(!auto_purge)
4511         push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4512                             ER_WARN_PURGE_LOG_IS_ACTIVE,
4513                             ER(ER_WARN_PURGE_LOG_IS_ACTIVE),
4514                             log_info.log_file_name);
4515       break;
4516     }
4517 
4518     if ((no_of_threads_locking_log= log_in_use(log_info.log_file_name)))
4519     {
4520       if(!auto_purge)
4521         push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4522                             ER_WARN_PURGE_LOG_IN_USE,
4523                             ER(ER_WARN_PURGE_LOG_IN_USE),
4524                             log_info.log_file_name,  no_of_threads_locking_log,
4525                             no_of_log_files_purged, no_of_log_files_to_purge);
4526       break;
4527     }
4528     no_of_log_files_purged++;
4529 
4530     if ((error= register_purge_index_entry(log_info.log_file_name)))
4531     {
4532       sql_print_error("MYSQL_BIN_LOG::purge_logs failed to copy %s to register file.",
4533                       log_info.log_file_name);
4534       goto err;
4535     }
4536 
4537     if (find_next_log(&log_info, false/*need_lock_index=false*/) || exit_loop)
4538       break;
4539   }
4540 
4541   DBUG_EXECUTE_IF("crash_purge_before_update_index", DBUG_SUICIDE(););
4542 
4543   if ((error= sync_purge_index_file()))
4544   {
4545     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to flush register file.");
4546     goto err;
4547   }
4548 
4549   /* We know how many files to delete. Update index file. */
4550   if ((error=remove_logs_from_index(&log_info, need_update_threads)))
4551   {
4552     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to update the index file");
4553     goto err;
4554   }
4555 
4556   // Update gtid_state->lost_gtids
4557   if (gtid_mode > 0 && !is_relay_log)
4558   {
4559     global_sid_lock->wrlock();
4560     error= init_gtid_sets(NULL,
4561                        const_cast<Gtid_set *>(gtid_state->get_lost_gtids()),
4562                        NULL,
4563                        opt_master_verify_checksum,
4564                        false/*false=don't need lock*/);
4565     global_sid_lock->unlock();
4566     if (error)
4567       goto err;
4568   }
4569 
4570   DBUG_EXECUTE_IF("crash_purge_critical_after_update_index", DBUG_SUICIDE(););
4571 
4572 err:
4573 
4574   int error_index= 0, close_error_index= 0;
4575   /* Read each entry from purge_index_file and delete the file. */
4576   if (!error && is_inited_purge_index_file() &&
4577       (error_index= purge_index_entry(thd, decrease_log_space, false/*need_lock_index=false*/)))
4578     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to process registered files"
4579                     " that would be purged.");
4580 
4581   close_error_index= close_purge_index_file();
4582 
4583   DBUG_EXECUTE_IF("crash_purge_non_critical_after_update_index", DBUG_SUICIDE(););
4584 
4585   if (need_lock_index)
4586     mysql_mutex_unlock(&LOCK_index);
4587 
4588   /*
4589     Error codes from purge logs take precedence.
4590     Then error codes from purging the index entry.
4591     Finally, error codes from closing the purge index file.
4592   */
4593   error= error ? error : (error_index ? error_index :
4594                           close_error_index);
4595 
4596   DBUG_RETURN(error);
4597 }
4598 
set_purge_index_file_name(const char * base_file_name)4599 int MYSQL_BIN_LOG::set_purge_index_file_name(const char *base_file_name)
4600 {
4601   int error= 0;
4602   DBUG_ENTER("MYSQL_BIN_LOG::set_purge_index_file_name");
4603   if (fn_format(purge_index_file_name, base_file_name, mysql_data_home,
4604                 ".~rec~", MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH |
4605                               MY_REPLACE_EXT)) == NULL)
4606   {
4607     error= 1;
4608     sql_print_error("MYSQL_BIN_LOG::set_purge_index_file_name failed to set "
4609                       "file name.");
4610   }
4611   DBUG_RETURN(error);
4612 }
4613 
open_purge_index_file(bool destroy)4614 int MYSQL_BIN_LOG::open_purge_index_file(bool destroy)
4615 {
4616   int error= 0;
4617   File file= -1;
4618 
4619   DBUG_ENTER("MYSQL_BIN_LOG::open_purge_index_file");
4620 
4621   if (destroy)
4622     close_purge_index_file();
4623 
4624   if (!my_b_inited(&purge_index_file))
4625   {
4626     if ((file= my_open(purge_index_file_name, O_RDWR | O_CREAT | O_BINARY,
4627                        MYF(MY_WME | ME_WAITTANG))) < 0  ||
4628         init_io_cache(&purge_index_file, file, IO_SIZE,
4629                       (destroy ? WRITE_CACHE : READ_CACHE),
4630                       0, 0, MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
4631     {
4632       error= 1;
4633       sql_print_error("MYSQL_BIN_LOG::open_purge_index_file failed to open register "
4634                       " file.");
4635     }
4636   }
4637   DBUG_RETURN(error);
4638 }
4639 
close_purge_index_file()4640 int MYSQL_BIN_LOG::close_purge_index_file()
4641 {
4642   int error= 0;
4643 
4644   DBUG_ENTER("MYSQL_BIN_LOG::close_purge_index_file");
4645 
4646   if (my_b_inited(&purge_index_file))
4647   {
4648     end_io_cache(&purge_index_file);
4649     error= my_close(purge_index_file.file, MYF(0));
4650   }
4651   my_delete(purge_index_file_name, MYF(0));
4652   memset(&purge_index_file, 0, sizeof(purge_index_file));
4653 
4654   DBUG_RETURN(error);
4655 }
4656 
is_inited_purge_index_file()4657 bool MYSQL_BIN_LOG::is_inited_purge_index_file()
4658 {
4659   DBUG_ENTER("MYSQL_BIN_LOG::is_inited_purge_index_file");
4660   DBUG_RETURN (my_b_inited(&purge_index_file));
4661 }
4662 
sync_purge_index_file()4663 int MYSQL_BIN_LOG::sync_purge_index_file()
4664 {
4665   int error= 0;
4666   DBUG_ENTER("MYSQL_BIN_LOG::sync_purge_index_file");
4667 
4668   if ((error= flush_io_cache(&purge_index_file)) ||
4669       (error= my_sync(purge_index_file.file, MYF(MY_WME))))
4670     DBUG_RETURN(error);
4671 
4672   DBUG_RETURN(error);
4673 }
4674 
register_purge_index_entry(const char * entry)4675 int MYSQL_BIN_LOG::register_purge_index_entry(const char *entry)
4676 {
4677   int error= 0;
4678   DBUG_ENTER("MYSQL_BIN_LOG::register_purge_index_entry");
4679 
4680   if ((error=my_b_write(&purge_index_file, (const uchar*)entry, strlen(entry))) ||
4681       (error=my_b_write(&purge_index_file, (const uchar*)"\n", 1)))
4682     DBUG_RETURN (error);
4683 
4684   DBUG_RETURN(error);
4685 }
4686 
register_create_index_entry(const char * entry)4687 int MYSQL_BIN_LOG::register_create_index_entry(const char *entry)
4688 {
4689   DBUG_ENTER("MYSQL_BIN_LOG::register_create_index_entry");
4690   DBUG_RETURN(register_purge_index_entry(entry));
4691 }
4692 
purge_index_entry(THD * thd,ulonglong * decrease_log_space,bool need_lock_index)4693 int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *decrease_log_space,
4694                                      bool need_lock_index)
4695 {
4696   MY_STAT s;
4697   int error= 0;
4698   LOG_INFO log_info;
4699   LOG_INFO check_log_info;
4700 
4701   DBUG_ENTER("MYSQL_BIN_LOG:purge_index_entry");
4702 
4703   DBUG_ASSERT(my_b_inited(&purge_index_file));
4704 
4705   if ((error=reinit_io_cache(&purge_index_file, READ_CACHE, 0, 0, 0)))
4706   {
4707     sql_print_error("MYSQL_BIN_LOG::purge_index_entry failed to reinit register file "
4708                     "for read");
4709     goto err;
4710   }
4711 
4712   for (;;)
4713   {
4714     uint length;
4715 
4716     if ((length=my_b_gets(&purge_index_file, log_info.log_file_name,
4717                           FN_REFLEN)) <= 1)
4718     {
4719       if (purge_index_file.error)
4720       {
4721         error= purge_index_file.error;
4722         sql_print_error("MYSQL_BIN_LOG::purge_index_entry error %d reading from "
4723                         "register file.", error);
4724         goto err;
4725       }
4726 
4727       /* Reached EOF */
4728       break;
4729     }
4730 
4731     /* Get rid of the trailing '\n' */
4732     log_info.log_file_name[length-1]= 0;
4733 
4734     if (!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s, MYF(0)))
4735     {
4736       if (my_errno == ENOENT)
4737       {
4738         /*
4739           It's not fatal if we can't stat a log file that does not exist;
4740           If we could not stat, we won't delete.
4741         */
4742         if (thd)
4743         {
4744           push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4745                               ER_LOG_PURGE_NO_FILE, ER(ER_LOG_PURGE_NO_FILE),
4746                               log_info.log_file_name);
4747         }
4748         sql_print_information("Failed to execute mysql_file_stat on file '%s'",
4749 			      log_info.log_file_name);
4750         my_errno= 0;
4751       }
4752       else
4753       {
4754         /*
4755           Other than ENOENT are fatal
4756         */
4757         if (thd)
4758         {
4759           push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4760                               ER_BINLOG_PURGE_FATAL_ERR,
4761                               "a problem with getting info on being purged %s; "
4762                               "consider examining correspondence "
4763                               "of your binlog index file "
4764                               "to the actual binlog files",
4765                               log_info.log_file_name);
4766         }
4767         else
4768         {
4769           sql_print_information("Failed to delete log file '%s'; "
4770                                 "consider examining correspondence "
4771                                 "of your binlog index file "
4772                                 "to the actual binlog files",
4773                                 log_info.log_file_name);
4774         }
4775         error= LOG_INFO_FATAL;
4776         goto err;
4777       }
4778     }
4779     else
4780     {
4781       if ((error= find_log_pos(&check_log_info, log_info.log_file_name,
4782                                need_lock_index)))
4783       {
4784         if (error != LOG_INFO_EOF)
4785         {
4786           if (thd)
4787           {
4788             push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4789                                 ER_BINLOG_PURGE_FATAL_ERR,
4790                                 "a problem with deleting %s and "
4791                                 "reading the binlog index file",
4792                                 log_info.log_file_name);
4793           }
4794           else
4795           {
4796             sql_print_information("Failed to delete file '%s' and "
4797                                   "read the binlog index file",
4798                                   log_info.log_file_name);
4799           }
4800           goto err;
4801         }
4802 
4803         error= 0;
4804         if (!need_lock_index)
4805         {
4806           /*
4807             This is to avoid triggering an error in NDB.
4808 
4809             @todo: This is weird, what does NDB errors have to do with
4810             need_lock_index? Explain better or refactor /Sven
4811           */
4812           ha_binlog_index_purge_file(current_thd, log_info.log_file_name);
4813         }
4814 
4815         DBUG_PRINT("info",("purging %s",log_info.log_file_name));
4816         if (!mysql_file_delete(key_file_binlog, log_info.log_file_name, MYF(0)))
4817         {
4818           DBUG_EXECUTE_IF("wait_in_purge_index_entry",
4819                           {
4820                               const char action[] = "now SIGNAL in_purge_index_entry WAIT_FOR go_ahead_sql";
4821                               DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(action)));
4822                               DBUG_SET("-d,wait_in_purge_index_entry");
4823                           };);
4824 
4825           if (decrease_log_space)
4826             *decrease_log_space-= s.st_size;
4827         }
4828         else
4829         {
4830           if (my_errno == ENOENT)
4831           {
4832             if (thd)
4833             {
4834               push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4835                                   ER_LOG_PURGE_NO_FILE, ER(ER_LOG_PURGE_NO_FILE),
4836                                   log_info.log_file_name);
4837             }
4838             sql_print_information("Failed to delete file '%s'",
4839                                   log_info.log_file_name);
4840             my_errno= 0;
4841           }
4842           else
4843           {
4844             if (thd)
4845             {
4846               push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4847                                   ER_BINLOG_PURGE_FATAL_ERR,
4848                                   "a problem with deleting %s; "
4849                                   "consider examining correspondence "
4850                                   "of your binlog index file "
4851                                   "to the actual binlog files",
4852                                   log_info.log_file_name);
4853             }
4854             else
4855             {
4856               sql_print_information("Failed to delete file '%s'; "
4857                                     "consider examining correspondence "
4858                                     "of your binlog index file "
4859                                     "to the actual binlog files",
4860                                     log_info.log_file_name);
4861             }
4862             if (my_errno == EMFILE)
4863             {
4864               DBUG_PRINT("info",
4865                          ("my_errno: %d, set ret = LOG_INFO_EMFILE", my_errno));
4866               error= LOG_INFO_EMFILE;
4867               goto err;
4868             }
4869             error= LOG_INFO_FATAL;
4870             goto err;
4871           }
4872         }
4873       }
4874     }
4875   }
4876 
4877 err:
4878   DBUG_RETURN(error);
4879 }
4880 
4881 /**
4882   Remove all logs before the given file date from disk and from the
4883   index file.
4884 
  @param purge_time	Delete all log files before given date.
  @param auto_purge     True if this is an automatic purge.

  @note
    There is no thread parameter; warnings are pushed to current_thd.
4888 
4889   @note
4890     If any of the logs before the deleted one is in use,
4891     only purge logs up to this one.
4892 
4893   @retval
4894     0				ok
4895   @retval
4896     LOG_INFO_PURGE_NO_ROTATE	Binary file that can't be rotated
4897     LOG_INFO_FATAL              if any other than ENOENT error from
4898                                 mysql_file_stat() or mysql_file_delete()
4899 */
4900 
purge_logs_before_date(time_t purge_time,bool auto_purge)4901 int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time, bool auto_purge)
4902 {
4903   int error;
4904   int no_of_threads_locking_log= 0, no_of_log_files_purged= 0;
4905   bool log_is_active= false, log_is_in_use= false;
4906   char to_log[FN_REFLEN], copy_log_in_use[FN_REFLEN];
4907   LOG_INFO log_info;
4908   MY_STAT stat_area;
4909   THD *thd= current_thd;
4910 
4911   DBUG_ENTER("purge_logs_before_date");
4912 
4913   mysql_mutex_lock(&LOCK_index);
4914   to_log[0]= 0;
4915 
4916   if ((error=find_log_pos(&log_info, NullS, false/*need_lock_index=false*/)))
4917     goto err;
4918 
4919   while (!(log_is_active= is_active(log_info.log_file_name)))
4920   {
4921     if ((no_of_threads_locking_log= log_in_use(log_info.log_file_name)))
4922     {
4923       if (!auto_purge)
4924       {
4925         log_is_in_use= true;
4926         strcpy(copy_log_in_use, log_info.log_file_name);
4927       }
4928       break;
4929     }
4930     no_of_log_files_purged++;
4931 
4932     if (!mysql_file_stat(m_key_file_log,
4933                          log_info.log_file_name, &stat_area, MYF(0)))
4934     {
4935       if (my_errno == ENOENT)
4936       {
4937         /*
4938           It's not fatal if we can't stat a log file that does not exist.
4939         */
4940         my_errno= 0;
4941       }
4942       else
4943       {
4944         /*
4945           Other than ENOENT are fatal
4946         */
4947         if (thd)
4948         {
4949           push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4950                               ER_BINLOG_PURGE_FATAL_ERR,
4951                               "a problem with getting info on being purged %s; "
4952                               "consider examining correspondence "
4953                               "of your binlog index file "
4954                               "to the actual binlog files",
4955                               log_info.log_file_name);
4956         }
4957         else
4958         {
4959           sql_print_information("Failed to delete log file '%s'",
4960                                 log_info.log_file_name);
4961         }
4962         error= LOG_INFO_FATAL;
4963         goto err;
4964       }
4965     }
4966     else
4967     {
4968       if (stat_area.st_mtime < purge_time)
4969         strmake(to_log,
4970                 log_info.log_file_name,
4971                 sizeof(log_info.log_file_name) - 1);
4972       else
4973         break;
4974     }
4975     if (find_next_log(&log_info, false/*need_lock_index=false*/))
4976       break;
4977   }
4978 
4979   if (log_is_active)
4980   {
4981     if(!auto_purge)
4982       push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4983                           ER_WARN_PURGE_LOG_IS_ACTIVE,
4984                           ER(ER_WARN_PURGE_LOG_IS_ACTIVE),
4985                           log_info.log_file_name);
4986 
4987   }
4988 
4989   if (log_is_in_use)
4990   {
4991     int no_of_log_files_to_purge= no_of_log_files_purged+1;
4992     while (strcmp(log_file_name, log_info.log_file_name))
4993     {
4994       if (mysql_file_stat(m_key_file_log, log_info.log_file_name,
4995                           &stat_area, MYF(0)))
4996       {
4997         if (stat_area.st_mtime < purge_time)
4998           no_of_log_files_to_purge++;
4999         else
5000           break;
5001       }
5002       if (find_next_log(&log_info, false/*need_lock_index=false*/))
5003       {
5004         no_of_log_files_to_purge++;
5005         break;
5006       }
5007     }
5008 
5009     push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
5010                         ER_WARN_PURGE_LOG_IN_USE,
5011                         ER(ER_WARN_PURGE_LOG_IN_USE),
5012                         copy_log_in_use, no_of_threads_locking_log,
5013                         no_of_log_files_purged, no_of_log_files_to_purge);
5014   }
5015 
5016   error= (to_log[0] ? purge_logs(to_log, true,
5017                                  false/*need_lock_index=false*/,
5018                                  true/*need_update_threads=true*/,
5019                                  (ulonglong *) 0, auto_purge) : 0);
5020 
5021 err:
5022   mysql_mutex_unlock(&LOCK_index);
5023   DBUG_RETURN(error);
5024 }
5025 #endif /* HAVE_REPLICATION */
5026 
5027 
5028 /**
5029   Create a new log file name.
5030 
  @param buf		buf of at least FN_REFLEN where new name is stored
  @param log_ident	name appended to the directory part of the
                        current log file name

  @note
    If file name will be longer than FN_REFLEN it will be truncated
5035 */
5036 
make_log_name(char * buf,const char * log_ident)5037 void MYSQL_BIN_LOG::make_log_name(char* buf, const char* log_ident)
5038 {
5039   uint dir_len = dirname_length(log_file_name);
5040   if (dir_len >= FN_REFLEN)
5041     dir_len=FN_REFLEN-1;
5042   strnmov(buf, log_file_name, dir_len);
5043   strmake(buf+dir_len, log_ident, FN_REFLEN - dir_len -1);
5044 }
5045 
5046 
5047 /**
5048   Check if we are writing/reading to the given log file.
5049 */
5050 
is_active(const char * log_file_name_arg)5051 bool MYSQL_BIN_LOG::is_active(const char *log_file_name_arg)
5052 {
5053   return !strcmp(log_file_name, log_file_name_arg);
5054 }
5055 
5056 
5057 /*
  Wrappers around new_file_impl to avoid using an argument
  to control locking. The argument 1) is less readable, 2) breaks
  encapsulation, and 3) allows external access to the class without
  a lock (which is not possible with the private
  new_file_without_locking method).
5063 
5064   @retval
5065     nonzero - error
5066 
5067 */
5068 
new_file(Format_description_log_event * extra_description_event)5069 int MYSQL_BIN_LOG::new_file(Format_description_log_event *extra_description_event)
5070 {
5071   return new_file_impl(true/*need_lock_log=true*/, extra_description_event);
5072 }
5073 
5074 /*
5075   @retval
5076     nonzero - error
5077 */
new_file_without_locking(Format_description_log_event * extra_description_event)5078 int MYSQL_BIN_LOG::new_file_without_locking(Format_description_log_event *extra_description_event)
5079 {
5080   return new_file_impl(false/*need_lock_log=false*/, extra_description_event);
5081 }
5082 
5083 
5084 /**
5085   Start writing to a new log file or reopen the old file.
5086 
5087   @param need_lock_log If true, this function acquires LOCK_log;
5088   otherwise the caller should already have acquired it.
5089 
5090   @retval 0 success
5091   @retval nonzero - error
5092 
5093   @note The new file name is stored last in the index file
5094 */
new_file_impl(bool need_lock_log,Format_description_log_event * extra_description_event)5095 int MYSQL_BIN_LOG::new_file_impl(bool need_lock_log, Format_description_log_event *extra_description_event)
5096 {
5097   int error= 0, close_on_error= FALSE;
5098   char new_name[FN_REFLEN], *new_name_ptr, *old_name, *file_to_open;
5099 
5100   DBUG_ENTER("MYSQL_BIN_LOG::new_file_impl");
5101   if (!is_open())
5102   {
5103     DBUG_PRINT("info",("log is closed"));
5104     DBUG_RETURN(error);
5105   }
5106 
5107   if (need_lock_log)
5108     mysql_mutex_lock(&LOCK_log);
5109   else
5110     mysql_mutex_assert_owner(&LOCK_log);
5111   DBUG_EXECUTE_IF("semi_sync_3-way_deadlock",
5112                   DEBUG_SYNC(current_thd, "before_rotate_binlog"););
5113   mysql_mutex_lock(&LOCK_xids);
5114   /*
5115     We need to ensure that the number of prepared XIDs are 0.
5116 
5117     If m_prep_xids is not zero:
5118     - We wait for storage engine commit, hence decrease m_prep_xids
5119     - We keep the LOCK_log to block new transactions from being
5120       written to the binary log.
5121    */
5122   while (get_prep_xids() > 0)
5123   {
5124     DEBUG_SYNC(current_thd, "before_rotate_binlog_file");
5125     mysql_cond_wait(&m_prep_xids_cond, &LOCK_xids);
5126   }
5127   mysql_mutex_unlock(&LOCK_xids);
5128 
5129   mysql_mutex_lock(&LOCK_index);
5130 
5131   if (DBUG_EVALUATE_IF("expire_logs_always", 0, 1)
5132       && (error= ha_flush_logs(NULL)))
5133     goto end;
5134 
5135   mysql_mutex_assert_owner(&LOCK_log);
5136   mysql_mutex_assert_owner(&LOCK_index);
5137 
5138 
5139   /*
5140     If user hasn't specified an extension, generate a new log name
5141     We have to do this here and not in open as we want to store the
5142     new file name in the current binary log file.
5143   */
5144   new_name_ptr= new_name;
5145   if ((error= generate_new_name(new_name, name)))
5146   {
5147     // Use the old name if generation of new name fails.
5148     strcpy(new_name, name);
5149     close_on_error= TRUE;
5150     goto end;
5151   }
5152   else
5153   {
5154     /*
5155       We log the whole file name for log file as the user may decide
5156       to change base names at some point.
5157     */
5158     Rotate_log_event r(new_name+dirname_length(new_name), 0, LOG_EVENT_OFFSET,
5159                        is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
5160     /*
5161       The current relay-log's closing Rotate event must have checksum
5162       value computed with an algorithm of the last relay-logged FD event.
5163     */
5164     if (is_relay_log)
5165       r.checksum_alg= relay_log_checksum_alg;
5166     DBUG_ASSERT(!is_relay_log || relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
5167     if(DBUG_EVALUATE_IF("fault_injection_new_file_rotate_event", (error=close_on_error=TRUE), FALSE) ||
5168        (error= r.write(&log_file)))
5169     {
5170       char errbuf[MYSYS_STRERROR_SIZE];
5171       DBUG_EXECUTE_IF("fault_injection_new_file_rotate_event", errno=2;);
5172       close_on_error= TRUE;
5173       my_printf_error(ER_ERROR_ON_WRITE, ER(ER_CANT_OPEN_FILE),
5174                       MYF(ME_FATALERROR), name,
5175                       errno, my_strerror(errbuf, sizeof(errbuf), errno));
5176       goto end;
5177     }
5178     bytes_written += r.data_written;
5179   }
5180   /*
5181     Update needs to be signalled even if there is no rotate event
5182     log rotation should give the waiting thread a signal to
5183     discover EOF and move on to the next log.
5184   */
5185   signal_update();
5186 
5187   old_name=name;
5188   name=0;				// Don't free name
5189   close(LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX,
5190         false/*need_lock_log=false*/,
5191         false/*need_lock_index=false*/);
5192 
5193   if (checksum_alg_reset != BINLOG_CHECKSUM_ALG_UNDEF)
5194   {
5195     DBUG_ASSERT(!is_relay_log);
5196     DBUG_ASSERT(binlog_checksum_options != checksum_alg_reset);
5197     binlog_checksum_options= checksum_alg_reset;
5198   }
5199   /*
5200      Note that at this point, log_state != LOG_CLOSED (important for is_open()).
5201   */
5202 
5203   DEBUG_SYNC(current_thd, "before_rotate_binlog_file");
5204   /*
5205      new_file() is only used for rotation (in FLUSH LOGS or because size >
5206      max_binlog_size or max_relay_log_size).
5207      If this is a binary log, the Format_description_log_event at the beginning of
5208      the new file should have created=0 (to distinguish with the
5209      Format_description_log_event written at server startup, which should
5210      trigger temp tables deletion on slaves.
5211   */
5212 
5213   /* reopen index binlog file, BUG#34582 */
5214   file_to_open= index_file_name;
5215   error= open_index_file(index_file_name, 0, false/*need_lock_index=false*/);
5216   if (!error)
5217   {
5218     /* reopen the binary log file. */
5219     file_to_open= new_name_ptr;
5220     error= open_binlog(old_name, new_name_ptr, io_cache_type,
5221                        max_size, true/*null_created_arg=true*/,
5222                        false/*need_lock_log=false*/,
5223                        false/*need_lock_index=false*/,
5224                        true/*need_sid_lock=true*/,
5225                        extra_description_event);
5226   }
5227 
5228   /* handle reopening errors */
5229   if (error)
5230   {
5231     char errbuf[MYSYS_STRERROR_SIZE];
5232     my_printf_error(ER_CANT_OPEN_FILE, ER(ER_CANT_OPEN_FILE),
5233                     MYF(ME_FATALERROR), file_to_open,
5234                     error, my_strerror(errbuf, sizeof(errbuf), error));
5235     close_on_error= TRUE;
5236   }
5237   my_free(old_name);
5238 
5239 end:
5240 
5241   if (error && close_on_error /* rotate or reopen failed */)
5242   {
5243     /*
5244       Close whatever was left opened.
5245 
5246       We are keeping the behavior as it exists today, ie,
5247       we disable logging and move on (see: BUG#51014).
5248 
5249       TODO: as part of WL#1790 consider other approaches:
5250        - kill mysql (safety);
5251        - try multiple locations for opening a log file;
5252        - switch server to protected/readonly mode
5253        - ...
5254     */
5255     if (binlog_error_action == ABORT_SERVER)
5256     {
5257       exec_binlog_error_action_abort("Either disk is full or file system is"
5258                                      " read only while rotating the binlog."
5259                                      " Aborting the server.");
5260     }
5261     else
5262       sql_print_error("Could not open %s for logging (error %d). "
5263                       "Turning logging off for the whole duration "
5264                       "of the MySQL server process. To turn it on "
5265                       "again: fix the cause, shutdown the MySQL "
5266                       "server and restart it.",
5267                       new_name_ptr, errno);
5268     close(LOG_CLOSE_INDEX, false /*need_lock_log=false*/,
5269           false/*need_lock_index=false*/);
5270   }
5271 
5272   mysql_mutex_unlock(&LOCK_index);
5273   if (need_lock_log)
5274     mysql_mutex_unlock(&LOCK_log);
5275 
5276   DEBUG_SYNC(current_thd, "after_disable_binlog");
5277   DBUG_RETURN(error);
5278 }
5279 
5280 
5281 #ifdef HAVE_REPLICATION
5282 /**
5283   Called after an event has been written to the relay log by the IO
5284   thread.  This flushes and possibly syncs the file (according to the
5285   sync options), rotates the file if it has grown over the limit, and
5286   finally calls signal_update().
5287 
5288   @note The caller must hold LOCK_log before invoking this function.
5289 
5290   @param mi Master_info for the IO thread.
5291   @param need_data_lock If true, mi->data_lock will be acquired if a
5292   rotation is needed.  Otherwise, mi->data_lock must be held by the
5293   caller.
5294 
5295   @retval false success
5296   @retval true error
5297 */
after_append_to_relay_log(Master_info * mi)5298 bool MYSQL_BIN_LOG::after_append_to_relay_log(Master_info *mi)
5299 {
5300   DBUG_ENTER("MYSQL_BIN_LOG::after_append_to_relay_log");
5301   DBUG_PRINT("info",("max_size: %lu",max_size));
5302 
5303   // Check pre-conditions
5304   mysql_mutex_assert_owner(&LOCK_log);
5305   mysql_mutex_assert_owner(&mi->data_lock);
5306   DBUG_ASSERT(is_relay_log);
5307   DBUG_ASSERT(current_thd->system_thread == SYSTEM_THREAD_SLAVE_IO);
5308 
5309   // Flush and sync
5310   bool error= false;
5311   if (flush_and_sync(0) == 0)
5312   {
5313     DBUG_EXECUTE_IF ("set_max_size_zero",
5314                      {max_size=0;});
5315     // If relay log is too big, rotate
5316     if ((uint) my_b_append_tell(&log_file) >
5317         DBUG_EVALUATE_IF("rotate_slave_debug_group", 500, max_size))
5318     {
5319       error= new_file_without_locking(mi->get_mi_description_event());
5320       DBUG_EXECUTE_IF ("set_max_size_zero",
5321                        {
5322                        max_size=1073741824;
5323                        DBUG_SET("-d,set_max_size_zero");
5324                        DBUG_SET("-d,flush_after_reading_gtid_event");
5325                        });
5326     }
5327   }
5328 
5329   signal_update();
5330 
5331   DBUG_RETURN(error);
5332 }
5333 
5334 
append_event(Log_event * ev,Master_info * mi)5335 bool MYSQL_BIN_LOG::append_event(Log_event* ev, Master_info *mi)
5336 {
5337   DBUG_ENTER("MYSQL_BIN_LOG::append");
5338 
5339   // check preconditions
5340   DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);
5341   DBUG_ASSERT(is_relay_log);
5342 
5343   // acquire locks
5344   mysql_mutex_lock(&LOCK_log);
5345 
5346   // write data
5347   bool error = false;
5348   if (ev->write(&log_file) == 0)
5349   {
5350     bytes_written+= ev->data_written;
5351     error= after_append_to_relay_log(mi);
5352   }
5353   else
5354     error= true;
5355 
5356   mysql_mutex_unlock(&LOCK_log);
5357   DBUG_RETURN(error);
5358 }
5359 
5360 
append_buffer(const char * buf,uint len,Master_info * mi)5361 bool MYSQL_BIN_LOG::append_buffer(const char* buf, uint len, Master_info *mi)
5362 {
5363   DBUG_ENTER("MYSQL_BIN_LOG::append_buffer");
5364 
5365   // check preconditions
5366   DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);
5367   DBUG_ASSERT(is_relay_log);
5368   mysql_mutex_assert_owner(&LOCK_log);
5369 
5370   // write data
5371   bool error= false;
5372   if (my_b_append(&log_file,(uchar*) buf,len) == 0)
5373   {
5374     bytes_written += len;
5375     error= after_append_to_relay_log(mi);
5376   }
5377   else
5378     error= true;
5379 
5380   DBUG_RETURN(error);
5381 }
5382 #endif // ifdef HAVE_REPLICATION
5383 
flush_and_sync(const bool force)5384 bool MYSQL_BIN_LOG::flush_and_sync(const bool force)
5385 {
5386   mysql_mutex_assert_owner(&LOCK_log);
5387 
5388   if (flush_io_cache(&log_file))
5389     return 1;
5390 
5391   std::pair<bool, bool> result= sync_binlog_file(force);
5392 
5393   return result.first;
5394 }
5395 
start_union_events(THD * thd,query_id_t query_id_param)5396 void MYSQL_BIN_LOG::start_union_events(THD *thd, query_id_t query_id_param)
5397 {
5398   DBUG_ASSERT(!thd->binlog_evt_union.do_union);
5399   thd->binlog_evt_union.do_union= TRUE;
5400   thd->binlog_evt_union.unioned_events= FALSE;
5401   thd->binlog_evt_union.unioned_events_trans= FALSE;
5402   thd->binlog_evt_union.first_query_id= query_id_param;
5403 }
5404 
/**
  Leaves "union" mode by clearing thd->binlog_evt_union.do_union.

  Union mode must currently be active (asserted).  The other members of
  binlog_evt_union are left untouched.

  @param thd  Session whose union mode is switched off.
*/
void MYSQL_BIN_LOG::stop_union_events(THD *thd)
{
  DBUG_ASSERT(thd->binlog_evt_union.do_union);
  thd->binlog_evt_union.do_union= FALSE;
}
5410 
is_query_in_union(THD * thd,query_id_t query_id_param)5411 bool MYSQL_BIN_LOG::is_query_in_union(THD *thd, query_id_t query_id_param)
5412 {
5413   return (thd->binlog_evt_union.do_union &&
5414           query_id_param >= thd->binlog_evt_union.first_query_id);
5415 }
5416 
5417 /*
5418   Updates thd's position-of-next-event variables
5419   after a *real* write a file.
5420  */
update_thd_next_event_pos(THD * thd)5421 void MYSQL_BIN_LOG::update_thd_next_event_pos(THD* thd)
5422 {
5423   if (likely(thd != NULL))
5424   {
5425     thd->set_next_event_pos(log_file_name,
5426                             my_b_tell(&log_file));
5427   }
5428 }
5429 
5430 /*
5431   Moves the last bunch of rows from the pending Rows event to a cache (either
5432   transactional cache if is_transaction is @c true, or the non-transactional
5433   cache otherwise. Sets a new pending event.
5434 
5435   @param thd               a pointer to the user thread.
5436   @param evt               a pointer to the row event.
5437   @param is_transactional  @c true indicates a transactional cache,
5438                            otherwise @c false a non-transactional.
5439 */
5440 int
flush_and_set_pending_rows_event(THD * thd,Rows_log_event * event,bool is_transactional)5441 MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
5442                                                 Rows_log_event* event,
5443                                                 bool is_transactional)
5444 {
5445   DBUG_ENTER("MYSQL_BIN_LOG::flush_and_set_pending_rows_event(event)");
5446   DBUG_ASSERT(mysql_bin_log.is_open());
5447   DBUG_PRINT("enter", ("event: 0x%lx", (long) event));
5448 
5449   int error= 0;
5450   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(thd);
5451 
5452   DBUG_ASSERT(cache_mngr);
5453 
5454   binlog_cache_data *cache_data=
5455     cache_mngr->get_binlog_cache_data(is_transactional);
5456 
5457   DBUG_PRINT("info", ("cache_mngr->pending(): 0x%lx", (long) cache_data->pending()));
5458 
5459   if (Rows_log_event* pending= cache_data->pending())
5460   {
5461     /*
5462       Write pending event to the cache.
5463     */
5464     if (cache_data->write_event(thd, pending))
5465     {
5466       set_write_error(thd, is_transactional);
5467       if (check_write_error(thd) && cache_data &&
5468           stmt_cannot_safely_rollback(thd))
5469         cache_data->set_incident();
5470       delete pending;
5471       cache_data->set_pending(NULL);
5472       DBUG_RETURN(1);
5473     }
5474 
5475     delete pending;
5476   }
5477 
5478   cache_data->set_pending(event);
5479 
5480   DBUG_RETURN(error);
5481 }
5482 
/**
  Write an event to the binary log.

  The event is not written to the log file by this function: it is
  appended to the session's statement or transaction cache, selected by
  event_info->is_using_trans_cache().  When the current statement is not
  logged in row format, the context events it depends on (last insert
  id, insert id, random seeds, user variables) are written to the same
  cache first.

  @param event_info  Event to write.  event_info->thd is the session
                     whose caches are used.

  @retval false  the event was written to the cache, or was deliberately
                 not logged (union mode is active, binary logging is
                 switched off for the session, or the database is
                 filtered out)
  @retval true   error.  This is also the value returned when the log is
                 not open, because @c error starts out as 1 and is only
                 cleared after a successful write.
*/

bool MYSQL_BIN_LOG::write_event(Log_event *event_info)
{
  /*
    NOTE(review): thd is dereferenced unconditionally right below, so the
    later "thd &&" / "if (thd)" tests cannot be what guards against a
    NULL session - confirm whether they can be dropped.
  */
  THD *thd= event_info->thd;
  bool error= 1;
  DBUG_ENTER("MYSQL_BIN_LOG::write_event(Log_event *)");

  if (thd->binlog_evt_union.do_union)
  {
    /*
      In Stored function; Remember that function call caused an update.
      We will log the function call to the binary log on function exit
    */
    thd->binlog_evt_union.unioned_events= TRUE;
    thd->binlog_evt_union.unioned_events_trans |=
      event_info->is_using_trans_cache();
    DBUG_RETURN(0);
  }

  /*
    We only end the statement if we are in a top-level statement.  If
    we are inside a stored function, we do not end the statement since
    this will close all tables on the slave. But there can be a special case
    where we are inside a stored function/trigger and a SAVEPOINT is being
    set inside the stored function/trigger. This SAVEPOINT execution will
    force the pending event to be flushed without an STMT_END_F flag. This
    will result in a case where following DMLs will be considered as part of
    same statement and result in data loss on slave. Hence in this case we
    force the end_stmt to be true.
  */
  bool const end_stmt= (thd->in_sub_stmt && thd->lex->sql_command ==
                        SQLCOM_SAVEPOINT)? true:
    (thd->locked_tables_mode && thd->lex->requires_prelocking());
  /* Flush any pending rows event first; a failure here returns error (1). */
  if (thd->binlog_flush_pending_rows_event(end_stmt,
                                           event_info->is_using_trans_cache()))
    DBUG_RETURN(error);

  /*
     In most cases this is only called if 'is_open()' is true; in fact this is
     mostly called if is_open() *was* true a few instructions before, but it
     could have changed since.
  */
  if (likely(is_open()))
  {
#ifdef HAVE_REPLICATION
    /*
      In the future we need to add to the following if tests like
      "do the involved tables match (to be implemented)
      binlog_[wild_]{do|ignore}_table?" (WL#1049)"

      The event is silently skipped (success is returned) when binary
      logging is disabled for the session, or when the statement is
      neither SAVEPOINT nor ROLLBACK TO SAVEPOINT, the event is subject
      to filtering and binlog_filter rejects its database.
    */
    const char *local_db= event_info->get_db();
    if ((thd && !(thd->variables.option_bits & OPTION_BIN_LOG)) ||
	(thd->lex->sql_command != SQLCOM_ROLLBACK_TO_SAVEPOINT &&
         thd->lex->sql_command != SQLCOM_SAVEPOINT &&
         (!event_info->is_no_filter_event() &&
          !binlog_filter->db_ok(local_db))))
      DBUG_RETURN(0);
#endif /* HAVE_REPLICATION */

    DBUG_ASSERT(event_info->is_using_trans_cache() || event_info->is_using_stmt_cache());

    if (binlog_start_trans_and_stmt(thd, event_info))
      DBUG_RETURN(error);

    /*
      All three locals are initialised before the first "goto err" so the
      error path below can use them.
    */
    bool is_trans_cache= event_info->is_using_trans_cache();
    binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(thd);
    binlog_cache_data *cache_data= cache_mngr->get_binlog_cache_data(is_trans_cache);

    DBUG_PRINT("info",("event type: %d",event_info->get_type_code()));

    /*
       No check for auto events flag here - this write method should
       never be called if auto-events are enabled.

       Write first log events which describe the 'run environment'
       of the SQL command. If row-based binlogging, Insert_id, Rand
       and other kind of "setting context" events are not needed.
    */
    if (thd)
    {
      if (!thd->is_current_stmt_binlog_format_row())
      {
        /* LAST_INSERT_ID() value of the previous statement, if used. */
        if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
        {
          Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT,
                             thd->first_successful_insert_id_in_prev_stmt_for_binlog,
                             event_info->event_cache_type, event_info->event_logging_type);
          if (cache_data->write_event(thd, &e))
            goto err;
        }
        /* Smallest auto-increment value generated by this statement. */
        if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
        {
          DBUG_PRINT("info",("number of auto_inc intervals: %u",
                             thd->auto_inc_intervals_in_cur_stmt_for_binlog.
                             nb_elements()));
          Intvar_log_event e(thd, (uchar) INSERT_ID_EVENT,
                             thd->auto_inc_intervals_in_cur_stmt_for_binlog.
                             minimum(), event_info->event_cache_type,
                             event_info->event_logging_type);
          if (cache_data->write_event(thd, &e))
            goto err;
        }
        /* Saved random seeds, if the statement used them. */
        if (thd->rand_used)
        {
          Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2,
                           event_info->event_cache_type,
                           event_info->event_logging_type);
          if (cache_data->write_event(thd, &e))
            goto err;
        }
        /* One User_var event per entry recorded in thd->user_var_events. */
        if (thd->user_var_events.elements)
        {
          for (uint i= 0; i < thd->user_var_events.elements; i++)
          {
            BINLOG_USER_VAR_EVENT *user_var_event;
            get_dynamic(&thd->user_var_events,(uchar*) &user_var_event, i);

            /* setting flags for user var log event */
            uchar flags= User_var_log_event::UNDEF_F;
            if (user_var_event->unsigned_flag)
              flags|= User_var_log_event::UNSIGNED_F;

            User_var_log_event e(thd,
                                 user_var_event->user_var_event->entry_name.ptr(),
                                 user_var_event->user_var_event->entry_name.length(),
                                 user_var_event->value,
                                 user_var_event->length,
                                 user_var_event->type,
                                 user_var_event->charset_number, flags,
                                 event_info->event_cache_type,
                                 event_info->event_logging_type);
            if (cache_data->write_event(thd, &e))
              goto err;
          }
        }
      }
    }

    /*
      Write the event.  The "injecting_fault_writing" debug keyword
      forces the error path even when the write itself succeeded.
    */
    if (cache_data->write_event(thd, event_info) ||
        DBUG_EVALUATE_IF("injecting_fault_writing", 1, 0))
      goto err;

    /*
      After writing the event, if the trx-cache was used and any unsafe
      change was written into it, the cache is marked as cannot safely
      roll back.
    */
    if (is_trans_cache && stmt_cannot_safely_rollback(thd))
      cache_mngr->trx_cache.set_cannot_rollback();

    error= 0;

err:
    /*
      Reached both on success (error == 0, nothing to do) and through
      "goto err" (error still 1): record the write error in the session
      and flag the cache with an incident if the statement cannot be
      rolled back safely.
    */
    if (error)
    {
      set_write_error(thd, is_trans_cache);
      if (check_write_error(thd) && cache_data &&
          stmt_cannot_safely_rollback(thd))
        cache_data->set_incident();
    }
  }

  DBUG_RETURN(error);
}
5653 
5654 /**
5655   The method executes rotation when LOCK_log is already acquired
5656   by the caller.
5657 
5658   @param force_rotate  caller can request the log rotation
5659   @param check_purge   is set to true if rotation took place
5660 
5661   @note
5662     If rotation fails, for instance the server was unable
5663     to create a new log file, we still try to write an
5664     incident event to the current log.
5665 
5666   @note The caller must hold LOCK_log when invoking this function.
5667 
5668   @retval
5669     nonzero - error in rotating routine.
5670 */
rotate(bool force_rotate,bool * check_purge)5671 int MYSQL_BIN_LOG::rotate(bool force_rotate, bool* check_purge)
5672 {
5673   int error= 0;
5674   DBUG_ENTER("MYSQL_BIN_LOG::rotate");
5675 
5676   DBUG_ASSERT(!is_relay_log);
5677   mysql_mutex_assert_owner(&LOCK_log);
5678 
5679   *check_purge= false;
5680 
5681   if (DBUG_EVALUATE_IF("force_rotate", 1, 0) || force_rotate ||
5682       (my_b_tell(&log_file) >= (my_off_t) max_size))
5683   {
5684     error= new_file_without_locking(NULL);
5685     *check_purge= true;
5686   }
5687   DBUG_RETURN(error);
5688 }
5689 
5690 /**
5691   The method executes logs purging routine.
5692 
5693   @retval
5694     nonzero - error in rotating routine.
5695 */
purge()5696 void MYSQL_BIN_LOG::purge()
5697 {
5698 #ifdef HAVE_REPLICATION
5699   if (expire_logs_days)
5700   {
5701     DEBUG_SYNC(current_thd, "at_purge_logs_before_date");
5702     time_t purge_time= my_time(0) - expire_logs_days*24*60*60;
5703     DBUG_EXECUTE_IF("expire_logs_always",
5704                     { purge_time= my_time(0);});
5705     if (purge_time >= 0)
5706     {
5707       /*
5708         Flush logs for storage engines, so that the last transaction
5709         is fsynced inside storage engines.
5710       */
5711       ha_flush_logs(NULL);
5712       purge_logs_before_date(purge_time, true);
5713     }
5714   }
5715 #endif
5716 }
5717 
5718 /**
5719   The method is a shortcut of @c rotate() and @c purge().
5720   LOCK_log is acquired prior to rotate and is released after it.
5721 
5722   @param force_rotate  caller can request the log rotation
5723 
5724   @retval
5725     nonzero - error in rotating routine.
5726 */
rotate_and_purge(THD * thd,bool force_rotate)5727 int MYSQL_BIN_LOG::rotate_and_purge(THD* thd, bool force_rotate)
5728 {
5729   int error= 0;
5730   DBUG_ENTER("MYSQL_BIN_LOG::rotate_and_purge");
5731   bool check_purge= false;
5732 
5733   /*
5734     Wait for handlerton to insert any pending information into the binlog.
5735     For e.g. ha_ndbcluster which updates the binlog asynchronously this is
5736     needed so that the user see its own commands in the binlog.
5737   */
5738   ha_binlog_wait(thd);
5739 
5740   DBUG_ASSERT(!is_relay_log);
5741   mysql_mutex_lock(&LOCK_log);
5742   error= rotate(force_rotate, &check_purge);
5743   /*
5744     NOTE: Run purge_logs wo/ holding LOCK_log because it does not need
5745           the mutex. Otherwise causes various deadlocks.
5746   */
5747   mysql_mutex_unlock(&LOCK_log);
5748 
5749   if (!error && check_purge)
5750     purge();
5751 
5752   DBUG_RETURN(error);
5753 }
5754 
next_file_id()5755 uint MYSQL_BIN_LOG::next_file_id()
5756 {
5757   uint res;
5758   mysql_mutex_lock(&LOCK_log);
5759   res = file_id++;
5760   mysql_mutex_unlock(&LOCK_log);
5761   return res;
5762 }
5763 
5764 
5765 /**
5766   Calculate checksum of possibly a part of an event containing at least
5767   the whole common header.
5768 
5769   @param    buf       the pointer to trans cache's buffer
5770   @param    off       the offset of the beginning of the event in the buffer
5771   @param    event_len no-checksum length of the event
5772   @param    length    the current size of the buffer
5773 
5774   @param    crc       [in-out] the checksum
5775 
5776   Event size in incremented by @c BINLOG_CHECKSUM_LEN.
5777 
5778   @return 0 or number of unprocessed yet bytes of the event excluding
5779             the checksum part.
5780 */
fix_log_event_crc(uchar * buf,uint off,uint event_len,uint length,ha_checksum * crc)5781   static ulong fix_log_event_crc(uchar *buf, uint off, uint event_len,
5782                                  uint length, ha_checksum *crc)
5783 {
5784   ulong ret;
5785   uchar *event_begin= buf + off;
5786   uint16 flags= uint2korr(event_begin + FLAGS_OFFSET);
5787 
5788   DBUG_ASSERT(length >= off + LOG_EVENT_HEADER_LEN); //at least common header in
5789   int2store(event_begin + FLAGS_OFFSET, flags);
5790   ret= length >= off + event_len ? 0 : off + event_len - length;
5791   *crc= my_checksum(*crc, event_begin, event_len - ret);
5792   return ret;
5793 }
5794 
/*
  Write the contents of a cache to the binary log.

  SYNOPSIS
    do_write_cache()
    cache    Cache to write to the binary log

  DESCRIPTION
    Write the contents of the cache to the binary log. The cache will
    be reset as a READ_CACHE to be able to read the contents from it.

    The cache is read segment by segment.  For every event found, the
    end_log_pos stored in its header is made absolute and, when
    checksums are enabled (per @c binlog_checksum_options), the event
    length in the header is increased by BINLOG_CHECKSUM_LEN and a
    checksum is written to the log right after the event.

    No mutex is taken or asserted in this function.

  RETURN
    0                  success
    ER_ERROR_ON_WRITE  the cache could not be re-initialised for reading,
                       or a write to the log file failed
*/

int MYSQL_BIN_LOG::do_write_cache(IO_CACHE *cache)
{
  DBUG_ENTER("MYSQL_BIN_LOG::do_write_cache(IO_CACHE *)");

  DBUG_EXECUTE_IF("simulate_do_write_cache_failure",
                  {
                    /*
                       see binlog_cache_data::write_event() that reacts on
                       @c simulate_disk_full_at_flush_pending.
                    */
                    DBUG_SET("-d,simulate_do_write_cache_failure");
                    DBUG_RETURN(ER_ERROR_ON_WRITE);
                  });

  if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
    DBUG_RETURN(ER_ERROR_ON_WRITE);
  /*
    length   - number of bytes in the current segment of the cache
    group    - position in log_file where this cache's content starts
    carry    - number of header bytes saved from the end of the previous
               segment when an event header was split across segments
    hdr_offs - offset of the next event header in the current segment
  */
  uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
  ulong remains= 0; // part of unprocessed yet netto length of the event
  long val;
  ulong end_log_pos_inc= 0; // grows by BINLOG_CHECKSUM_LEN per processed event when checksumming
  uchar header[LOG_EVENT_HEADER_LEN];
  ha_checksum crc= 0, crc_0= 0; // assignments to keep compiler happy
  my_bool do_checksum= (binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF);
  uchar buf[BINLOG_CHECKSUM_LEN]; // serialised checksum of one event

  // while there is just one alg the following must hold:
  DBUG_ASSERT(!do_checksum ||
              binlog_checksum_options == BINLOG_CHECKSUM_ALG_CRC32);

  /*
    The events in the buffer have incorrect end_log_pos data
    (relative to beginning of group rather than absolute),
    so we'll recalculate them in situ so the binlog is always
    correct, even in the middle of a group. This is possible
    because we now know the start position of the group (the
    offset of this cache in the log, if you will); all we need
    to do is to find all event-headers, and add the position of
    the group to the end_log_pos of each event.  This is pretty
    straight forward, except that we read the cache in segments,
    so an event-header might end up on the cache-border and get
    split.
  */

  group= (uint)my_b_tell(&log_file);
  DBUG_PRINT("debug", ("length: %llu, group: %llu",
                       (ulonglong) length, (ulonglong) group));
  hdr_offs= carry= 0;
  /* crc_0 is the initial checksum value; crc is reset to it after each event. */
  if (do_checksum)
    crc= crc_0= my_checksum(0L, NULL, 0);

  if (DBUG_EVALUATE_IF("fault_injection_crc_value", 1, 0))
    crc= crc - 1;

  do
  {
    /*
      if we only got a partial header in the last iteration,
      get the other half now and process a full header.
    */
    if (unlikely(carry > 0))
    {
      DBUG_ASSERT(carry < LOG_EVENT_HEADER_LEN);

      /* assemble both halves */
      memcpy(&header[carry], (char *)cache->read_pos,
             LOG_EVENT_HEADER_LEN - carry);

      /* fix end_log_pos */
      val=uint4korr(header + LOG_POS_OFFSET);
      val+= group +
        (end_log_pos_inc+= (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
      int4store(&header[LOG_POS_OFFSET], val);

      if (do_checksum)
      {
        ulong len= uint4korr(header + EVENT_LEN_OFFSET);
        /* fix len */
        int4store(&header[EVENT_LEN_OFFSET], len + BINLOG_CHECKSUM_LEN);
      }

      /* write the first half of the split header */
      if (my_b_write(&log_file, header, carry))
        DBUG_RETURN(ER_ERROR_ON_WRITE);

      /*
        copy fixed second half of header to cache so the correct
        version will be written later.
      */
      memcpy((char *)cache->read_pos, &header[carry],
             LOG_EVENT_HEADER_LEN - carry);

      /*
        next event header at ...
        (the stored length already includes the checksum when
        do_checksum is set, hence the subtraction)
      */
      hdr_offs= uint4korr(header + EVENT_LEN_OFFSET) - carry -
        (do_checksum ? BINLOG_CHECKSUM_LEN : 0);

      if (do_checksum)
      {
        DBUG_ASSERT(crc == crc_0 && remains == 0);
        /* checksum the first half now; the rest follows from the cache */
        crc= my_checksum(crc, header, carry);
        remains= uint4korr(header + EVENT_LEN_OFFSET) - carry -
          BINLOG_CHECKSUM_LEN;
      }
      carry= 0;
    }

    /* if there is anything to write, process it. */

    if (likely(length > 0))
    {
      /*
        process all event-headers in this (partial) cache.
        if next header is beyond current read-buffer,
        we'll get it later (though not necessarily in the
        very next iteration, just "eventually").
      */

      /*
        crc-calc the whole buffer: no event header starts in this
        segment, so all of it belongs to the event in progress.
      */
      if (do_checksum && hdr_offs >= length)
      {

        DBUG_ASSERT(remains != 0 && crc != crc_0);

        crc= my_checksum(crc, cache->read_pos, length);
        remains -= length;
        if (my_b_write(&log_file, cache->read_pos, length))
          DBUG_RETURN(ER_ERROR_ON_WRITE);
        if (remains == 0)
        {
          /* the event ends exactly at the end of the segment */
          int4store(buf, crc);
          if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
            DBUG_RETURN(ER_ERROR_ON_WRITE);
          crc= crc_0;
        }
      }

      while (hdr_offs < length)
      {
        /*
          partial header only? save what we can get, process once
          we get the rest.
        */

        if (do_checksum)
        {
          if (remains != 0)
          {
            /*
              finish off with remains of the last event that crawls
              from previous into the current buffer
            */
            DBUG_ASSERT(crc != crc_0);
            crc= my_checksum(crc, cache->read_pos, hdr_offs);
            int4store(buf, crc);
            remains -= hdr_offs;
            DBUG_ASSERT(remains == 0);
            if (my_b_write(&log_file, cache->read_pos, hdr_offs) ||
                my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
              DBUG_RETURN(ER_ERROR_ON_WRITE);
            crc= crc_0;
          }
        }

        if (hdr_offs + LOG_EVENT_HEADER_LEN > length)
        {
          /*
            the header is cut off by the end of the segment: keep its
            first bytes in header[] and shrink length so that the loop
            ends and these bytes are not treated as part of this segment
          */
          carry= length - hdr_offs;
          memcpy(header, (char *)cache->read_pos + hdr_offs, carry);
          length= hdr_offs;
        }
        else
        {
          /* we've got a full event-header, and it came in one piece */
          uchar *ev= (uchar *)cache->read_pos + hdr_offs;
          uint event_len= uint4korr(ev + EVENT_LEN_OFFSET); // netto len
          uchar *log_pos= ev + LOG_POS_OFFSET;

          /* fix end_log_pos */
          val= uint4korr(log_pos) + group +
            (end_log_pos_inc += (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
          int4store(log_pos, val);

	  /* fix CRC */
	  if (do_checksum)
          {
            /* fix length */
            int4store(ev + EVENT_LEN_OFFSET, event_len + BINLOG_CHECKSUM_LEN);
            remains= fix_log_event_crc(cache->read_pos, hdr_offs, event_len,
                                       length, &crc);
            /* write the whole event, or only the part present in this segment */
            if (my_b_write(&log_file, ev,
                           remains == 0 ? event_len : length - hdr_offs))
              DBUG_RETURN(ER_ERROR_ON_WRITE);
            if (remains == 0)
            {
              int4store(buf, crc);
              if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
                DBUG_RETURN(ER_ERROR_ON_WRITE);
              crc= crc_0; // crc is complete
            }
          }

          /* next event header at ... */
          hdr_offs += event_len; // incr by the netto len

          DBUG_ASSERT(!do_checksum || remains == 0 || hdr_offs >= length);
        }
      }

      /*
        Adjust hdr_offs. Note that it may still point beyond the segment
        read in the next iteration; if the current event is very long,
        it may take a couple of read-iterations (and subsequent adjustments
        of hdr_offs) for it to point into the then-current segment.
        If we have a split header (carry != 0), hdr_offs will be set at the
        beginning of the next iteration, overwriting the value we set here:
      */
      hdr_offs -= length;
    }

    /*
      Write the entire buf to the binary log file.  With checksums
      enabled the data has already been written piecewise above.
    */
    if (!do_checksum)
      if (my_b_write(&log_file, cache->read_pos, length))
        DBUG_RETURN(ER_ERROR_ON_WRITE);
    cache->read_pos=cache->read_end;		// Mark buffer used up
  } while ((length= my_b_fill(cache)));

  /* Everything must have been consumed: no split header, no open event. */
  DBUG_ASSERT(carry == 0);
  DBUG_ASSERT(!do_checksum || remains == 0);
  DBUG_ASSERT(!do_checksum || crc == crc_0);

  DBUG_RETURN(0); // All OK
}
6042 
6043 /**
6044   Writes an incident event to the binary log.
6045 
6046   @param ev Incident event to be written
6047   @param need_lock_log If true, will acquire LOCK_log; otherwise the
6048   caller should already have acquired LOCK_log.
6049   @do_flush_and_sync If true, will call flush_and_sync(), rotate() and
6050   purge().
6051 
6052   @retval false error
6053   @retval true success
6054 */
write_incident(Incident_log_event * ev,bool need_lock_log,bool do_flush_and_sync)6055 bool MYSQL_BIN_LOG::write_incident(Incident_log_event *ev, bool need_lock_log,
6056                                    bool do_flush_and_sync)
6057 {
6058   uint error= 0;
6059   DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
6060 
6061   if (!is_open())
6062     DBUG_RETURN(error);
6063 
6064   if (need_lock_log)
6065     mysql_mutex_lock(&LOCK_log);
6066   else
6067     mysql_mutex_assert_owner(&LOCK_log);
6068 
6069   // @todo make this work with the group log. /sven
6070 
6071   error= ev->write(&log_file);
6072 
6073   if (do_flush_and_sync)
6074   {
6075     if (!error && !(error= flush_and_sync()))
6076     {
6077       bool check_purge= false;
6078       signal_update();
6079       error= rotate(true, &check_purge);
6080       if (!error && check_purge)
6081         purge();
6082     }
6083   }
6084 
6085   if (need_lock_log)
6086     mysql_mutex_unlock(&LOCK_log);
6087 
6088   DBUG_RETURN(error);
6089 }
6090 
write_dml_directly(THD * thd,const char * stmt,size_t stmt_len,enum_sql_command sql_command)6091 bool MYSQL_BIN_LOG::write_dml_directly(THD* thd, const char *stmt, size_t stmt_len,
6092                                        enum_sql_command sql_command)
6093 {
6094   bool ret= false;
6095   /* backup the original command */
6096   enum_sql_command save_sql_command= thd->lex->sql_command;
6097   thd->lex->sql_command= sql_command;
6098 
6099   if (thd->binlog_query(THD::STMT_QUERY_TYPE, stmt, stmt_len,
6100                         FALSE, FALSE, FALSE, 0) ||
6101       commit(thd, false) != TC_LOG::RESULT_SUCCESS)
6102   {
6103     ret= true;
6104   }
6105 
6106   thd->lex->sql_command= save_sql_command;
6107   return ret;
6108 }
6109 
6110 
6111 /**
6112   Creates an incident event and writes it to the binary log.
6113 
6114   @param thd  Thread variable
6115   @param ev   Incident event to be written
6116   @param lock If the binary lock should be locked or not
6117 
6118   @retval
6119     0    error
6120   @retval
6121     1    success
6122 */
write_incident(THD * thd,bool need_lock_log,bool do_flush_and_sync)6123 bool MYSQL_BIN_LOG::write_incident(THD *thd, bool need_lock_log,
6124                                    bool do_flush_and_sync)
6125 {
6126   DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
6127 
6128   if (!is_open())
6129     DBUG_RETURN(0);
6130 
6131   LEX_STRING const write_error_msg=
6132     { C_STRING_WITH_LEN("error writing to the binary log") };
6133   Incident incident= INCIDENT_LOST_EVENTS;
6134   Incident_log_event ev(thd, incident, write_error_msg);
6135 
6136   DBUG_RETURN(write_incident(&ev, need_lock_log, do_flush_and_sync));
6137 }
6138 
6139 /**
6140   Write a cached log entry to the binary log.
6141 
6142   @param thd            Thread variable
6143   @param cache		The cache to copy to the binlog
6144   @param incident       Defines if an incident event should be created to
6145                         notify that some non-transactional changes did
6146                         not get into the binlog.
6147   @param prepared       Defines if a transaction is part of a 2-PC.
6148 
6149   @note
6150     We only come here if there is something in the cache.
6151   @note
6152     The thing in the cache is always a complete transaction.
6153   @note
6154     'cache' needs to be reinitialized after this functions returns.
6155 */
6156 
write_cache(THD * thd,binlog_cache_data * cache_data)6157 bool MYSQL_BIN_LOG::write_cache(THD *thd, binlog_cache_data *cache_data)
6158 {
6159   DBUG_ENTER("MYSQL_BIN_LOG::write_cache(THD *, binlog_cache_data *, bool)");
6160 
6161   IO_CACHE *cache= &cache_data->cache_log;
6162   bool incident= cache_data->has_incident();
6163 
6164   DBUG_EXECUTE_IF("simulate_binlog_flush_error",
6165                   {
6166                     if (rand() % 3 == 0)
6167                     {
6168                       write_error=1;
6169                       thd->commit_error= THD::CE_FLUSH_ERROR;
6170                       DBUG_RETURN(0);
6171                     }
6172                   };);
6173 
6174   mysql_mutex_assert_owner(&LOCK_log);
6175 
6176   DBUG_ASSERT(is_open());
6177   if (likely(is_open()))                       // Should always be true
6178   {
6179     /*
6180       We only bother to write to the binary log if there is anything
6181       to write.
6182      */
6183     if (my_b_tell(cache) > 0)
6184     {
6185       DBUG_EXECUTE_IF("crash_before_writing_xid",
6186                       {
6187                         if ((write_error= do_write_cache(cache)))
6188                           DBUG_PRINT("info", ("error writing binlog cache: %d",
6189                                                write_error));
6190                         flush_and_sync(true);
6191                         DBUG_PRINT("info", ("crashing before writing xid"));
6192                         DBUG_SUICIDE();
6193                       });
6194 
6195       if ((write_error= do_write_cache(cache)))
6196         goto err;
6197 
6198       if (incident && write_incident(thd, false/*need_lock_log=false*/,
6199                                      false/*do_flush_and_sync==false*/))
6200         goto err;
6201 
6202       DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
6203       if (cache->error)				// Error on read
6204       {
6205         char errbuf[MYSYS_STRERROR_SIZE];
6206         sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name,
6207                         errno, my_strerror(errbuf, sizeof(errbuf), errno));
6208         write_error=1;				// Don't give more errors
6209         goto err;
6210       }
6211 
6212       global_sid_lock->rdlock();
6213       if (gtid_state->update_on_flush(thd) != RETURN_STATUS_OK)
6214       {
6215         global_sid_lock->unlock();
6216         goto err;
6217       }
6218       global_sid_lock->unlock();
6219     }
6220     update_thd_next_event_pos(thd);
6221   }
6222 
6223   DBUG_RETURN(0);
6224 
6225 err:
6226   if (!write_error)
6227   {
6228     char errbuf[MYSYS_STRERROR_SIZE];
6229     write_error= 1;
6230     sql_print_error(ER(ER_ERROR_ON_WRITE), name,
6231                     errno, my_strerror(errbuf, sizeof(errbuf), errno));
6232   }
6233 
6234   /*
6235     If the flush has failed due to ENOSPC, set the flush_error flag.
6236   */
6237   if (cache->error && thd->is_error() && my_errno == ENOSPC)
6238   {
6239     cache_data->set_flush_error(thd);
6240   }
6241   thd->commit_error= THD::CE_FLUSH_ERROR;
6242 
6243   DBUG_RETURN(1);
6244 }
6245 
6246 
6247 /**
6248   Wait until we get a signal that the relay log has been updated.
6249 
6250   @param[in] thd        Thread variable
6251   @param[in] timeout    a pointer to a timespec;
6252                         NULL means to wait w/o timeout.
6253 
6254   @retval    0          if got signalled on update
6255   @retval    non-0      if wait timeout elapsed
6256 
6257   @note
6258     One must have a lock on LOCK_log before calling this function.
6259 */
6260 
wait_for_update_relay_log(THD * thd,const struct timespec * timeout)6261 int MYSQL_BIN_LOG::wait_for_update_relay_log(THD* thd, const struct timespec *timeout)
6262 {
6263   int ret= 0;
6264   PSI_stage_info old_stage;
6265   DBUG_ENTER("wait_for_update_relay_log");
6266 
6267   thd->ENTER_COND(&update_cond, &LOCK_log,
6268                   &stage_slave_has_read_all_relay_log,
6269                   &old_stage);
6270 
6271   if (!timeout)
6272     mysql_cond_wait(&update_cond, &LOCK_log);
6273   else
6274     ret= mysql_cond_timedwait(&update_cond, &LOCK_log,
6275                               const_cast<struct timespec *>(timeout));
6276   thd->EXIT_COND(&old_stage);
6277 
6278   DBUG_RETURN(ret);
6279 }
6280 
6281 /**
6282   Wait until we get a signal that the binary log has been updated.
6283   Applies to master only.
6284 
6285   NOTES
6286   @param[in] thd        a THD struct
6287   @param[in] timeout    a pointer to a timespec;
6288                         NULL means to wait w/o timeout.
6289   @retval    0          if got signalled on update
6290   @retval    non-0      if wait timeout elapsed
6291   @note
6292     LOCK_log must be taken before calling this function.
6293     LOCK_log is being released while the thread is waiting.
6294     LOCK_log is released by the caller.
6295 */
6296 
wait_for_update_bin_log(THD * thd,const struct timespec * timeout)6297 int MYSQL_BIN_LOG::wait_for_update_bin_log(THD* thd,
6298                                            const struct timespec *timeout)
6299 {
6300   int ret= 0;
6301   DBUG_ENTER("wait_for_update_bin_log");
6302 
6303   if (!timeout)
6304     mysql_cond_wait(&update_cond, &LOCK_log);
6305   else
6306     ret= mysql_cond_timedwait(&update_cond, &LOCK_log,
6307                               const_cast<struct timespec *>(timeout));
6308   DBUG_RETURN(ret);
6309 }
6310 
6311 
6312 /**
6313   Close the log file.
6314 
6315   @param exiting     Bitmask for one or more of the following bits:
6316           - LOG_CLOSE_INDEX : if we should close the index file
6317           - LOG_CLOSE_TO_BE_OPENED : if we intend to call open
6318                                      at once after close.
6319           - LOG_CLOSE_STOP_EVENT : write a 'stop' event to the log
6320 
6321   @param need_lock_log If true, this function acquires LOCK_log;
6322   otherwise the caller should already have acquired it.
6323 
6324   @param need_lock_index If true, this function acquires LOCK_index;
6325   otherwise the caller should already have acquired it.
6326 
6327   @note
6328     One can do an open on the object at once after doing a close.
6329     The internal structures are not freed until cleanup() is called
6330 */
6331 
close(uint exiting,bool need_lock_log,bool need_lock_index)6332 void MYSQL_BIN_LOG::close(uint exiting, bool need_lock_log,
6333                           bool need_lock_index)
6334 {					// One can't set log_type here!
6335   DBUG_ENTER("MYSQL_BIN_LOG::close");
6336   DBUG_PRINT("enter",("exiting: %d", (int) exiting));
6337 
6338   if (need_lock_log)
6339     mysql_mutex_lock(&LOCK_log);
6340   else
6341     mysql_mutex_assert_owner(&LOCK_log);
6342 
6343   if (log_state == LOG_OPENED)
6344   {
6345 #ifdef HAVE_REPLICATION
6346     if ((exiting & LOG_CLOSE_STOP_EVENT) != 0)
6347     {
6348       Stop_log_event s;
6349       // the checksumming rule for relay-log case is similar to Rotate
6350         s.checksum_alg= is_relay_log ?
6351           relay_log_checksum_alg : binlog_checksum_options;
6352       DBUG_ASSERT(!is_relay_log ||
6353                   relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
6354       s.write(&log_file);
6355       bytes_written+= s.data_written;
6356       signal_update();
6357     }
6358 #endif /* HAVE_REPLICATION */
6359 
6360     /* don't pwrite in a file opened with O_APPEND - it doesn't work */
6361     if (log_file.type == WRITE_CACHE)
6362     {
6363       my_off_t offset= BIN_LOG_HEADER_SIZE + FLAGS_OFFSET;
6364       my_off_t org_position= mysql_file_tell(log_file.file, MYF(0));
6365       uchar flags= 0;            // clearing LOG_EVENT_BINLOG_IN_USE_F
6366       mysql_file_pwrite(log_file.file, &flags, 1, offset, MYF(0));
6367       /*
6368         Restore position so that anything we have in the IO_cache is written
6369         to the correct position.
6370         We need the seek here, as mysql_file_pwrite() is not guaranteed to keep the
6371         original position on system that doesn't support pwrite().
6372       */
6373       mysql_file_seek(log_file.file, org_position, MY_SEEK_SET, MYF(0));
6374     }
6375 
6376     /* this will cleanup IO_CACHE, sync and close the file */
6377     MYSQL_LOG::close(exiting);
6378   }
6379 
6380   /*
6381     The following test is needed even if is_open() is not set, as we may have
6382     called a not complete close earlier and the index file is still open.
6383   */
6384 
6385   if (need_lock_index)
6386     mysql_mutex_lock(&LOCK_index);
6387   else
6388     mysql_mutex_assert_owner(&LOCK_index);
6389 
6390   if ((exiting & LOG_CLOSE_INDEX) && my_b_inited(&index_file))
6391   {
6392     end_io_cache(&index_file);
6393     if (mysql_file_close(index_file.file, MYF(0)) < 0 && ! write_error)
6394     {
6395       char errbuf[MYSYS_STRERROR_SIZE];
6396       write_error= 1;
6397       sql_print_error(ER(ER_ERROR_ON_WRITE), index_file_name,
6398                       errno, my_strerror(errbuf, sizeof(errbuf), errno));
6399     }
6400   }
6401 
6402   if (need_lock_index)
6403     mysql_mutex_unlock(&LOCK_index);
6404 
6405   log_state= (exiting & LOG_CLOSE_TO_BE_OPENED) ? LOG_TO_BE_OPENED : LOG_CLOSED;
6406   my_free(name);
6407   name= NULL;
6408 
6409   if (need_lock_log)
6410     mysql_mutex_unlock(&LOCK_log);
6411 
6412   DBUG_VOID_RETURN;
6413 }
6414 
harvest_bytes_written(Relay_log_info * rli,bool need_log_space_lock)6415 void MYSQL_BIN_LOG::harvest_bytes_written(Relay_log_info* rli, bool need_log_space_lock)
6416 {
6417 #ifndef DBUG_OFF
6418   char buf1[22],buf2[22];
6419 #endif
6420   DBUG_ENTER("harvest_bytes_written");
6421   if (need_log_space_lock)
6422     mysql_mutex_lock(&rli->log_space_lock);
6423   else
6424     mysql_mutex_assert_owner(&rli->log_space_lock);
6425   rli->log_space_total+= bytes_written;
6426   DBUG_PRINT("info",("relay_log_space: %s  bytes_written: %s",
6427         llstr(rli->log_space_total,buf1), llstr(bytes_written,buf2)));
6428   bytes_written=0;
6429   if (need_log_space_lock)
6430     mysql_mutex_unlock(&rli->log_space_lock);
6431   DBUG_VOID_RETURN;
6432 }
6433 
set_max_size(ulong max_size_arg)6434 void MYSQL_BIN_LOG::set_max_size(ulong max_size_arg)
6435 {
6436   /*
6437     We need to take locks, otherwise this may happen:
6438     new_file() is called, calls open(old_max_size), then before open() starts,
6439     set_max_size() sets max_size to max_size_arg, then open() starts and
6440     uses the old_max_size argument, so max_size_arg has been overwritten and
6441     it's like if the SET command was never run.
6442   */
6443   DBUG_ENTER("MYSQL_BIN_LOG::set_max_size");
6444   mysql_mutex_lock(&LOCK_log);
6445   if (is_open())
6446     max_size= max_size_arg;
6447   mysql_mutex_unlock(&LOCK_log);
6448   DBUG_VOID_RETURN;
6449 }
6450 
6451 
signal_update()6452 void MYSQL_BIN_LOG::signal_update()
6453 {
6454   DBUG_ENTER("MYSQL_BIN_LOG::signal_update");
6455   signal_cnt++;
6456   mysql_cond_broadcast(&update_cond);
6457   DBUG_VOID_RETURN;
6458 }
6459 
6460 /****** transaction coordinator log for 2pc - binlog() based solution ******/
6461 
6462 /**
6463   @todo
6464   keep in-memory list of prepared transactions
6465   (add to list in log(), remove on unlog())
6466   and copy it to the new binlog if rotated
6467   but let's check the behaviour of tc_log_page_waits first!
6468 */
6469 
open_binlog(const char * opt_name)6470 int MYSQL_BIN_LOG::open_binlog(const char *opt_name)
6471 {
6472   LOG_INFO log_info;
6473   int      error= 1;
6474 
6475   /*
6476     This function is used for 2pc transaction coordination.  Hence, it
6477     is never used for relay logs.
6478   */
6479   DBUG_ASSERT(!is_relay_log);
6480   DBUG_ASSERT(total_ha_2pc > 1 || (1 == total_ha_2pc && opt_bin_log));
6481   DBUG_ASSERT(opt_name && opt_name[0]);
6482 
6483   if (!my_b_inited(&index_file))
6484   {
6485     /* There was a failure to open the index file, can't open the binlog */
6486     cleanup();
6487     return 1;
6488   }
6489 
6490   if (using_heuristic_recover())
6491   {
6492     /* generate a new binlog to mask a corrupted one */
6493     open_binlog(opt_name, 0, WRITE_CACHE, max_binlog_size, false,
6494                 true/*need_lock_log=true*/,
6495                 true/*need_lock_index=true*/,
6496                 true/*need_sid_lock=true*/,
6497                 NULL);
6498     cleanup();
6499     return 1;
6500   }
6501 
6502   if ((error= find_log_pos(&log_info, NullS, true/*need_lock_index=true*/)))
6503   {
6504     if (error != LOG_INFO_EOF)
6505       sql_print_error("find_log_pos() failed (error: %d)", error);
6506     else
6507       error= 0;
6508     goto err;
6509   }
6510 
6511   {
6512     const char *errmsg;
6513     IO_CACHE    log;
6514     File        file;
6515     Log_event  *ev=0;
6516     Format_description_log_event fdle(BINLOG_VERSION);
6517     char        log_name[FN_REFLEN];
6518     my_off_t    valid_pos= 0;
6519     my_off_t    binlog_size;
6520     MY_STAT     s;
6521 
6522     if (! fdle.is_valid())
6523       goto err;
6524 
6525     do
6526     {
6527       strmake(log_name, log_info.log_file_name, sizeof(log_name)-1);
6528     } while (!(error= find_next_log(&log_info, true/*need_lock_index=true*/)));
6529 
6530     if (error !=  LOG_INFO_EOF)
6531     {
6532       sql_print_error("find_log_pos() failed (error: %d)", error);
6533       goto err;
6534     }
6535 
6536     if ((file= open_binlog_file(&log, log_name, &errmsg)) < 0)
6537     {
6538       sql_print_error("%s", errmsg);
6539       goto err;
6540     }
6541 
6542     my_stat(log_name, &s, MYF(0));
6543     binlog_size= s.st_size;
6544 
6545     if ((ev= Log_event::read_log_event(&log, 0, &fdle,
6546                                        opt_master_verify_checksum)) &&
6547         ev->get_type_code() == FORMAT_DESCRIPTION_EVENT &&
6548         ev->flags & LOG_EVENT_BINLOG_IN_USE_F)
6549     {
6550       sql_print_information("Recovering after a crash using %s", opt_name);
6551       valid_pos= my_b_tell(&log);
6552       error= recover(&log, (Format_description_log_event *)ev, &valid_pos);
6553     }
6554     else
6555       error=0;
6556 
6557     delete ev;
6558     end_io_cache(&log);
6559     mysql_file_close(file, MYF(MY_WME));
6560 
6561     if (error)
6562       goto err;
6563 
6564     /* Trim the crashed binlog file to last valid transaction
6565       or event (non-transaction) base on valid_pos. */
6566     if (valid_pos > 0)
6567     {
6568       if ((file= mysql_file_open(key_file_binlog, log_name,
6569                                  O_RDWR | O_BINARY, MYF(MY_WME))) < 0)
6570       {
6571         sql_print_error("Failed to open the crashed binlog file "
6572                         "when master server is recovering it.");
6573         return -1;
6574       }
6575 
6576       /* Change binlog file size to valid_pos */
6577       if (valid_pos < binlog_size)
6578       {
6579         if (my_chsize(file, valid_pos, 0, MYF(MY_WME)))
6580         {
6581           sql_print_error("Failed to trim the crashed binlog file "
6582                           "when master server is recovering it.");
6583           mysql_file_close(file, MYF(MY_WME));
6584           return -1;
6585         }
6586         else
6587         {
6588           sql_print_information("Crashed binlog file %s size is %llu, "
6589                                 "but recovered up to %llu. Binlog trimmed to %llu bytes.",
6590                                 log_name, binlog_size, valid_pos, valid_pos);
6591         }
6592       }
6593 
6594       /* Clear LOG_EVENT_BINLOG_IN_USE_F */
6595       my_off_t offset= BIN_LOG_HEADER_SIZE + FLAGS_OFFSET;
6596       uchar flags= 0;
6597       if (mysql_file_pwrite(file, &flags, 1, offset, MYF(0)) != 1)
6598       {
6599         sql_print_error("Failed to clear LOG_EVENT_BINLOG_IN_USE_F "
6600                         "for the crashed binlog file when master "
6601                         "server is recovering it.");
6602         mysql_file_close(file, MYF(MY_WME));
6603         return -1;
6604       }
6605 
6606       mysql_file_close(file, MYF(MY_WME));
6607     } //end if
6608   }
6609 
6610 err:
6611   return error;
6612 }
6613 
6614 /** This is called on shutdown, after ha_panic. */
close()6615 void MYSQL_BIN_LOG::close()
6616 {
6617 }
6618 
6619 /*
6620   Prepare the transaction in the transaction coordinator.
6621 
6622   This function will prepare the transaction in the storage engines
6623   (by calling @c ha_prepare_low) what will write a prepare record
6624   to the log buffers.
6625 
6626   @retval 0    success
6627   @retval 1    error
6628 */
prepare(THD * thd,bool all)6629 int MYSQL_BIN_LOG::prepare(THD *thd, bool all)
6630 {
6631   DBUG_ENTER("MYSQL_BIN_LOG::prepare");
6632 
6633   int error= ha_prepare_low(thd, all);
6634 
6635   DBUG_RETURN(error);
6636 }
6637 
6638 /**
6639   Commit the transaction in the transaction coordinator.
6640 
6641   This function will commit the sessions transaction in the binary log
6642   and in the storage engines (by calling @c ha_commit_low). If the
6643   transaction was successfully logged (or not successfully unlogged)
6644   but the commit in the engines did not succed, there is a risk of
6645   inconsistency between the engines and the binary log.
6646 
6647   For binary log group commit, the commit is separated into three
6648   parts:
6649 
6650   1. First part consists of filling the necessary caches and
6651      finalizing them (if they need to be finalized). After this,
6652      nothing is added to any of the caches.
6653 
6654   2. Second part execute an ordered flush and commit. This will be
6655      done using the group commit functionality in ordered_commit.
6656 
6657   3. Third part checks any errors resulting from the ordered commit
6658      and handles them appropriately.
6659 
6660   @retval 0    success
6661   @retval 1    error, transaction was neither logged nor committed
6662   @retval 2    error, transaction was logged but not committed
6663 */
commit(THD * thd,bool all)6664 TC_LOG::enum_result MYSQL_BIN_LOG::commit(THD *thd, bool all)
6665 {
6666   DBUG_ENTER("MYSQL_BIN_LOG::commit");
6667 
6668   binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(thd);
6669   my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
6670   int error= RESULT_SUCCESS;
6671   bool stuff_logged= false;
6672 
6673   DBUG_PRINT("enter", ("thd: 0x%llx, all: %s, xid: %llu, cache_mngr: 0x%llx",
6674                        (ulonglong) thd, YESNO(all), (ulonglong) xid,
6675                        (ulonglong) cache_mngr));
6676 
6677   /*
6678     No cache manager means nothing to log, but we still have to commit
6679     the transaction.
6680    */
6681   if (cache_mngr == NULL)
6682   {
6683     if (ha_commit_low(thd, all))
6684       DBUG_RETURN(RESULT_ABORTED);
6685     DBUG_RETURN(RESULT_SUCCESS);
6686   }
6687 
6688   THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
6689 
6690   DBUG_PRINT("debug", ("in_transaction: %s, no_2pc: %s, rw_ha_count: %d",
6691                        YESNO(thd->in_multi_stmt_transaction_mode()),
6692                        YESNO(trans->no_2pc),
6693                        trans->rw_ha_count));
6694   DBUG_PRINT("debug",
6695              ("all.cannot_safely_rollback(): %s, trx_cache_empty: %s",
6696               YESNO(thd->transaction.all.cannot_safely_rollback()),
6697               YESNO(cache_mngr->trx_cache.is_binlog_empty())));
6698   DBUG_PRINT("debug",
6699              ("stmt.cannot_safely_rollback(): %s, stmt_cache_empty: %s",
6700               YESNO(thd->transaction.stmt.cannot_safely_rollback()),
6701               YESNO(cache_mngr->stmt_cache.is_binlog_empty())));
6702 
6703 
6704   /*
6705     If there are no handlertons registered, there is nothing to
6706     commit. Note that DDLs are written earlier in this case (inside
6707     binlog_query).
6708 
6709     TODO: This can be a problem in those cases that there are no
6710     handlertons registered. DDLs are one example, but the other case
6711     is MyISAM. In this case, we could register a dummy handlerton to
6712     trigger the commit.
6713 
6714     Any statement that requires logging will call binlog_query before
6715     trans_commit_stmt, so an alternative is to use the condition
6716     "binlog_query called or stmt.ha_list != 0".
6717    */
6718   if (!all && trans->ha_list == 0 &&
6719       cache_mngr->stmt_cache.is_binlog_empty())
6720     DBUG_RETURN(RESULT_SUCCESS);
6721 
6722   /*
6723     If there is anything in the stmt cache, and GTIDs are enabled,
6724     then this is a single statement outside a transaction and it is
6725     impossible that there is anything in the trx cache.  Hence, we
6726     write any empty group(s) to the stmt cache.
6727 
6728     Otherwise, we write any empty group(s) to the trx cache at the end
6729     of the transaction.
6730   */
6731   if (!cache_mngr->stmt_cache.is_binlog_empty())
6732   {
6733     error= write_empty_groups_to_cache(thd, &cache_mngr->stmt_cache);
6734     if (error == 0)
6735     {
6736       if (cache_mngr->stmt_cache.finalize(thd))
6737         DBUG_RETURN(RESULT_ABORTED);
6738       stuff_logged= true;
6739     }
6740   }
6741 
6742   /*
6743     We commit the transaction if:
6744      - We are not in a transaction and committing a statement, or
6745      - We are in a transaction and a full transaction is committed.
6746     Otherwise, we accumulate the changes.
6747   */
6748   if (!error && !cache_mngr->trx_cache.is_binlog_empty() &&
6749       ending_trans(thd, all))
6750   {
6751     const bool real_trans= (all || thd->transaction.all.ha_list == 0);
6752     /*
6753       We are committing an XA transaction if it is a "real" transaction
6754       and have an XID assigned (because some handlerton registered). A
6755       transaction is "real" if either 'all' is true or the 'all.ha_list'
6756       is empty.
6757 
6758       Note: This is kind of strange since registering the binlog
6759       handlerton will then make the transaction XA, which is not really
6760       true. This occurs for example if a MyISAM statement is executed
6761       with row-based replication on.
6762    */
6763     if (real_trans && xid && trans->rw_ha_count > 1 && !trans->no_2pc)
6764     {
6765       Xid_log_event end_evt(thd, xid);
6766       if (cache_mngr->trx_cache.finalize(thd, &end_evt))
6767         DBUG_RETURN(RESULT_ABORTED);
6768     }
6769     else
6770     {
6771       Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
6772                               true, FALSE, TRUE, 0, TRUE);
6773       if (cache_mngr->trx_cache.finalize(thd, &end_evt))
6774         DBUG_RETURN(RESULT_ABORTED);
6775     }
6776     stuff_logged= true;
6777   }
6778 
6779   /*
6780     This is part of the stmt rollback.
6781   */
6782   if (!all)
6783     cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
6784 
6785   DBUG_PRINT("debug", ("error: %d", error));
6786 
6787   if (error)
6788     DBUG_RETURN(RESULT_ABORTED);
6789 
6790   /*
6791     Now all the events are written to the caches, so we will commit
6792     the transaction in the engines. This is done using the group
6793     commit logic in ordered_commit, which will return when the
6794     transaction is committed.
6795 
6796     If the commit in the engines fail, we still have something logged
6797     to the binary log so we have to report this as a "bad" failure
6798     (failed to commit, but logged something).
6799   */
6800   if (stuff_logged)
6801   {
6802     if (ordered_commit(thd, all))
6803       DBUG_RETURN(RESULT_INCONSISTENT);
6804   }
6805   else
6806   {
6807     if (ha_commit_low(thd, all))
6808       DBUG_RETURN(RESULT_INCONSISTENT);
6809   }
6810 
6811   DBUG_RETURN(error ? RESULT_INCONSISTENT : RESULT_SUCCESS);
6812 }
6813 
6814 
6815 /**
6816    Flush caches for session.
6817 
6818    @note @c set_trans_pos is called with a pointer to the file name
6819    that the binary log currently use and a rotation will change the
6820    contents of the variable.
6821 
6822    The position is used when calling the after_flush, after_commit,
6823    and after_rollback hooks, but these have been placed so that they
6824    occur before a rotation is executed.
6825 
6826    It is the responsibility of any plugin that use this position to
6827    copy it if they need it after the hook has returned.
6828  */
6829 std::pair<int,my_off_t>
flush_thread_caches(THD * thd)6830 MYSQL_BIN_LOG::flush_thread_caches(THD *thd)
6831 {
6832   binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(thd);
6833   my_off_t bytes= 0;
6834   bool wrote_xid= false;
6835   int error= cache_mngr->flush(thd, &bytes, &wrote_xid);
6836   if (!error && bytes > 0)
6837   {
6838     /*
6839       Note that set_trans_pos does not copy the file name. See
6840       this function documentation for more info.
6841     */
6842     thd->set_trans_pos(log_file_name, my_b_tell(&log_file));
6843     if (wrote_xid)
6844       inc_prep_xids(thd);
6845   }
6846   DBUG_PRINT("debug", ("bytes: %llu", bytes));
6847   return std::make_pair(error, bytes);
6848 }
6849 
6850 
6851 /**
6852   Execute the flush stage.
6853 
6854   @param total_bytes_var Pointer to variable that will be set to total
6855   number of bytes flushed, or NULL.
6856 
6857   @param rotate_var Pointer to variable that will be set to true if
6858   binlog rotation should be performed after releasing locks. If rotate
6859   is not necessary, the variable will not be touched.
6860 
6861   @return Error code on error, zero on success
6862  */
6863 
6864 int
process_flush_stage_queue(my_off_t * total_bytes_var,bool * rotate_var,THD ** out_queue_var)6865 MYSQL_BIN_LOG::process_flush_stage_queue(my_off_t *total_bytes_var,
6866                                          bool *rotate_var,
6867                                          THD **out_queue_var)
6868 {
6869   DBUG_ASSERT(total_bytes_var && rotate_var && out_queue_var);
6870   my_off_t total_bytes= 0;
6871   int flush_error= 1;
6872   mysql_mutex_assert_owner(&LOCK_log);
6873 
6874   my_atomic_rwlock_rdlock(&opt_binlog_max_flush_queue_time_lock);
6875   const ulonglong max_udelay= my_atomic_load32(&opt_binlog_max_flush_queue_time);
6876   my_atomic_rwlock_rdunlock(&opt_binlog_max_flush_queue_time_lock);
6877   const ulonglong start_utime= max_udelay > 0 ? my_micro_time() : 0;
6878 
6879   /*
6880     First we read the queue until it either is empty or the difference
6881     between the time we started and the current time is too large.
6882 
6883     We remember the first thread we unqueued, because this will be the
6884     beginning of the out queue.
6885    */
6886   bool has_more= true;
6887   THD *first_seen= NULL;
6888   while ((max_udelay == 0 || my_micro_time() < start_utime + max_udelay) && has_more)
6889   {
6890     std::pair<bool,THD*> current= stage_manager.pop_front(Stage_manager::FLUSH_STAGE);
6891     std::pair<int,my_off_t> result= flush_thread_caches(current.second);
6892     has_more= current.first;
6893     total_bytes+= result.second;
6894     if (flush_error == 1)
6895       flush_error= result.first;
6896     if (first_seen == NULL)
6897       first_seen= current.second;
6898   }
6899 
6900   /*
6901     Either the queue is empty, or we ran out of time. If we ran out of
6902     time, we have to fetch the entire queue (and flush it) since
6903     otherwise the next batch will not have a leader.
6904    */
6905   if (has_more)
6906   {
6907     THD *queue= stage_manager.fetch_queue_for(Stage_manager::FLUSH_STAGE);
6908     for (THD *head= queue ; head ; head = head->next_to_commit)
6909     {
6910       std::pair<int,my_off_t> result= flush_thread_caches(head);
6911       total_bytes+= result.second;
6912       if (flush_error == 1)
6913         flush_error= result.first;
6914     }
6915     if (first_seen == NULL)
6916       first_seen= queue;
6917   }
6918 
6919   *out_queue_var= first_seen;
6920   *total_bytes_var= total_bytes;
6921   if (total_bytes > 0 && my_b_tell(&log_file) >= (my_off_t) max_size)
6922     *rotate_var= true;
6923   return flush_error;
6924 }
6925 
6926 
6927 /**
6928   Commit a sequence of sessions.
6929 
6930   This function commit an entire queue of sessions starting with the
6931   session in @c first. If there were an error in the flushing part of
6932   the ordered commit, the error code is passed in and all the threads
6933   are marked accordingly (but not committed).
6934 
6935   @see MYSQL_BIN_LOG::ordered_commit
6936 
6937   @param thd The "master" thread
6938   @param first First thread in the queue of threads to commit
6939  */
6940 
6941 void
process_commit_stage_queue(THD * thd,THD * first)6942 MYSQL_BIN_LOG::process_commit_stage_queue(THD *thd, THD *first)
6943 {
6944   mysql_mutex_assert_owner(&LOCK_commit);
6945   Thread_excursion excursion(thd);
6946 #ifndef DBUG_OFF
6947   thd->transaction.flags.ready_preempt= 1; // formality by the leader
6948 #endif
6949   for (THD *head= first ; head ; head = head->next_to_commit)
6950   {
6951     DBUG_PRINT("debug", ("Thread ID: %lu, commit_error: %d, flags.pending: %s",
6952                          head->thread_id, head->commit_error,
6953                          YESNO(head->transaction.flags.pending)));
6954     /*
6955       If flushing failed, set commit_error for the session, skip the
6956       transaction and proceed with the next transaction instead. This
6957       will mark all threads as failed, since the flush failed.
6958 
6959       If flush succeeded, attach to the session and commit it in the
6960       engines.
6961     */
6962 #ifndef DBUG_OFF
6963     stage_manager.clear_preempt_status(head);
6964 #endif
6965     /*
6966       Flush/Sync error should be ignored and continue
6967       to commit phase. And thd->commit_error cannot be
6968       COMMIT_ERROR at this moment.
6969     */
6970     DBUG_ASSERT(head->commit_error != THD::CE_COMMIT_ERROR);
6971     excursion.try_to_attach_to(head);
6972     bool all= head->transaction.flags.real_commit;
6973     if (head->transaction.flags.commit_low)
6974     {
6975       /* head is parked to have exited append() */
6976       DBUG_ASSERT(head->transaction.flags.ready_preempt);
6977       /*
6978         storage engine commit
6979       */
6980       if (ha_commit_low(head, all, false))
6981         head->commit_error= THD::CE_COMMIT_ERROR;
6982     }
6983     DBUG_PRINT("debug", ("commit_error: %d, flags.pending: %s",
6984                          head->commit_error,
6985                          YESNO(head->transaction.flags.pending)));
6986     /*
6987       Decrement the prepared XID counter after storage engine commit.
6988       We also need decrement the prepared XID when encountering a
6989       flush error or session attach error for avoiding 3-way deadlock
6990       among user thread, rotate thread and dump thread.
6991     */
6992     if (head->transaction.flags.xid_written)
6993       dec_prep_xids(head);
6994   }
6995 }
6996 
6997 /**
6998   Process after commit for a sequence of sessions.
6999 
7000   @param thd The "master" thread
7001   @param first First thread in the queue of threads to commit
7002  */
7003 
7004 void
process_after_commit_stage_queue(THD * thd,THD * first)7005 MYSQL_BIN_LOG::process_after_commit_stage_queue(THD *thd, THD *first)
7006 {
7007   Thread_excursion excursion(thd);
7008   for (THD *head= first; head; head= head->next_to_commit)
7009   {
7010     if (head->transaction.flags.run_hooks &&
7011         head->commit_error != THD::CE_COMMIT_ERROR)
7012     {
7013 
7014       /*
7015         TODO: This hook here should probably move outside/below this
7016               if and be the only after_commit invocation left in the
7017               code.
7018       */
7019       excursion.try_to_attach_to(head);
7020       bool all= head->transaction.flags.real_commit;
7021       (void) RUN_HOOK(transaction, after_commit, (head, all));
7022       /*
7023         When after_commit finished for the transaction, clear the run_hooks flag.
7024         This allow other parts of the system to check if after_commit was called.
7025       */
7026       head->transaction.flags.run_hooks= false;
7027     }
7028   }
7029 }
7030 
#ifndef DBUG_OFF
/**
  Names for the stages, used only in debug trace output.

  The table is indexed by the stage identifier passed to
  MYSQL_BIN_LOG::change_stage. It is read-only, so both the strings
  and the pointers to them are const.
*/
static const char *const g_stage_name[] = {
  "FLUSH",
  "SYNC",
  "COMMIT",
};
#endif
7039 
7040 
7041 /**
7042   Enter a stage of the ordered commit procedure.
7043 
7044   Entering is stage is done by:
7045 
7046   - Atomically enqueueing a queue of processes (which is just one for
7047     the first phase).
7048 
7049   - If the queue was empty, the thread is the leader for that stage
7050     and it should process the entire queue for that stage.
7051 
7052   - If the queue was not empty, the thread is a follower and can go
7053     waiting for the commit to finish.
7054 
7055   The function will lock the stage mutex if it was designated the
7056   leader for the phase.
7057 
7058   @param thd    Session structure
7059   @param stage  The stage to enter
7060   @param queue  Queue of threads to enqueue for the stage
7061   @param stage_mutex Mutex for the stage
7062 
7063   @retval true  The thread should "bail out" and go waiting for the
7064                 commit to finish
7065   @retval false The thread is the leader for the stage and should do
7066                 the processing.
7067 */
7068 
7069 bool
change_stage(THD * thd,Stage_manager::StageID stage,THD * queue,mysql_mutex_t * leave_mutex,mysql_mutex_t * enter_mutex)7070 MYSQL_BIN_LOG::change_stage(THD *thd,
7071                             Stage_manager::StageID stage, THD *queue,
7072                             mysql_mutex_t *leave_mutex,
7073                             mysql_mutex_t *enter_mutex)
7074 {
7075   DBUG_ENTER("MYSQL_BIN_LOG::change_stage");
7076   DBUG_PRINT("enter", ("thd: 0x%llx, stage: %s, queue: 0x%llx",
7077                        (ulonglong) thd, g_stage_name[stage], (ulonglong) queue));
7078   DBUG_ASSERT(0 <= stage && stage < Stage_manager::STAGE_COUNTER);
7079   DBUG_ASSERT(enter_mutex);
7080   DBUG_ASSERT(queue);
7081   /*
7082     enroll_for will release the leave_mutex once the sessions are
7083     queued.
7084   */
7085   if (!stage_manager.enroll_for(stage, queue, leave_mutex))
7086   {
7087     DBUG_ASSERT(!thd_get_cache_mngr(thd)->dbug_any_finalized());
7088     DBUG_RETURN(true);
7089   }
7090   mysql_mutex_lock(enter_mutex);
7091   DBUG_RETURN(false);
7092 }
7093 
7094 
7095 
7096 /**
7097   Flush the I/O cache to file.
7098 
7099   Flush the binary log to the binlog file if any byte where written
7100   and signal that the binary log file has been updated if the flush
7101   succeeds.
7102 */
7103 
7104 int
flush_cache_to_file(my_off_t * end_pos_var)7105 MYSQL_BIN_LOG::flush_cache_to_file(my_off_t *end_pos_var)
7106 {
7107   if (flush_io_cache(&log_file))
7108   {
7109     THD *thd= current_thd;
7110     thd->commit_error= THD::CE_FLUSH_ERROR;
7111     return ER_ERROR_ON_WRITE;
7112   }
7113   *end_pos_var= my_b_tell(&log_file);
7114   return 0;
7115 }
7116 
7117 
7118 /**
7119   Call fsync() to sync the file to disk.
7120 */
7121 std::pair<bool, bool>
sync_binlog_file(bool force)7122 MYSQL_BIN_LOG::sync_binlog_file(bool force)
7123 {
7124   bool synced= false;
7125   unsigned int sync_period= get_sync_period();
7126   if (force || (sync_period && ++sync_counter >= sync_period))
7127   {
7128     sync_counter= 0;
7129 
7130     /**
7131       On *pure non-transactional* workloads there is a small window
7132       in time where a concurrent rotate might be able to close
7133       the file before the sync is actually done. In that case,
7134       ignore the bad file descriptor errors.
7135 
7136       Transactional workloads (InnoDB) are not affected since the
7137       the rotation will not happen until all transactions have
7138       committed to the storage engine, thence decreased the XID
7139       counters.
7140 
7141       TODO: fix this properly even for non-transactional storage
7142             engines.
7143      */
7144     if (DBUG_EVALUATE_IF("simulate_error_during_sync_binlog_file", 1,
7145                          mysql_file_sync(log_file.file,
7146                                          MYF(MY_WME | MY_IGNORE_BADFD))))
7147     {
7148       THD *thd= current_thd;
7149       thd->commit_error= THD::CE_SYNC_ERROR;
7150       return std::make_pair(true, synced);
7151     }
7152     synced= true;
7153   }
7154   return std::make_pair(false, synced);
7155 }
7156 
7157 
7158 /**
7159    Helper function executed when leaving @c ordered_commit.
7160 
7161    This function contain the necessary code for fetching the error
7162    code, doing post-commit checks, and wrapping up the commit if
7163    necessary.
7164 
7165    It is typically called when enter_stage indicates that the thread
7166    should bail out, and also when the ultimate leader thread finishes
7167    executing @c ordered_commit.
7168 
7169    It is typically used in this manner:
7170    @code
7171    if (enter_stage(thd, Thread_queue::FLUSH_STAGE, thd, &LOCK_log))
7172      return finish_commit(thd);
7173    @endcode
7174 
7175    @return Error code if the session commit failed, or zero on
7176    success.
7177  */
7178 int
finish_commit(THD * thd)7179 MYSQL_BIN_LOG::finish_commit(THD *thd)
7180 {
7181   /*
7182     In some unlikely situations, it can happen that binary
7183     log is closed before the thread flushes it's cache.
7184     In that case, clear the caches before doing commit.
7185   */
7186   if (unlikely(!is_open()))
7187   {
7188     binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(thd);
7189     if (cache_mngr)
7190       cache_mngr->reset();
7191   }
7192   if (thd->transaction.flags.commit_low)
7193   {
7194     const bool all= thd->transaction.flags.real_commit;
7195     /*
7196       storage engine commit
7197     */
7198     DBUG_ASSERT(thd->commit_error != THD::CE_COMMIT_ERROR);
7199     if (ha_commit_low(thd, all, false))
7200       thd->commit_error= THD::CE_COMMIT_ERROR;
7201     /*
7202       Decrement the prepared XID counter after storage engine commit
7203     */
7204     if (thd->transaction.flags.xid_written)
7205       dec_prep_xids(thd);
7206     /*
7207       If commit succeeded, we call the after_commit hook
7208 
7209       TODO: This hook here should probably move outside/below this
7210             if and be the only after_commit invocation left in the
7211             code.
7212     */
7213     if ((thd->commit_error != THD::CE_COMMIT_ERROR ) && thd->transaction.flags.run_hooks)
7214     {
7215       (void) RUN_HOOK(transaction, after_commit, (thd, all));
7216       thd->transaction.flags.run_hooks= false;
7217     }
7218   }
7219   else if (thd->transaction.flags.xid_written)
7220     dec_prep_xids(thd);
7221 
7222   /*
7223     Remove committed GTID from owned_gtids, it was already logged on
7224     MYSQL_BIN_LOG::write_cache().
7225   */
7226   global_sid_lock->rdlock();
7227   gtid_state->update_on_commit(thd);
7228   global_sid_lock->unlock();
7229 
7230   DBUG_ASSERT(thd->commit_error || !thd->transaction.flags.run_hooks);
7231   DBUG_ASSERT(!thd_get_cache_mngr(thd)->dbug_any_finalized());
7232   DBUG_PRINT("return", ("Thread ID: %lu, commit_error: %d",
7233                         thd->thread_id, thd->commit_error));
7234   /*
7235     flush or sync errors are handled by the leader of the group
7236     (using binlog_error_action). Hence treat only COMMIT_ERRORs as errors.
7237   */
7238   return (thd->commit_error == THD::CE_COMMIT_ERROR);
7239 }
7240 
7241 /**
7242   Helper function to handle flush or sync stage errors.
7243   If binlog_error_action= ABORT_SERVER, server will be aborted
7244   after reporting the error to the client.
7245   If binlog_error_action= IGNORE_ERROR, binlog will be closed
7246   for the life time of the server. close() call is protected
7247   with LOCK_log to avoid any parallel operations on binary log.
7248 
7249   @param thd Thread object that faced flush/sync error
7250   @param need_lock_log
7251                        > Indicates true if LOCk_log is needed before closing
7252                          binlog (happens when we are handling sync error)
7253                        > Indicates false if LOCK_log is already acquired
7254                          by the thread (happens when we are handling flush
7255                          error)
7256 
7257   @return void
7258 */
handle_binlog_flush_or_sync_error(THD * thd,bool need_lock_log)7259 void MYSQL_BIN_LOG::handle_binlog_flush_or_sync_error(THD *thd,
7260                                                       bool need_lock_log)
7261 {
7262   char errmsg[MYSQL_ERRMSG_SIZE];
7263   sprintf(errmsg, "An error occurred during %s stage of the commit. "
7264           "'binlog_error_action' is set to '%s'.",
7265           thd->commit_error== THD::CE_FLUSH_ERROR ? "flush" : "sync",
7266           binlog_error_action == ABORT_SERVER ? "ABORT_SERVER" : "IGNORE_ERROR");
7267   if (binlog_error_action == ABORT_SERVER)
7268   {
7269     char err_buff[MYSQL_ERRMSG_SIZE];
7270     sprintf(err_buff, "%s Hence aborting the server.", errmsg);
7271     exec_binlog_error_action_abort(err_buff);
7272   }
7273   else
7274   {
7275     DEBUG_SYNC(thd, "before_binlog_closed_due_to_error");
7276     if (need_lock_log)
7277       mysql_mutex_lock(&LOCK_log);
7278     else
7279       mysql_mutex_assert_owner(&LOCK_log);
7280     /*
7281       It can happen that other group leader encountered
7282       error and already closed the binary log. So print
7283       error only if it is in open state. But we should
7284       call close() always just in case if the previous
7285       close did not close index file.
7286     */
7287     if (is_open())
7288     {
7289       sql_print_error("%s Hence turning logging off for the whole duration "
7290                       "of the MySQL server process. To turn it on again: fix "
7291                       "the cause, shutdown the MySQL server and restart it.",
7292                       errmsg);
7293     }
7294     close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT, false/*need_lock_log=false*/,
7295           true/*need_lock_index=true*/);
7296     /*
7297       If there is a write error (flush/sync stage) and if
7298       binlog_error_action=IGNORE_ERROR, clear the error
7299       and allow the commit to happen in storage engine.
7300     */
7301     if (check_write_error(thd))
7302       thd->clear_error();
7303 
7304     if (need_lock_log)
7305       mysql_mutex_unlock(&LOCK_log);
7306     DEBUG_SYNC(thd, "after_binlog_closed_due_to_error");
7307   }
7308 }
/**
  Flush and commit the transaction.

  This will execute an ordered flush and commit of all outstanding
  transactions and is the main function for the binary log group
  commit logic. The function performs the ordered commit in three
  stages.

  The flush stage writes the session caches to the binary log under
  LOCK_log.

  The sync stage syncs the binary log file to disk under LOCK_sync
  (depending on the value of sync_binlog).

  The commit stage executes under LOCK_commit and commits all
  transactions in order. It is only executed if binlog_order_commits
  is set; otherwise each thread commits on its own in finish_commit.

  The procedure is:

  1. Queue ourselves for flushing.
  2. Grab the log lock, which might result in blocking if the mutex is
     already held by another thread.
  3. If we were not committed while waiting for the lock
     1. Fetch the queue
     2. For each thread in the queue:
        a. Attach to it
        b. Flush the caches, saving any error code
     3. Flush and sync (depending on the value of sync_binlog).
     4. Signal that the binary log was updated
  4. Release the log lock
  5. Grab the commit lock
     1. For each thread in the queue:
        a. If the transaction shall be committed:
           - Commit the transaction, saving the result of executing the commit.
  6. Release the commit lock
  7. Rotate the binary log if the flush stage requested it, and purge
     if the rotation asks for it.
  8. Return with the saved error code

  @todo The use of @c skip_commit is a hack that we use since the @c
  TC_LOG Interface does not contain functions to handle
  savepoints. Once the binary log is eliminated as a handlerton and
  the @c TC_LOG interface is extended with savepoint handling, this
  parameter can be removed.

  @param thd Session to commit transaction for
  @param all   This is @c true if this is a real transaction commit, and
               @c false otherwise.
  @param skip_commit
               This is @c true if the call to @c ha_commit_low should
               be skipped (it is handled by the caller somehow) and @c
               false otherwise (the normal case).

  @return Non-zero if the session's commit_error is
          THD::CE_COMMIT_ERROR, zero otherwise. Flush and sync errors
          are handled through binlog_error_action and are not returned.
 */
int MYSQL_BIN_LOG::ordered_commit(THD *thd, bool all, bool skip_commit)
{
  DBUG_ENTER("MYSQL_BIN_LOG::ordered_commit");
  int flush_error= 0, sync_error= 0;
  my_off_t total_bytes= 0;
  bool do_rotate= false;

  /*
    These values are used while flushing a transaction, so clear
    everything.

    Notes:

    - It would be good if we could keep transaction coordinator
      log-specific data out of the THD structure, but that is not the
      case right now.

    - Everything in the transaction structure is reset when calling
      ha_commit_low since that calls st_transaction::cleanup.
  */
  thd->transaction.flags.pending= true;
  thd->commit_error= THD::CE_NONE;
  thd->next_to_commit= NULL;
  thd->durability_property= HA_IGNORE_DURABILITY;
  thd->transaction.flags.real_commit= all;
  thd->transaction.flags.xid_written= false;
  thd->transaction.flags.commit_low= !skip_commit;
  thd->transaction.flags.run_hooks= !skip_commit;
#ifndef DBUG_OFF
  /*
     The group commit Leader may have to wait for follower whose transaction
     is not ready to be preempted. Initially the status is pessimistic.
     Preemption guarding logic is necessary only when DBUG_ON is set.
     It won't be required for the dbug-off case as long as the follower won't
     execute any thread-specific write access code in this method, which is
     the case as of current.
  */
  thd->transaction.flags.ready_preempt= 0;
#endif

  DBUG_PRINT("enter", ("flags.pending: %s, commit_error: %d, thread_id: %lu",
                       YESNO(thd->transaction.flags.pending),
                       thd->commit_error, thd->thread_id));

  /*
    Stage #1: flushing transactions to binary log

    While flushing, we allow new threads to enter and will process
    them in due time. Once the queue was empty, we cannot reap
    anything more since it is possible that a thread entered and
    appointed itself leader for the flush phase.
  */
  DEBUG_SYNC(thd, "waiting_to_enter_flush_stage");
  /* A follower bails out here; the leader returns holding LOCK_log. */
  if (change_stage(thd, Stage_manager::FLUSH_STAGE, thd, NULL, &LOCK_log))
  {
    DBUG_PRINT("return", ("Thread ID: %lu, commit_error: %d",
                          thd->thread_id, thd->commit_error));
    DBUG_RETURN(finish_commit(thd));
  }

  /*
    wait_queue is the queue produced by the flush stage, final_queue
    the queue that is signalled as done at the end.
    leave_mutex_before_commit_stage is the mutex still held when the
    commit stage is reached.
  */
  THD *wait_queue= NULL, *final_queue= NULL;
  mysql_mutex_t *leave_mutex_before_commit_stage= NULL;
  my_off_t flush_end_pos= 0;
  bool need_LOCK_log;
  if (unlikely(!is_open()))
  {
    final_queue= stage_manager.fetch_queue_for(Stage_manager::FLUSH_STAGE);
    leave_mutex_before_commit_stage= &LOCK_log;
    /*
      binary log is closed, flush stage and sync stage should be
      ignored. Binlog cache should be cleared, but instead of doing
      it here, do that work in 'finish_commit' function so that
      leader and followers thread caches will be cleared.
    */
    goto commit_stage;
  }
  DEBUG_SYNC(thd, "waiting_in_the_middle_of_flush_stage");
  flush_error= process_flush_stage_queue(&total_bytes, &do_rotate,
                                                 &wait_queue);

  /* Only touch the file if the queue was flushed cleanly and wrote bytes. */
  if (flush_error == 0 && total_bytes > 0)
    flush_error= flush_cache_to_file(&flush_end_pos);

  /*
    If the flush finished successfully, we can call the after_flush
    hook. Being invoked here, we have the guarantee that the hook is
    executed before the before/after_send_hooks on the dump thread
    preventing race conditions among these plug-ins.
  */
  if (flush_error == 0)
  {
    /* The hook receives the log file name without its directory part. */
    const char *file_name_ptr= log_file_name + dirname_length(log_file_name);
    DBUG_ASSERT(flush_end_pos != 0);
    if (RUN_HOOK(binlog_storage, after_flush,
                 (thd, file_name_ptr, flush_end_pos)))
    {
      sql_print_error("Failed to run 'after_flush' hooks");
      flush_error= ER_ERROR_ON_WRITE;
    }

    signal_update();
    DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
  }

  if (flush_error)
  {
    /*
      Handle flush error (if any) after leader finishes its flush stage.
      LOCK_log is still held here, hence need_lock_log is false.
    */
    handle_binlog_flush_or_sync_error(thd, false /* need_lock_log */);
  }

  /*
    Stage #2: Syncing binary log file to disk
  */
  need_LOCK_log= (get_sync_period() == 1);

  /*
    LOCK_log is not released when sync_binlog is 1. It guarantees that the
    events are not replicated by dump threads before they are synced to disk.
  */
  if (change_stage(thd, Stage_manager::SYNC_STAGE, wait_queue,
                   need_LOCK_log ? NULL : &LOCK_log, &LOCK_sync))
  {
    DBUG_PRINT("return", ("Thread ID: %lu, commit_error: %d",
                          thd->thread_id, thd->commit_error));
    DBUG_RETURN(finish_commit(thd));
  }
  final_queue= stage_manager.fetch_queue_for(Stage_manager::SYNC_STAGE);
  if (flush_error == 0 && total_bytes > 0)
  {
    DEBUG_SYNC(thd, "before_sync_binlog_file");
    std::pair<bool, bool> result= sync_binlog_file(false);
    /* Only the error part of the result is used here. */
    sync_error= result.first;
  }

  /* LOCK_log was kept across the sync only when sync_binlog is 1. */
  if (need_LOCK_log)
    mysql_mutex_unlock(&LOCK_log);
  leave_mutex_before_commit_stage= &LOCK_sync;
  /*
    Stage #3: Commit all transactions in order.

    This stage is skipped if we do not need to order the commits and
    each thread has to execute the handlerton commit instead.

    However, since we are keeping the lock from the previous stage, we
    need to unlock it if we skip the stage.
   */
commit_stage:
  /*
    We are delaying the handling of sync error until
    all locks are released but we should not enter into
    commit stage if binlog_error_action is ABORT_SERVER.
  */
  if (opt_binlog_order_commits &&
      (sync_error == 0 || binlog_error_action != ABORT_SERVER))
  {
    if (change_stage(thd, Stage_manager::COMMIT_STAGE,
                     final_queue, leave_mutex_before_commit_stage,
                     &LOCK_commit))
    {
      DBUG_PRINT("return", ("Thread ID: %lu, commit_error: %d",
                            thd->thread_id, thd->commit_error));
      DBUG_RETURN(finish_commit(thd));
    }
    THD *commit_queue= stage_manager.fetch_queue_for(Stage_manager::COMMIT_STAGE);
    DBUG_EXECUTE_IF("semi_sync_3-way_deadlock",
                    DEBUG_SYNC(thd, "before_process_commit_stage_queue"););
    process_commit_stage_queue(thd, commit_queue);
    mysql_mutex_unlock(&LOCK_commit);
    /*
      Process after_commit after LOCK_commit is released for avoiding
      3-way deadlock among user thread, rotate thread and dump thread.
    */
    process_after_commit_stage_queue(thd, commit_queue);
    final_queue= commit_queue;
  }
  else if (leave_mutex_before_commit_stage)
    mysql_mutex_unlock(leave_mutex_before_commit_stage);

  /*
    Handle sync error after we release all locks in order to avoid deadlocks
  */
  if (sync_error)
    handle_binlog_flush_or_sync_error(thd, true /* need_lock_log */);

  /* Commit done so signal all waiting threads */
  stage_manager.signal_done(final_queue);

  /*
    Finish the commit before executing a rotate, or run the risk of a
    deadlock. We don't need the return value here since it is in
    thd->commit_error, which is returned below.
  */
  (void) finish_commit(thd);

  /*
    If we need to rotate, we do it without commit error.
    Otherwise the thd->commit_error will be possibly reset.
   */
  if (DBUG_EVALUATE_IF("force_rotate", 1, 0) ||
      (do_rotate && thd->commit_error == THD::CE_NONE))
  {
    /*
      Do not force the rotate as several consecutive groups may
      request unnecessary rotations.

      NOTE: Run purge_logs wo/ holding LOCK_log because it does not
      need the mutex. Otherwise causes various deadlocks.
    */

    DEBUG_SYNC(thd, "ready_to_do_rotation");
    bool check_purge= false;
    mysql_mutex_lock(&LOCK_log);
    /*
      If rotate fails then, depending on the binlog_error_action
      variable, the appropriate action will be taken inside the rotate
      call.
    */
    int error= rotate(false, &check_purge);
    mysql_mutex_unlock(&LOCK_log);

    /* A failed rotation is reported to the caller as a commit error. */
    if (error)
      thd->commit_error= THD::CE_COMMIT_ERROR;
    else if (check_purge)
      purge();
  }
  /*
    flush or sync errors are handled above (using binlog_error_action).
    Hence treat only COMMIT_ERRORs as errors.
  */
  DBUG_RETURN(thd->commit_error == THD::CE_COMMIT_ERROR);
}
7590 
7591 
7592 /**
7593   MYSQLD server recovers from last crashed binlog.
7594 
7595   @param log           IO_CACHE of the crashed binlog.
7596   @param fdle          Format_description_log_event of the crashed binlog.
7597   @param valid_pos     The position of the last valid transaction or
7598                        event(non-transaction) of the crashed binlog.
7599 
7600   @retval
7601     0                  ok
7602   @retval
7603     1                  error
7604 */
recover(IO_CACHE * log,Format_description_log_event * fdle,my_off_t * valid_pos)7605 int MYSQL_BIN_LOG::recover(IO_CACHE *log, Format_description_log_event *fdle,
7606                             my_off_t *valid_pos)
7607 {
7608   Log_event  *ev;
7609   HASH xids;
7610   MEM_ROOT mem_root;
7611   /*
7612     The flag is used for handling the case that a transaction
7613     is partially written to the binlog.
7614   */
7615   bool in_transaction= FALSE;
7616 
7617   if (! fdle->is_valid() ||
7618       my_hash_init(&xids, &my_charset_bin, TC_LOG_PAGE_SIZE/3, 0,
7619                    sizeof(my_xid), 0, 0, MYF(0)))
7620     goto err1;
7621 
7622   init_alloc_root(&mem_root, TC_LOG_PAGE_SIZE, TC_LOG_PAGE_SIZE);
7623 
7624   while ((ev= Log_event::read_log_event(log, 0, fdle, TRUE))
7625          && ev->is_valid())
7626   {
7627     if (ev->get_type_code() == QUERY_EVENT &&
7628         !strcmp(((Query_log_event*)ev)->query, "BEGIN"))
7629       in_transaction= TRUE;
7630 
7631     if (ev->get_type_code() == QUERY_EVENT &&
7632         !strcmp(((Query_log_event*)ev)->query, "COMMIT"))
7633     {
7634       DBUG_ASSERT(in_transaction == TRUE);
7635       in_transaction= FALSE;
7636     }
7637     else if (ev->get_type_code() == XID_EVENT)
7638     {
7639       DBUG_ASSERT(in_transaction == TRUE);
7640       in_transaction= FALSE;
7641       Xid_log_event *xev=(Xid_log_event *)ev;
7642       uchar *x= (uchar *) memdup_root(&mem_root, (uchar*) &xev->xid,
7643                                       sizeof(xev->xid));
7644       if (!x || my_hash_insert(&xids, x))
7645         goto err2;
7646     }
7647 
7648     /*
7649       Recorded valid position for the crashed binlog file
7650       which did not contain incorrect events. The following
7651       positions increase the variable valid_pos:
7652 
7653       1 -
7654         ...
7655         <---> HERE IS VALID <--->
7656         GTID
7657         BEGIN
7658         ...
7659         COMMIT
7660         ...
7661 
7662       2 -
7663         ...
7664         <---> HERE IS VALID <--->
7665         GTID
7666         DDL/UTILITY
7667         ...
7668 
7669       In other words, the following positions do not increase
7670       the variable valid_pos:
7671 
7672       1 -
7673         GTID
7674         <---> HERE IS VALID <--->
7675         ...
7676 
7677       2 -
7678         GTID
7679         BEGIN
7680         <---> HERE IS VALID <--->
7681         ...
7682     */
7683     if (!log->error && !in_transaction &&
7684         !is_gtid_event(ev))
7685       *valid_pos= my_b_tell(log);
7686 
7687     delete ev;
7688   }
7689 
7690   if (ha_recover(&xids))
7691     goto err2;
7692 
7693   free_root(&mem_root, MYF(0));
7694   my_hash_free(&xids);
7695   return 0;
7696 
7697 err2:
7698   free_root(&mem_root, MYF(0));
7699   my_hash_free(&xids);
7700 err1:
7701   sql_print_error("Crash recovery failed. Either correct the problem "
7702                   "(if it's, for example, out of memory error) and restart, "
7703                   "or delete (or rename) binary log and start mysqld with "
7704                   "--tc-heuristic-recover={commit|rollback}");
7705   return 1;
7706 }
7707 
report_missing_purged_gtids(const Gtid_set * slave_executed_gtid_set,const char ** errmsg)7708 void MYSQL_BIN_LOG::report_missing_purged_gtids(const Gtid_set* slave_executed_gtid_set,
7709                                          const char** errmsg)
7710 {
7711   DBUG_ENTER("MYSQL_BIN_LOG::report_missing_purged_gtids");
7712   THD *thd= current_thd;
7713   Gtid_set gtid_missing(gtid_state->get_lost_gtids()->get_sid_map());
7714   gtid_missing.add_gtid_set(gtid_state->get_lost_gtids());
7715   gtid_missing.remove_gtid_set(slave_executed_gtid_set);
7716 
7717   String tmp_uuid;
7718   uchar name[]= "slave_uuid";
7719 
7720   /* Protects thd->user_vars. */
7721   mysql_mutex_lock(&thd->LOCK_thd_data);
7722   user_var_entry *entry=
7723     (user_var_entry*) my_hash_search(&thd->user_vars, name, sizeof(name)-1);
7724   if (entry && entry->length() > 0)
7725     tmp_uuid.copy(entry->ptr(), entry->length(), NULL);
7726   mysql_mutex_unlock(&thd->LOCK_thd_data);
7727 
7728 
7729   char* missing_gtids= NULL;
7730   char* slave_executed_gtids= NULL;
7731   gtid_missing.to_string(&missing_gtids, NULL);
7732   slave_executed_gtid_set->to_string(&slave_executed_gtids, NULL);
7733 
7734   /*
7735      Log the information about the missing purged GTIDs to the error log
7736      if the message is less than MAX_LOG_BUFFER_SIZE.
7737   */
7738   std::ostringstream log_info;
7739   log_info << "The missing transactions are '"<< missing_gtids <<"'";
7740   const char* log_msg= ER(ER_FOUND_MISSING_GTIDS);
7741 
7742   /* Don't consider the "%s" in the format string. Subtract 2 from the
7743      total length */
7744   int total_length= (strlen(log_msg) - 2 + log_info.str().length());
7745 
7746   DBUG_EXECUTE_IF("simulate_long_missing_gtids",
7747                   { total_length= MAX_LOG_BUFFER_SIZE + 1;});
7748 
7749   if (total_length > MAX_LOG_BUFFER_SIZE)
7750     log_info.str("To find the missing purged transactions, run \"SELECT"
7751                  " @@GLOBAL.GTID_PURGED\" on the master, then run \"SHOW"
7752                  " SLAVE STATUS\" on the slave for the Retrieved_Gtid_Set,"
7753                  " and then run \"SELECT GTID_SUBTRACT(<master_set>,"
7754                  " <slave_set>)\" on any server");
7755 
7756   sql_print_warning(ER_THD(thd, ER_FOUND_MISSING_GTIDS), tmp_uuid.ptr(),
7757                     log_info.str().c_str());
7758 
7759   /*
7760      Send the information about the slave executed GTIDs and missing
7761      purged GTIDs to slave if the message is less than MYSQL_ERRMSG_SIZE.
7762   */
7763   std::ostringstream gtid_info;
7764   gtid_info << "The GTID set sent by the slave is '" << slave_executed_gtids
7765             << "', and the missing transactions are '"<< missing_gtids <<"'";
7766   *errmsg= ER_THD(thd, ER_MASTER_HAS_PURGED_REQUIRED_GTIDS);
7767 
7768   /* Don't consider the "%s" in the format string. Subtract 2 from the
7769      total length */
7770   total_length= (strlen(*errmsg) - 2 + gtid_info.str().length());
7771 
7772   DBUG_EXECUTE_IF("simulate_long_missing_gtids",
7773                   { total_length= MYSQL_ERRMSG_SIZE + 1;});
7774 
7775   if (total_length > MYSQL_ERRMSG_SIZE)
7776     gtid_info.str("The GTID sets and the missing purged transactions are too"
7777                   " long to print in this message. For more information,"
7778                   " please see the master's error log or the manual for"
7779                   " GTID_SUBTRACT");
7780 
7781   /* Buffer for formatting the message about the missing GTIDs. */
7782   static char buff[MYSQL_ERRMSG_SIZE];
7783   my_snprintf(buff, MYSQL_ERRMSG_SIZE, *errmsg, gtid_info.str().c_str());
7784   *errmsg= const_cast<const char*>(buff);
7785 
7786   my_free(missing_gtids);
7787   my_free(slave_executed_gtids);
7788   DBUG_VOID_RETURN;
7789 }
7790 
report_missing_gtids(const Gtid_set * previous_gtid_set,const Gtid_set * slave_executed_gtid_set,const char ** errmsg)7791 void MYSQL_BIN_LOG::report_missing_gtids(const Gtid_set* previous_gtid_set,
7792                                          const Gtid_set* slave_executed_gtid_set,
7793                                          const char** errmsg)
7794 {
7795   DBUG_ENTER("MYSQL_BIN_LOG::report_missing_gtids");
7796   THD *thd=current_thd;
7797   char* missing_gtids= NULL;
7798   char* slave_executed_gtids= NULL;
7799   Gtid_set gtid_missing(slave_executed_gtid_set->get_sid_map());
7800   gtid_missing.add_gtid_set(slave_executed_gtid_set);
7801   gtid_missing.remove_gtid_set(previous_gtid_set);
7802   gtid_missing.to_string(&missing_gtids, NULL);
7803   slave_executed_gtid_set->to_string(&slave_executed_gtids, NULL);
7804 
7805   String tmp_uuid;
7806   uchar name[]= "slave_uuid";
7807 
7808   /* Protects thd->user_vars. */
7809   mysql_mutex_lock(&thd->LOCK_thd_data);
7810 
7811   user_var_entry *entry=
7812     (user_var_entry*) my_hash_search(&thd->user_vars, name, sizeof(name)-1);
7813   if (entry && entry->length() > 0)
7814     tmp_uuid.copy(entry->ptr(), entry->length(), NULL);
7815   mysql_mutex_unlock(&thd->LOCK_thd_data);
7816 
7817   /*
7818      Log the information about the missing purged GTIDs to the error log
7819      if the message is less than MAX_LOG_BUFFER_SIZE.
7820   */
7821   std::ostringstream log_info;
7822   log_info << "If the binary log files have been deleted from disk,"
7823       " check the consistency of 'GTID_PURGED' variable."
7824       " The missing transactions are '"<< missing_gtids <<"'";
7825   const char* log_msg= ER(ER_FOUND_MISSING_GTIDS);
7826 
7827   /* Don't consider the "%s" in the format string. Subtract 2 from the
7828      total length */
7829   if ((strlen(log_msg) - 2 + log_info.str().length()) > MAX_LOG_BUFFER_SIZE)
7830     log_info.str("To find the missing purged transactions, run \"SELECT"
7831                  " @@GLOBAL.GTID_PURGED\" on the master, then run \"SHOW"
7832                  " SLAVE STATUS\" on the slave for the Retrieved_Gtid_Set,"
7833                  " and then run \"SELECT GTID_SUBTRACT(<master_set>,"
7834                  " <slave_set>)\" on any server");
7835 
7836   sql_print_warning(ER_THD(thd, ER_FOUND_MISSING_GTIDS), tmp_uuid.ptr(),
7837                     log_info.str().c_str());
7838 
7839   /*
7840      Send the information about the slave executed GTIDs and missing
7841      purged GTIDs to slave if the message is less than MYSQL_ERRMSG_SIZE.
7842   */
7843   std::ostringstream gtid_info;
7844   gtid_info << "The GTID set sent by the slave is '" << slave_executed_gtids
7845             << "', and the missing transactions are '"<< missing_gtids <<"'";
7846   *errmsg= ER_THD(thd, ER_MASTER_HAS_PURGED_REQUIRED_GTIDS);
7847 
7848   /* Don't consider the "%s" in the format string. Subtract 2 from the
7849      total length */
7850   if ((strlen(*errmsg) - 2 + gtid_info.str().length()) > MYSQL_ERRMSG_SIZE)
7851     gtid_info.str("The GTID sets and the missing purged transactions are too"
7852                   " long to print in this message. For more information,"
7853                   " please see the master's error log or the manual for"
7854                   " GTID_SUBTRACT");
7855   /* Buffer for formatting the message about the missing GTIDs. */
7856   static char buff[MYSQL_ERRMSG_SIZE];
7857   my_snprintf(buff, MYSQL_ERRMSG_SIZE, *errmsg, gtid_info.str().c_str());
7858   *errmsg= const_cast<const char*>(buff);
7859 
7860   my_free(missing_gtids);
7861   my_free(slave_executed_gtids);
7862 
7863   DBUG_VOID_RETURN;
7864 }
get_group_cache(bool is_transactional)7865 Group_cache *THD::get_group_cache(bool is_transactional)
7866 {
7867   DBUG_ENTER("THD::get_group_cache(bool)");
7868 
7869   // If opt_bin_log==0, it is not safe to call thd_get_cache_mngr
7870   // because binlog_hton has not been completely set up.
7871   DBUG_ASSERT(opt_bin_log);
7872   binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(this);
7873 
7874   // cache_mngr is NULL until we call thd->binlog_setup_trx_data, so
7875   // we assert that this has been done.
7876   DBUG_ASSERT(cache_mngr != NULL);
7877 
7878   binlog_cache_data *cache_data=
7879     cache_mngr->get_binlog_cache_data(is_transactional);
7880   DBUG_ASSERT(cache_data != NULL);
7881 
7882   DBUG_RETURN(&cache_data->group_cache);
7883 }
7884 
7885 /*
7886   These functions are placed in this file since they need access to
7887   binlog_hton, which has internal linkage.
7888 */
7889 
binlog_setup_trx_data()7890 int THD::binlog_setup_trx_data()
7891 {
7892   DBUG_ENTER("THD::binlog_setup_trx_data");
7893   binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(this);
7894 
7895   if (cache_mngr)
7896     DBUG_RETURN(0);                             // Already set up
7897 
7898   cache_mngr= (binlog_cache_mngr*) my_malloc(sizeof(binlog_cache_mngr), MYF(MY_ZEROFILL));
7899   if (!cache_mngr ||
7900       open_cached_file(&cache_mngr->stmt_cache.cache_log, mysql_tmpdir,
7901                        LOG_PREFIX, binlog_stmt_cache_size, MYF(MY_WME)) ||
7902       open_cached_file(&cache_mngr->trx_cache.cache_log, mysql_tmpdir,
7903                        LOG_PREFIX, binlog_cache_size, MYF(MY_WME)))
7904   {
7905     my_free(cache_mngr);
7906     DBUG_RETURN(1);                      // Didn't manage to set it up
7907   }
7908   DBUG_PRINT("debug", ("Set ha_data slot %d to 0x%llx", binlog_hton->slot, (ulonglong) cache_mngr));
7909   thd_set_ha_data(this, binlog_hton, cache_mngr);
7910 
7911   cache_mngr= new (thd_get_cache_mngr(this))
7912               binlog_cache_mngr(max_binlog_stmt_cache_size,
7913                                 &binlog_stmt_cache_use,
7914                                 &binlog_stmt_cache_disk_use,
7915                                 max_binlog_cache_size,
7916                                 &binlog_cache_use,
7917                                 &binlog_cache_disk_use);
7918   DBUG_RETURN(0);
7919 }
7920 
7921 /**
7922 
7923 */
register_binlog_handler(THD * thd,bool trx)7924 void register_binlog_handler(THD *thd, bool trx)
7925 {
7926   DBUG_ENTER("register_binlog_handler");
7927   /*
7928     If this is the first call to this function while processing a statement,
7929     the transactional cache does not have a savepoint defined. So, in what
7930     follows:
7931       . an implicit savepoint is defined;
7932       . callbacks are registered;
7933       . binary log is set as read/write.
7934 
7935     The savepoint allows for truncating the trx-cache transactional changes
7936     fail. Callbacks are necessary to flush caches upon committing or rolling
7937     back a statement or a transaction. However, notifications do not happen
7938     if the binary log is set as read/write.
7939   */
7940   binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(thd);
7941   if (cache_mngr->trx_cache.get_prev_position() == MY_OFF_T_UNDEF)
7942   {
7943     /*
7944       Set an implicit savepoint in order to be able to truncate a trx-cache.
7945     */
7946     my_off_t pos= 0;
7947     binlog_trans_log_savepos(thd, &pos);
7948     cache_mngr->trx_cache.set_prev_position(pos);
7949 
7950     /*
7951       Set callbacks in order to be able to call commmit or rollback.
7952     */
7953     if (trx)
7954       trans_register_ha(thd, TRUE, binlog_hton);
7955     trans_register_ha(thd, FALSE, binlog_hton);
7956 
7957     /*
7958       Set the binary log as read/write otherwise callbacks are not called.
7959     */
7960     thd->ha_data[binlog_hton->slot].ha_info[0].set_trx_read_write();
7961   }
7962   DBUG_VOID_RETURN;
7963 }
7964 
7965 /**
7966   Function to start a statement and optionally a transaction for the
7967   binary log.
7968 
7969   This function does three things:
7970     - Starts a transaction if not in autocommit mode or if a BEGIN
7971       statement has been seen.
7972 
7973     - Start a statement transaction to allow us to truncate the cache.
7974 
    - Save the current binlog position so that we can roll back the
7976       statement by truncating the cache.
7977 
7978       We only update the saved position if the old one was undefined,
7979       the reason is that there are some cases (e.g., for CREATE-SELECT)
7980       where the position is saved twice (e.g., both in
7981       select_create::prepare() and THD::binlog_write_table_map()) , but
7982       we should use the first. This means that calls to this function
7983       can be used to start the statement before the first table map
7984       event, to include some extra events.
7985 
7986   Note however that IMMEDIATE_LOGGING implies that the statement is
7987   written without BEGIN/COMMIT.
7988 
7989   @param thd         Thread variable
7990   @param start_event The first event requested to be written into the
7991                      binary log
7992  */
binlog_start_trans_and_stmt(THD * thd,Log_event * start_event)7993 static int binlog_start_trans_and_stmt(THD *thd, Log_event *start_event)
7994 {
7995   DBUG_ENTER("binlog_start_trans_and_stmt");
7996 
7997   /*
7998     Initialize the cache manager if this was not done yet.
7999   */
8000   if (thd->binlog_setup_trx_data())
8001     DBUG_RETURN(1);
8002 
8003   /*
8004     Retrieve the appropriated cache.
8005   */
8006   bool is_transactional= start_event->is_using_trans_cache();
8007   binlog_cache_mngr *cache_mngr= thd_get_cache_mngr(thd);
8008   binlog_cache_data *cache_data= cache_mngr->get_binlog_cache_data(is_transactional);
8009 
8010   /*
8011     If the event is requesting immediatly logging, there is no need to go
8012     further down and set savepoint and register callbacks.
8013   */
8014   if (start_event->is_using_immediate_logging())
8015     DBUG_RETURN(0);
8016 
8017   register_binlog_handler(thd, thd->in_multi_stmt_transaction_mode());
8018 
8019   /*
8020     If the cache is empty log "BEGIN" at the beginning of every transaction.
8021     Here, a transaction is either a BEGIN..COMMIT/ROLLBACK block or a single
8022     statement in autocommit mode.
8023   */
8024   if (cache_data->is_binlog_empty())
8025   {
8026     Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"),
8027                           is_transactional, FALSE, TRUE, 0, TRUE);
8028     if (cache_data->write_event(thd, &qinfo))
8029       DBUG_RETURN(1);
8030   }
8031 
8032   DBUG_RETURN(0);
8033 }
8034 
8035 /**
8036   This function writes a table map to the binary log.
8037   Note that in order to keep the signature uniform with related methods,
8038   we use a redundant parameter to indicate whether a transactional table
8039   was changed or not.
8040   Sometimes it will write a Rows_query_log_event into binary log before
8041   the table map too.
8042 
8043   @param table             a pointer to the table.
8044   @param is_transactional  @c true indicates a transactional table,
8045                            otherwise @c false a non-transactional.
8046   @param binlog_rows_query @c true indicates a Rows_query log event
8047                            will be binlogged before table map,
8048                            otherwise @c false indicates it will not
8049                            be binlogged.
8050   @return
8051     nonzero if an error pops up when writing the table map event
8052     or the Rows_query log event.
8053 */
binlog_write_table_map(TABLE * table,bool is_transactional,bool binlog_rows_query)8054 int THD::binlog_write_table_map(TABLE *table, bool is_transactional,
8055                                 bool binlog_rows_query)
8056 {
8057   int error;
8058   DBUG_ENTER("THD::binlog_write_table_map");
8059   DBUG_PRINT("enter", ("table: 0x%lx  (%s: #%llu)",
8060                        (long) table, table->s->table_name.str,
8061                        table->s->table_map_id.id()));
8062 
8063   /* Pre-conditions */
8064   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
8065   DBUG_ASSERT(table->s->table_map_id.is_valid());
8066 
8067   Table_map_log_event
8068     the_event(this, table, table->s->table_map_id, is_transactional);
8069 
8070   binlog_start_trans_and_stmt(this, &the_event);
8071 
8072   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(this);
8073 
8074   binlog_cache_data *cache_data=
8075     cache_mngr->get_binlog_cache_data(is_transactional);
8076 
8077   if (binlog_rows_query && this->query())
8078   {
8079     /* Write the Rows_query_log_event into binlog before the table map */
8080     Rows_query_log_event
8081       rows_query_ev(this, this->query(), this->query_length());
8082     if ((error= cache_data->write_event(this, &rows_query_ev)))
8083       DBUG_RETURN(error);
8084   }
8085 
8086   if ((error= cache_data->write_event(this, &the_event)))
8087     DBUG_RETURN(error);
8088 
8089   binlog_table_maps++;
8090   DBUG_RETURN(0);
8091 }
8092 
8093 /**
8094   This function retrieves a pending row event from a cache which is
8095   specified through the parameter @c is_transactional. Respectively, when it
8096   is @c true, the pending event is returned from the transactional cache.
8097   Otherwise from the non-transactional cache.
8098 
8099   @param is_transactional  @c true indicates a transactional cache,
8100                            otherwise @c false a non-transactional.
8101   @return
8102     The row event if any.
8103 */
8104 Rows_log_event*
binlog_get_pending_rows_event(bool is_transactional) const8105 THD::binlog_get_pending_rows_event(bool is_transactional) const
8106 {
8107   Rows_log_event* rows= NULL;
8108   binlog_cache_mngr *const cache_mngr= thd_get_cache_mngr(this);
8109 
8110   /*
8111     This is less than ideal, but here's the story: If there is no cache_mngr,
8112     prepare_pending_rows_event() has never been called (since the cache_mngr
8113     is set up there). In that case, we just return NULL.
8114    */
8115   if (cache_mngr)
8116   {
8117     binlog_cache_data *cache_data=
8118       cache_mngr->get_binlog_cache_data(is_transactional);
8119 
8120     rows= cache_data->pending();
8121   }
8122   return (rows);
8123 }
8124 
/**
   @param db_param  db name c-string to be inserted into the sorted
                    THD::binlog_accessed_db_names list. The list is kept
                    in descending strcmp() order and a name that is
                    already present is not added again.

                    Note, that space for both the data and the node
                    struct is allocated in THD::main_mem_root when
                    in_sub_stmt is set and in THD::mem_root otherwise.
                    The list lasts for the top-level query time and is reset
                    in @c THD::cleanup_after_query().
*/
8134 void
add_to_binlog_accessed_dbs(const char * db_param)8135 THD::add_to_binlog_accessed_dbs(const char *db_param)
8136 {
8137   char *after_db;
8138   /*
8139     binlog_accessed_db_names list is to maintain the database
8140     names which are referenced in a given command.
8141     Prior to bug 17806014 fix, 'main_mem_root' memory root used
8142     to store this list. The 'main_mem_root' scope is till the end
8143     of the query. Hence it caused increasing memory consumption
8144     problem in big procedures like the ones mentioned below.
8145     Eg: CALL p1() where p1 is having 1,00,000 create and drop tables.
8146     'main_mem_root' is freed only at the end of the command CALL p1()'s
8147     execution. But binlog_accessed_db_names list scope is only till the
8148     individual statements specified the procedure(create/drop statements).
8149     Hence the memory allocated in 'main_mem_root' was left uncleared
8150     until the p1's completion, even though it is not required after
8151     completion of individual statements.
8152 
8153     Instead of using 'main_mem_root' whose scope is complete query execution,
8154     now the memroot is changed to use 'thd->mem_root' whose scope is until the
8155     individual statement in CALL p1(). 'thd->mem_root' is set to 'execute_mem_root'
8156     in the context of procedure and it's scope is till the individual statement
8157     in CALL p1() and thd->memroot is equal to 'main_mem_root' in the context
8158     of a normal 'top level query'.
8159 
8160     Eg: a) create table t1(i int); => If this function is called while
8161            processing this statement, thd->memroot is equal to &main_mem_root
8162            which will be freed immediately after executing this statement.
8163         b) CALL p1() -> p1 contains create table t1(i int); => If this function
8164            is called while processing create table statement which is inside
8165            a stored procedure, then thd->memroot is equal to 'execute_mem_root'
8166            which will be freed immediately after executing this statement.
8167     In both a and b case, thd->memroot will be freed immediately and will not
8168     increase memory consumption.
8169 
8170     A special case(stored functions/triggers):
8171     Consider the following example:
8172     create function f1(i int) returns int
8173     begin
8174       insert into db1.t1 values (1);
8175       insert into db2.t1 values (2);
8176     end;
8177     When we are processing SELECT f1(), the list should contain db1, db2 names.
8178     Since thd->mem_root contains 'execute_mem_root' in the context of
8179     stored function, the mem root will be freed after adding db1 in
8180     the list and when we are processing the second statement and when we try
8181     to add 'db2' in the db1's list, it will lead to crash as db1's memory
8182     is already freed. To handle this special case, if in_sub_stmt is set
8183     (which is true incase of stored functions/triggers), we use &main_mem_root,
8184     if not set we will use thd->memroot which changes it's value to
8185     'execute_mem_root' or '&main_mem_root' depends on the context.
8186    */
8187   MEM_ROOT *db_mem_root= in_sub_stmt ? &main_mem_root : mem_root;
8188 
8189   if (!binlog_accessed_db_names)
8190     binlog_accessed_db_names= new (db_mem_root) List<char>;
8191 
8192   if (binlog_accessed_db_names->elements >  MAX_DBS_IN_EVENT_MTS)
8193   {
8194     push_warning_printf(this, Sql_condition::WARN_LEVEL_WARN,
8195                         ER_MTS_UPDATED_DBS_GREATER_MAX,
8196                         ER(ER_MTS_UPDATED_DBS_GREATER_MAX),
8197                         MAX_DBS_IN_EVENT_MTS);
8198     return;
8199   }
8200 
8201   after_db= strdup_root(db_mem_root, db_param);
8202 
8203   /*
8204      sorted insertion is implemented with first rearranging data
8205      (pointer to char*) of the links and final appending of the least
8206      ordered data to create a new link in the list.
8207   */
8208   if (binlog_accessed_db_names->elements != 0)
8209   {
8210     List_iterator<char> it(*get_binlog_accessed_db_names());
8211 
8212     while (it++)
8213     {
8214       char *swap= NULL;
8215       char **ref_cur_db= it.ref();
8216       int cmp= strcmp(after_db, *ref_cur_db);
8217 
8218       DBUG_ASSERT(!swap || cmp < 0);
8219 
8220       if (cmp == 0)
8221       {
8222         after_db= NULL;  /* dup to ignore */
8223         break;
8224       }
8225       else if (swap || cmp > 0)
8226       {
8227         swap= *ref_cur_db;
8228         *ref_cur_db= after_db;
8229         after_db= swap;
8230       }
8231     }
8232   }
8233   if (after_db)
8234     binlog_accessed_db_names->push_back(after_db, db_mem_root);
8235 }
8236 
/*
  Tells if at least one table has an auto_increment column and we want to
  lock that table with a write lock.

  SYNOPSIS
    has_write_table_with_auto_increment
      tables        Table list

  NOTES:
    Call this function only when you have established the list of all tables
    which you'll want to update (including stored functions, triggers, views
    inside your statement).
*/
8250 
8251 static bool
has_write_table_with_auto_increment(TABLE_LIST * tables)8252 has_write_table_with_auto_increment(TABLE_LIST *tables)
8253 {
8254   for (TABLE_LIST *table= tables; table; table= table->next_global)
8255   {
8256     /* we must do preliminary checks as table->table may be NULL */
8257     if (!table->placeholder() &&
8258         table->table->found_next_number_field &&
8259         (table->lock_type >= TL_WRITE_ALLOW_WRITE))
8260       return 1;
8261   }
8262 
8263   return 0;
8264 }
8265 
/*
   Checks if the table list contains both tables to select from and
   write-locked tables with an auto-increment column.

  SYNOPSIS
   has_write_table_with_auto_increment_and_select
      tables        Table list

  RETURN VALUES

   -true if the table list has at least one write-locked table with an
         auto-increment column and at least one table to select from.
   -false otherwise
*/
8282 
8283 static bool
has_write_table_with_auto_increment_and_select(TABLE_LIST * tables)8284 has_write_table_with_auto_increment_and_select(TABLE_LIST *tables)
8285 {
8286   bool has_select= false;
8287   bool has_auto_increment_tables = has_write_table_with_auto_increment(tables);
8288   for(TABLE_LIST *table= tables; table; table= table->next_global)
8289   {
8290      if (!table->placeholder() &&
8291         (table->lock_type <= TL_READ_NO_INSERT))
8292       {
8293         has_select= true;
8294         break;
8295       }
8296   }
8297   return(has_select && has_auto_increment_tables);
8298 }
8299 
/*
  Tells if there is a table whose auto_increment column is a part
  of a compound primary key while not being the first column in
  the table definition.

  @param tables Table list

  @return true if such a table exists, false if it does not.
*/
8309 
8310 static bool
has_write_table_auto_increment_not_first_in_pk(TABLE_LIST * tables)8311 has_write_table_auto_increment_not_first_in_pk(TABLE_LIST *tables)
8312 {
8313   for (TABLE_LIST *table= tables; table; table= table->next_global)
8314   {
8315     /* we must do preliminary checks as table->table may be NULL */
8316     if (!table->placeholder() &&
8317         table->table->found_next_number_field &&
8318         (table->lock_type >= TL_WRITE_ALLOW_WRITE)
8319         && table->table->s->next_number_keypart != 0)
8320       return 1;
8321   }
8322 
8323   return 0;
8324 }
8325 
8326 #ifndef DBUG_OFF
get_locked_tables_mode_name(enum_locked_tables_mode locked_tables_mode)8327 const char * get_locked_tables_mode_name(enum_locked_tables_mode locked_tables_mode)
8328 {
8329    switch (locked_tables_mode)
8330    {
8331    case LTM_NONE:
8332      return "LTM_NONE";
8333    case LTM_LOCK_TABLES:
8334      return "LTM_LOCK_TABLES";
8335    case LTM_PRELOCKED:
8336      return "LTM_PRELOCKED";
8337    case LTM_PRELOCKED_UNDER_LOCK_TABLES:
8338      return "LTM_PRELOCKED_UNDER_LOCK_TABLES";
8339    default:
8340      return "Unknown table lock mode";
8341    }
8342 }
8343 #endif
8344 
8345 
8346 /**
8347   Decide on logging format to use for the statement and issue errors
8348   or warnings as needed.  The decision depends on the following
8349   parameters:
8350 
8351   - The logging mode, i.e., the value of binlog_format.  Can be
8352     statement, mixed, or row.
8353 
8354   - The type of statement.  There are three types of statements:
8355     "normal" safe statements; unsafe statements; and row injections.
8356     An unsafe statement is one that, if logged in statement format,
8357     might produce different results when replayed on the slave (e.g.,
8358     INSERT DELAYED).  A row injection is either a BINLOG statement, or
8359     a row event executed by the slave's SQL thread.
8360 
8361   - The capabilities of tables modified by the statement.  The
8362     *capabilities vector* for a table is a set of flags associated
8363     with the table.  Currently, it only includes two flags: *row
8364     capability flag* and *statement capability flag*.
8365 
8366     The row capability flag is set if and only if the engine can
8367     handle row-based logging. The statement capability flag is set if
8368     and only if the table can handle statement-based logging.
8369 
8370   Decision table for logging format
8371   ---------------------------------
8372 
8373   The following table summarizes how the format and generated
8374   warning/error depends on the tables' capabilities, the statement
8375   type, and the current binlog_format.
8376 
8377      Row capable        N NNNNNNNNN YYYYYYYYY YYYYYYYYY
8378      Statement capable  N YYYYYYYYY NNNNNNNNN YYYYYYYYY
8379 
8380      Statement type     * SSSUUUIII SSSUUUIII SSSUUUIII
8381 
8382      binlog_format      * SMRSMRSMR SMRSMRSMR SMRSMRSMR
8383 
8384      Logged format      - SS-S----- -RR-RR-RR SRRSRR-RR
8385      Warning/Error      1 --2732444 5--5--6-- ---7--6--
8386 
8387   Legend
8388   ------
8389 
8390   Row capable:    N - Some table not row-capable, Y - All tables row-capable
8391   Stmt capable:   N - Some table not stmt-capable, Y - All tables stmt-capable
8392   Statement type: (S)afe, (U)nsafe, or Row (I)njection
8393   binlog_format:  (S)TATEMENT, (M)IXED, or (R)OW
8394   Logged format:  (S)tatement or (R)ow
8395   Warning/Error:  Warnings and error messages are as follows:
8396 
8397   1. Error: Cannot execute statement: binlogging impossible since both
8398      row-incapable engines and statement-incapable engines are
8399      involved.
8400 
8401   2. Error: Cannot execute statement: binlogging impossible since
8402      BINLOG_FORMAT = ROW and at least one table uses a storage engine
8403      limited to statement-logging.
8404 
8405   3. Error: Cannot execute statement: binlogging of unsafe statement
8406      is impossible when storage engine is limited to statement-logging
8407      and BINLOG_FORMAT = MIXED.
8408 
8409   4. Error: Cannot execute row injection: binlogging impossible since
8410      at least one table uses a storage engine limited to
8411      statement-logging.
8412 
8413   5. Error: Cannot execute statement: binlogging impossible since
8414      BINLOG_FORMAT = STATEMENT and at least one table uses a storage
8415      engine limited to row-logging.
8416 
8417   6. Error: Cannot execute row injection: binlogging impossible since
8418      BINLOG_FORMAT = STATEMENT.
8419 
8420   7. Warning: Unsafe statement binlogged in statement format since
8421      BINLOG_FORMAT = STATEMENT.
8422 
8423   In addition, we can produce the following error (not depending on
8424   the variables of the decision diagram):
8425 
8426   8. Error: Cannot execute statement: binlogging impossible since more
8427      than one engine is involved and at least one engine is
8428      self-logging.
8429 
8430   For each error case above, the statement is prevented from being
8431   logged, we report an error, and roll back the statement.  For
8432   warnings, we set the thd->binlog_flags variable: the warning will be
8433   printed only if the statement is successfully logged.
8434 
8435   @see THD::binlog_query
8436 
8437   @param[in] thd    Client thread
8438   @param[in] tables Tables involved in the query
8439 
8440   @retval 0 No error; statement can be logged.
8441   @retval -1 One of the error conditions above applies (1, 2, 4, 5, or 6).
8442 */
8443 
decide_logging_format(TABLE_LIST * tables)8444 int THD::decide_logging_format(TABLE_LIST *tables)
8445 {
8446   DBUG_ENTER("THD::decide_logging_format");
8447   DBUG_PRINT("info", ("query: %s", query()));
8448   DBUG_PRINT("info", ("variables.binlog_format: %lu",
8449                       variables.binlog_format));
8450   DBUG_PRINT("info", ("lex->get_stmt_unsafe_flags(): 0x%x",
8451                       lex->get_stmt_unsafe_flags()));
8452 
8453   reset_binlog_local_stmt_filter();
8454 
8455   /*
8456     We should not decide logging format if the binlog is closed or
8457     binlogging is off, or if the statement is filtered out from the
8458     binlog by filtering rules.
8459   */
8460   if (mysql_bin_log.is_open() && (variables.option_bits & OPTION_BIN_LOG) &&
8461       !(variables.binlog_format == BINLOG_FORMAT_STMT &&
8462         !binlog_filter->db_ok(db)))
8463   {
8464     /*
8465       Compute one bit field with the union of all the engine
8466       capabilities, and one with the intersection of all the engine
8467       capabilities.
8468     */
8469     handler::Table_flags flags_write_some_set= 0;
8470     handler::Table_flags flags_access_some_set= 0;
8471     handler::Table_flags flags_write_all_set=
8472       HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
8473 
8474     /*
8475        If different types of engines are about to be updated.
8476        For example: Innodb and Falcon; Innodb and MyIsam.
8477     */
8478     my_bool multi_write_engine= FALSE;
8479     /*
8480        If different types of engines are about to be accessed
8481        and any of them is about to be updated. For example:
8482        Innodb and Falcon; Innodb and MyIsam.
8483     */
8484     my_bool multi_access_engine= FALSE;
8485     /*
8486        Identifies if a table is changed.
8487     */
8488     my_bool is_write= FALSE;
8489     /*
8490        A pointer to a previous table that was changed.
8491     */
8492     TABLE* prev_write_table= NULL;
8493     /*
8494        A pointer to a previous table that was accessed.
8495     */
8496     TABLE* prev_access_table= NULL;
8497     /*
8498       True if at least one table is transactional.
8499     */
8500     bool write_to_some_transactional_table= false;
8501     /*
8502       True if at least one table is non-transactional.
8503     */
8504     bool write_to_some_non_transactional_table= false;
8505     /*
8506        True if all non-transactional tables that has been updated
8507        are temporary.
8508     */
8509     bool write_all_non_transactional_are_tmp_tables= true;
8510     /**
8511       The number of tables used in the current statement,
8512       that should be replicated.
8513     */
8514     uint replicated_tables_count= 0;
8515     /**
8516       The number of tables written to in the current statement,
8517       that should not be replicated.
8518       A table should not be replicated when it is considered
8519       'local' to a MySQL instance.
8520       Currently, these tables are:
8521       - mysql.slow_log
8522       - mysql.general_log
8523       - mysql.slave_relay_log_info
8524       - mysql.slave_master_info
8525       - mysql.slave_worker_info
8526       - performance_schema.*
8527       - TODO: information_schema.*
8528       In practice, from this list, only performance_schema.* tables
8529       are written to by user queries.
8530     */
8531     uint non_replicated_tables_count= 0;
8532 #ifndef DBUG_OFF
8533     {
8534       DBUG_PRINT("debug", ("prelocked_mode: %s",
8535                            get_locked_tables_mode_name(locked_tables_mode)));
8536     }
8537 #endif
8538 
8539     if (variables.binlog_format != BINLOG_FORMAT_ROW && tables)
8540     {
8541       /*
8542         DML statements that modify a table with an auto_increment column based on
8543         rows selected from a table are unsafe as the order in which the rows are
8544         fetched fron the select tables cannot be determined and may differ on
8545         master and slave.
8546        */
8547       if (has_write_table_with_auto_increment_and_select(tables))
8548         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_WRITE_AUTOINC_SELECT);
8549 
8550       if (has_write_table_auto_increment_not_first_in_pk(tables))
8551         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_AUTOINC_NOT_FIRST);
8552 
8553       /*
8554         A query that modifies autoinc column in sub-statement can make the
8555         master and slave inconsistent.
8556         We can solve these problems in mixed mode by switching to binlogging
8557         if at least one updated table is used by sub-statement
8558        */
8559       if (lex->requires_prelocking() &&
8560           has_write_table_with_auto_increment(lex->first_not_own_table()))
8561         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_AUTOINC_COLUMNS);
8562     }
8563 
8564     /*
8565       Get the capabilities vector for all involved storage engines and
8566       mask out the flags for the binary log.
8567     */
8568     for (TABLE_LIST *table= tables; table; table= table->next_global)
8569     {
8570       if (table->placeholder())
8571         continue;
8572 
8573       handler::Table_flags const flags= table->table->file->ha_table_flags();
8574 
8575       DBUG_PRINT("info", ("table: %s; ha_table_flags: 0x%llx",
8576                           table->table_name, flags));
8577 
8578       if (table->table->no_replicate)
8579       {
8580         /*
8581           The statement uses a table that is not replicated.
8582           The following properties about the table:
8583           - persistent / transient
8584           - transactional / non transactional
8585           - temporary / permanent
8586           - read or write
8587           - multiple engines involved because of this table
8588           are not relevant, as this table is completely ignored.
8589           Because the statement uses a non replicated table,
8590           using STATEMENT format in the binlog is impossible.
8591           Either this statement will be discarded entirely,
8592           or it will be logged (possibly partially) in ROW format.
8593         */
8594         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_SYSTEM_TABLE);
8595 
8596         if (table->lock_type >= TL_WRITE_ALLOW_WRITE)
8597         {
8598           non_replicated_tables_count++;
8599           continue;
8600         }
8601       }
8602 
8603       replicated_tables_count++;
8604 
8605       my_bool trans= table->table->file->has_transactions();
8606 
8607       if (table->lock_type >= TL_WRITE_ALLOW_WRITE)
8608       {
8609         write_to_some_transactional_table=
8610           write_to_some_transactional_table || trans;
8611 
8612         write_to_some_non_transactional_table=
8613           write_to_some_non_transactional_table || !trans;
8614 
8615         if (prev_write_table && prev_write_table->file->ht !=
8616             table->table->file->ht)
8617           multi_write_engine= TRUE;
8618 
8619         if (table->table->s->tmp_table)
8620           lex->set_stmt_accessed_table(trans ? LEX::STMT_WRITES_TEMP_TRANS_TABLE :
8621                                                LEX::STMT_WRITES_TEMP_NON_TRANS_TABLE);
8622         else
8623           lex->set_stmt_accessed_table(trans ? LEX::STMT_WRITES_TRANS_TABLE :
8624                                                LEX::STMT_WRITES_NON_TRANS_TABLE);
8625 
8626         /*
8627          Non-transactional updates are allowed when row binlog format is
8628          used and all non-transactional tables are temporary.
8629          Binlog format is checked on THD::is_dml_gtid_compatible() method.
8630         */
8631         if (!trans)
8632           write_all_non_transactional_are_tmp_tables=
8633             write_all_non_transactional_are_tmp_tables &&
8634             table->table->s->tmp_table;
8635 
8636         flags_write_all_set &= flags;
8637         flags_write_some_set |= flags;
8638         is_write= TRUE;
8639 
8640         prev_write_table= table->table;
8641 
8642         /*
8643           INSERT...ON DUPLICATE KEY UPDATE on a table with more than one unique keys
8644           can be unsafe. Check for it if the flag is already not marked for the
8645           given statement.
8646         */
8647         if (!lex->is_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_INSERT_TWO_KEYS) &&
8648             lex->sql_command == SQLCOM_INSERT &&
8649             /* Duplicate key update is not supported by INSERT DELAYED */
8650             get_command() != COM_DELAYED_INSERT && lex->duplicates == DUP_UPDATE)
8651         {
8652           uint keys= table->table->s->keys, i= 0, unique_keys= 0;
8653           for (KEY* keyinfo= table->table->s->key_info;
8654                i < keys && unique_keys <= 1; i++, keyinfo++)
8655           {
8656             if (keyinfo->flags & HA_NOSAME)
8657               unique_keys++;
8658           }
8659           if (unique_keys > 1 )
8660             lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_INSERT_TWO_KEYS);
8661         }
8662       }
8663       flags_access_some_set |= flags;
8664 
8665       if (lex->sql_command != SQLCOM_CREATE_TABLE ||
8666           (lex->sql_command == SQLCOM_CREATE_TABLE &&
8667           (lex->create_info.options & HA_LEX_CREATE_TMP_TABLE)))
8668       {
8669         if (table->table->s->tmp_table)
8670           lex->set_stmt_accessed_table(trans ? LEX::STMT_READS_TEMP_TRANS_TABLE :
8671                                                LEX::STMT_READS_TEMP_NON_TRANS_TABLE);
8672         else
8673           lex->set_stmt_accessed_table(trans ? LEX::STMT_READS_TRANS_TABLE :
8674                                                LEX::STMT_READS_NON_TRANS_TABLE);
8675       }
8676 
8677       if (prev_access_table && prev_access_table->file->ht !=
8678           table->table->file->ht)
8679          multi_access_engine= TRUE;
8680 
8681       prev_access_table= table->table;
8682     }
8683     DBUG_ASSERT(!is_write ||
8684                 write_to_some_transactional_table ||
8685                 write_to_some_non_transactional_table);
8686     /*
8687       write_all_non_transactional_are_tmp_tables may be true if any
8688       non-transactional table was not updated, so we fix its value here.
8689     */
8690     write_all_non_transactional_are_tmp_tables=
8691       write_all_non_transactional_are_tmp_tables &&
8692       write_to_some_non_transactional_table;
8693 
8694     DBUG_PRINT("info", ("flags_write_all_set: 0x%llx", flags_write_all_set));
8695     DBUG_PRINT("info", ("flags_write_some_set: 0x%llx", flags_write_some_set));
8696     DBUG_PRINT("info", ("flags_access_some_set: 0x%llx", flags_access_some_set));
8697     DBUG_PRINT("info", ("multi_write_engine: %d", multi_write_engine));
8698     DBUG_PRINT("info", ("multi_access_engine: %d", multi_access_engine));
8699 
8700     int error= 0;
8701     int unsafe_flags;
8702 
8703     bool multi_stmt_trans= in_multi_stmt_transaction_mode();
8704     bool trans_table= trans_has_updated_trans_table(this);
8705     bool binlog_direct= variables.binlog_direct_non_trans_update;
8706 
8707     if (lex->is_mixed_stmt_unsafe(multi_stmt_trans, binlog_direct,
8708                                   trans_table, tx_isolation))
8709       lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_MIXED_STATEMENT);
8710     else if (multi_stmt_trans && trans_table && !binlog_direct &&
8711              lex->stmt_accessed_table(LEX::STMT_WRITES_NON_TRANS_TABLE))
8712       lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_NONTRANS_AFTER_TRANS);
8713 
8714     /*
8715       If more than one engine is involved in the statement and at
      least one is doing its own logging (is *self-logging*), the
8717       statement cannot be logged atomically, so we generate an error
8718       rather than allowing the binlog to become corrupt.
8719     */
8720     if (multi_write_engine &&
8721         (flags_write_some_set & HA_HAS_OWN_BINLOGGING))
8722       my_error((error= ER_BINLOG_MULTIPLE_ENGINES_AND_SELF_LOGGING_ENGINE),
8723                MYF(0));
8724     else if (multi_access_engine && flags_access_some_set & HA_HAS_OWN_BINLOGGING)
8725       lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_MULTIPLE_ENGINES_AND_SELF_LOGGING_ENGINE);
8726 
8727     DBUG_EXECUTE_IF("make_stmt_only_engines",
8728                     {
8729                       flags_write_all_set= HA_BINLOG_STMT_CAPABLE;
8730                     };);
8731 
8732     /* both statement-only and row-only engines involved */
8733     if ((flags_write_all_set & (HA_BINLOG_STMT_CAPABLE | HA_BINLOG_ROW_CAPABLE)) == 0)
8734     {
8735       /*
8736         1. Error: Binary logging impossible since both row-incapable
8737            engines and statement-incapable engines are involved
8738       */
8739       my_error((error= ER_BINLOG_ROW_ENGINE_AND_STMT_ENGINE), MYF(0));
8740     }
8741     /* statement-only engines involved */
8742     else if ((flags_write_all_set & HA_BINLOG_ROW_CAPABLE) == 0)
8743     {
8744       if (lex->is_stmt_row_injection())
8745       {
8746         /*
8747           4. Error: Cannot execute row injection since table uses
8748              storage engine limited to statement-logging
8749         */
8750         my_error((error= ER_BINLOG_ROW_INJECTION_AND_STMT_ENGINE), MYF(0));
8751       }
8752       else if (variables.binlog_format == BINLOG_FORMAT_ROW &&
8753                sqlcom_can_generate_row_events(this->lex->sql_command))
8754       {
8755         /*
8756           2. Error: Cannot modify table that uses a storage engine
8757              limited to statement-logging when BINLOG_FORMAT = ROW
8758         */
8759         my_error((error= ER_BINLOG_ROW_MODE_AND_STMT_ENGINE), MYF(0));
8760       }
8761       else if (variables.binlog_format == BINLOG_FORMAT_MIXED &&
8762           ((unsafe_flags= lex->get_stmt_unsafe_flags()) != 0))
8763       {
8764         /*
8765           3. Error: Cannot execute statement: binlogging of unsafe
8766              statement is impossible when storage engine is limited to
8767              statement-logging and BINLOG_FORMAT = MIXED.
8768         */
8769         for (int unsafe_type= 0;
8770              unsafe_type < LEX::BINLOG_STMT_UNSAFE_COUNT;
8771              unsafe_type++)
8772           if (unsafe_flags & (1 << unsafe_type))
8773             my_error((error= ER_BINLOG_UNSAFE_AND_STMT_ENGINE), MYF(0),
8774                      ER(LEX::binlog_stmt_unsafe_errcode[unsafe_type]));
8775       }
8776       else if (is_write && ((unsafe_flags= lex->get_stmt_unsafe_flags()) != 0))
8777       {
8778         /*
8779           7. Warning: Unsafe statement logged as statement due to
8780              binlog_format = STATEMENT
8781         */
8782         binlog_unsafe_warning_flags|= unsafe_flags;
8783         DBUG_PRINT("info", ("Scheduling warning to be issued by "
8784                             "binlog_query: '%s'",
8785                             ER(ER_BINLOG_UNSAFE_STATEMENT)));
8786         DBUG_PRINT("info", ("binlog_unsafe_warning_flags: 0x%x",
8787                             binlog_unsafe_warning_flags));
8788       }
8789       /* log in statement format! */
8790     }
8791     /* no statement-only engines */
8792     else
8793     {
8794       /* binlog_format = STATEMENT */
8795       if (variables.binlog_format == BINLOG_FORMAT_STMT)
8796       {
8797         if (lex->is_stmt_row_injection())
8798         {
8799           /*
8800             6. Error: Cannot execute row injection since
8801                BINLOG_FORMAT = STATEMENT
8802           */
8803           my_error((error= ER_BINLOG_ROW_INJECTION_AND_STMT_MODE), MYF(0));
8804         }
8805         else if ((flags_write_all_set & HA_BINLOG_STMT_CAPABLE) == 0 &&
8806                  sqlcom_can_generate_row_events(this->lex->sql_command))
8807         {
8808           /*
8809             5. Error: Cannot modify table that uses a storage engine
8810                limited to row-logging when binlog_format = STATEMENT
8811           */
8812           my_error((error= ER_BINLOG_STMT_MODE_AND_ROW_ENGINE), MYF(0), "");
8813         }
8814         else if (is_write && (unsafe_flags= lex->get_stmt_unsafe_flags()) != 0)
8815         {
8816           /*
8817             7. Warning: Unsafe statement logged as statement due to
8818                binlog_format = STATEMENT
8819           */
8820           binlog_unsafe_warning_flags|= unsafe_flags;
8821           DBUG_PRINT("info", ("Scheduling warning to be issued by "
8822                               "binlog_query: '%s'",
8823                               ER(ER_BINLOG_UNSAFE_STATEMENT)));
8824           DBUG_PRINT("info", ("binlog_unsafe_warning_flags: 0x%x",
8825                               binlog_unsafe_warning_flags));
8826         }
8827         /* log in statement format! */
8828       }
8829       /* No statement-only engines and binlog_format != STATEMENT.
8830          I.e., nothing prevents us from row logging if needed. */
8831       else
8832       {
8833         if (lex->is_stmt_unsafe() || lex->is_stmt_row_injection()
8834             || (flags_write_all_set & HA_BINLOG_STMT_CAPABLE) == 0)
8835         {
8836           /* log in row format! */
8837           set_current_stmt_binlog_format_row_if_mixed();
8838         }
8839       }
8840     }
8841 
8842     if (non_replicated_tables_count > 0)
8843     {
8844       if ((replicated_tables_count == 0) || ! is_write)
8845       {
8846         DBUG_PRINT("info", ("decision: no logging, no replicated table affected"));
8847         set_binlog_local_stmt_filter();
8848       }
8849       else
8850       {
8851         if (! is_current_stmt_binlog_format_row())
8852         {
8853           my_error((error= ER_BINLOG_STMT_MODE_AND_NO_REPL_TABLES), MYF(0));
8854         }
8855         else
8856         {
8857           clear_binlog_local_stmt_filter();
8858         }
8859       }
8860     }
8861     else
8862     {
8863       clear_binlog_local_stmt_filter();
8864     }
8865 
8866     if (!error && enforce_gtid_consistency &&
8867         !is_dml_gtid_compatible(write_to_some_transactional_table,
8868                                 write_to_some_non_transactional_table,
8869                                 write_all_non_transactional_are_tmp_tables))
8870       error= 1;
8871 
8872     if (error) {
8873       DBUG_PRINT("info", ("decision: no logging since an error was generated"));
8874       DBUG_RETURN(-1);
8875     }
8876 
8877     if (is_write &&
8878         lex->sql_command != SQLCOM_END /* rows-event applying by slave */)
8879     {
8880       /*
8881         Master side of DML in the STMT format events parallelization.
8882         All involving table db:s are stored in a abc-ordered name list.
8883         In case the number of databases exceeds MAX_DBS_IN_EVENT_MTS maximum
8884         the list gathering breaks since it won't be sent to the slave.
8885       */
8886       for (TABLE_LIST *table= tables; table; table= table->next_global)
8887       {
8888         if (table->placeholder())
8889           continue;
8890 
8891         DBUG_ASSERT(table->table);
8892 
8893         if (table->table->file->referenced_by_foreign_key())
8894         {
8895           /*
8896              FK-referenced dbs can't be gathered currently. The following
8897              event will be marked for sequential execution on slave.
8898           */
8899           binlog_accessed_db_names= NULL;
8900           add_to_binlog_accessed_dbs("");
8901           break;
8902         }
8903         if (!is_current_stmt_binlog_format_row())
8904           add_to_binlog_accessed_dbs(table->db);
8905       }
8906     }
8907     DBUG_PRINT("info", ("decision: logging in %s format",
8908                         is_current_stmt_binlog_format_row() ?
8909                         "ROW" : "STATEMENT"));
8910 
8911     if (variables.binlog_format == BINLOG_FORMAT_ROW &&
8912         (lex->sql_command == SQLCOM_UPDATE ||
8913          lex->sql_command == SQLCOM_UPDATE_MULTI ||
8914          lex->sql_command == SQLCOM_DELETE ||
8915          lex->sql_command == SQLCOM_DELETE_MULTI))
8916     {
8917       String table_names;
8918       /*
8919         Generate a warning for UPDATE/DELETE statements that modify a
8920         BLACKHOLE table, as row events are not logged in row format.
8921       */
8922       for (TABLE_LIST *table= tables; table; table= table->next_global)
8923       {
8924         if (table->placeholder())
8925           continue;
8926         if (table->table->file->ht->db_type == DB_TYPE_BLACKHOLE_DB &&
8927             table->lock_type >= TL_WRITE_ALLOW_WRITE)
8928         {
8929             table_names.append(table->table_name);
8930             table_names.append(",");
8931         }
8932       }
8933       if (!table_names.is_empty())
8934       {
8935         bool is_update= (lex->sql_command == SQLCOM_UPDATE ||
8936                          lex->sql_command == SQLCOM_UPDATE_MULTI);
8937         /*
8938           Replace the last ',' with '.' for table_names
8939         */
8940         table_names.replace(table_names.length()-1, 1, ".", 1);
8941         push_warning_printf(this, Sql_condition::WARN_LEVEL_WARN,
8942                             WARN_ON_BLOCKHOLE_IN_RBR,
8943                             ER(WARN_ON_BLOCKHOLE_IN_RBR),
8944                             is_update ? "UPDATE" : "DELETE",
8945                             table_names.c_ptr());
8946       }
8947     }
8948   }
8949 #ifndef DBUG_OFF
8950   else
8951     DBUG_PRINT("info", ("decision: no logging since "
8952                         "mysql_bin_log.is_open() = %d "
8953                         "and (options & OPTION_BIN_LOG) = 0x%llx "
8954                         "and binlog_format = %lu "
8955                         "and binlog_filter->db_ok(db) = %d",
8956                         mysql_bin_log.is_open(),
8957                         (variables.option_bits & OPTION_BIN_LOG),
8958                         variables.binlog_format,
8959                         binlog_filter->db_ok(db)));
8960 #endif
8961 
8962   DBUG_RETURN(0);
8963 }
8964 
8965 
is_ddl_gtid_compatible() const8966 bool THD::is_ddl_gtid_compatible() const
8967 {
8968   DBUG_ENTER("THD::is_ddl_gtid_compatible");
8969 
8970   // If @@session.sql_log_bin has been manually turned off (only
8971   // doable by SUPER), then no problem, we can execute any statement.
8972   if ((variables.option_bits & OPTION_BIN_LOG) == 0)
8973     DBUG_RETURN(true);
8974 
8975   if (lex->sql_command == SQLCOM_CREATE_TABLE &&
8976       !(lex->create_info.options & HA_LEX_CREATE_TMP_TABLE) &&
8977       lex->select_lex.item_list.elements)
8978   {
8979     /*
8980       CREATE ... SELECT (without TEMPORARY) is unsafe because if
8981       binlog_format=row it will be logged as a CREATE TABLE followed
8982       by row events, re-executed non-atomically as two transactions,
8983       and then written to the slave's binary log as two separate
8984       transactions with the same GTID.
8985     */
8986     my_error(ER_GTID_UNSAFE_CREATE_SELECT, MYF(0));
8987     DBUG_RETURN(false);
8988   }
8989   if ((lex->sql_command == SQLCOM_CREATE_TABLE &&
8990        (lex->create_info.options & HA_LEX_CREATE_TMP_TABLE) != 0) ||
8991       (lex->sql_command == SQLCOM_DROP_TABLE && lex->drop_temporary))
8992   {
8993     /*
8994       [CREATE|DROP] TEMPORARY TABLE is unsafe to execute
8995       inside a transaction because the table will stay and the
8996       transaction will be written to the slave's binary log with the
8997       GTID even if the transaction is rolled back.
8998       This includes the execution inside Functions and Triggers.
8999     */
9000     if (in_multi_stmt_transaction_mode() || in_sub_stmt)
9001     {
9002       my_error(ER_GTID_UNSAFE_CREATE_DROP_TEMPORARY_TABLE_IN_TRANSACTION,
9003                MYF(0));
9004       DBUG_RETURN(false);
9005     }
9006   }
9007   DBUG_RETURN(true);
9008 }
9009 
9010 
9011 bool
is_dml_gtid_compatible(bool transactional_table,bool non_transactional_table,bool non_transactional_tmp_tables) const9012 THD::is_dml_gtid_compatible(bool transactional_table,
9013                             bool non_transactional_table,
9014                             bool non_transactional_tmp_tables) const
9015 {
9016   DBUG_ENTER("THD::is_dml_gtid_compatible(bool, bool, bool)");
9017 
9018   // If @@session.sql_log_bin has been manually turned off (only
9019   // doable by SUPER), then no problem, we can execute any statement.
9020   if ((variables.option_bits & OPTION_BIN_LOG) == 0)
9021     DBUG_RETURN(true);
9022 
9023   /*
9024     Single non-transactional updates are allowed when not mixed
9025     together with transactional statements within a transaction.
9026     Furthermore, writing to transactional and non-transactional
9027     engines in a single statement is also disallowed.
9028     Multi-statement transactions on non-transactional tables are
9029     split into single-statement transactions when
9030     GTID_NEXT = "AUTOMATIC".
9031 
9032     Non-transactional updates are allowed when row binlog format is
9033     used and all non-transactional tables are temporary.
9034 
9035     The debug symbol "allow_gtid_unsafe_non_transactional_updates"
9036     disables the error.  This is useful because it allows us to run
9037     old tests that were not written with the restrictions of GTIDs in
9038     mind.
9039   */
9040   if (non_transactional_table &&
9041       (transactional_table || trans_has_updated_trans_table(this)) &&
9042       !(non_transactional_tmp_tables && is_current_stmt_binlog_format_row()) &&
9043       !DBUG_EVALUATE_IF("allow_gtid_unsafe_non_transactional_updates", 1, 0))
9044   {
9045     my_error(ER_GTID_UNSAFE_NON_TRANSACTIONAL_TABLE, MYF(0));
9046     DBUG_RETURN(false);
9047   }
9048 
9049   DBUG_RETURN(true);
9050 }
9051 
9052 /*
9053   Implementation of interface to write rows to the binary log through the
9054   thread.  The thread is responsible for writing the rows it has
9055   inserted/updated/deleted.
9056 */
9057 
9058 #ifndef MYSQL_CLIENT
9059 
/*
  Template member function for ensuring that there is a rows log
  event of the appropriate type before proceeding.

  PRE CONDITION:
    - Events of type 'RowsEventT' have the type code 'type_code'.

  POST CONDITION:
    If a non-NULL pointer is returned, the pending event for thread 'thd' will
    be an event of type 'RowsEventT' (which has the type code 'type_code')
    and will either be empty or have enough space to hold 'needed' bytes.  In
    addition, the columns bitmap will be correct for the row, meaning that
    the pending event will be flushed if the columns in the event differ from
    the columns supplied to the function.

  RETURNS
    If no error, a non-NULL pending event (either one which already existed or
    the newly created one).
    If error, NULL.
 */
9080 
9081 template <class RowsEventT> Rows_log_event*
binlog_prepare_pending_rows_event(TABLE * table,uint32 serv_id,size_t needed,bool is_transactional,RowsEventT * hint MY_ATTRIBUTE ((unused)),const uchar * extra_row_info)9082 THD::binlog_prepare_pending_rows_event(TABLE* table, uint32 serv_id,
9083                                        size_t needed,
9084                                        bool is_transactional,
9085 				       RowsEventT *hint MY_ATTRIBUTE((unused)),
9086                                        const uchar* extra_row_info)
9087 {
9088   DBUG_ENTER("binlog_prepare_pending_rows_event");
9089 
9090   /* Fetch the type code for the RowsEventT template parameter */
9091   int const general_type_code= RowsEventT::TYPE_CODE;
9092 
9093   Rows_log_event* pending= binlog_get_pending_rows_event(is_transactional);
9094 
9095   if (unlikely(pending && !pending->is_valid()))
9096     DBUG_RETURN(NULL);
9097 
9098   /*
9099     Check if the current event is non-NULL and a write-rows
9100     event. Also check if the table provided is mapped: if it is not,
9101     then we have switched to writing to a new table.
9102     If there is no pending event, we need to create one. If there is a pending
9103     event, but it's not about the same table id, or not of the same type
9104     (between Write, Update and Delete), or not the same affected columns, or
9105     going to be too big, flush this event to disk and create a new pending
9106     event.
9107   */
9108   if (!pending ||
9109       pending->server_id != serv_id ||
9110       pending->get_table_id() != table->s->table_map_id ||
9111       pending->get_general_type_code() != general_type_code ||
9112       pending->get_data_size() + needed > opt_binlog_rows_event_max_size ||
9113       pending->read_write_bitmaps_cmp(table) == FALSE ||
9114       !binlog_row_event_extra_data_eq(pending->get_extra_row_data(),
9115                                       extra_row_info))
9116   {
9117     /* Create a new RowsEventT... */
9118     Rows_log_event* const
9119 	ev= new RowsEventT(this, table, table->s->table_map_id,
9120                            is_transactional, extra_row_info);
9121     if (unlikely(!ev))
9122       DBUG_RETURN(NULL);
9123     ev->server_id= serv_id; // I don't like this, it's too easy to forget.
9124     /*
9125       flush the pending event and replace it with the newly created
9126       event...
9127     */
9128     if (unlikely(
9129         mysql_bin_log.flush_and_set_pending_rows_event(this, ev,
9130                                                        is_transactional)))
9131     {
9132       delete ev;
9133       DBUG_RETURN(NULL);
9134     }
9135 
9136     DBUG_RETURN(ev);               /* This is the new pending event */
9137   }
9138   DBUG_RETURN(pending);        /* This is the current pending event */
9139 }
9140 
9141 /* Declare in unnamed namespace. */
9142 CPP_UNNAMED_NS_START
9143 
9144   /**
9145      Class to handle temporary allocation of memory for row data.
9146 
9147      The responsibilities of the class is to provide memory for
9148      packing one or two rows of packed data (depending on what
9149      constructor is called).
9150 
9151      In order to make the allocation more efficient for "simple" rows,
9152      i.e., rows that do not contain any blobs, a pointer to the
9153      allocated memory is of memory is stored in the table structure
9154      for simple rows.  If memory for a table containing a blob field
9155      is requested, only memory for that is allocated, and subsequently
9156      released when the object is destroyed.
9157 
9158    */
9159   class Row_data_memory {
9160   public:
9161     /**
9162       Build an object to keep track of a block-local piece of memory
9163       for storing a row of data.
9164 
9165       @param table
9166       Table where the pre-allocated memory is stored.
9167 
9168       @param length
9169       Length of data that is needed, if the record contain blobs.
9170      */
Row_data_memory(TABLE * table,size_t const len1)9171     Row_data_memory(TABLE *table, size_t const len1)
9172       : m_memory(0)
9173     {
9174 #ifndef DBUG_OFF
9175       m_alloc_checked= FALSE;
9176 #endif
9177       allocate_memory(table, len1);
9178       m_ptr[0]= has_memory() ? m_memory : 0;
9179       m_ptr[1]= 0;
9180     }
9181 
Row_data_memory(TABLE * table,size_t const len1,size_t const len2)9182     Row_data_memory(TABLE *table, size_t const len1, size_t const len2)
9183       : m_memory(0)
9184     {
9185 #ifndef DBUG_OFF
9186       m_alloc_checked= FALSE;
9187 #endif
9188       allocate_memory(table, len1 + len2);
9189       m_ptr[0]= has_memory() ? m_memory        : 0;
9190       m_ptr[1]= has_memory() ? m_memory + len1 : 0;
9191     }
9192 
~Row_data_memory()9193     ~Row_data_memory()
9194     {
9195       if (m_memory != 0 && m_release_memory_on_destruction)
9196         my_free(m_memory);
9197     }
9198 
9199     /**
9200        Is there memory allocated?
9201 
9202        @retval true There is memory allocated
9203        @retval false Memory allocation failed
9204      */
has_memory() const9205     bool has_memory() const {
9206 #ifndef DBUG_OFF
9207       m_alloc_checked= TRUE;
9208 #endif
9209       return m_memory != 0;
9210     }
9211 
slot(uint s)9212     uchar *slot(uint s)
9213     {
9214       DBUG_ASSERT(s < sizeof(m_ptr)/sizeof(*m_ptr));
9215       DBUG_ASSERT(m_ptr[s] != 0);
9216       DBUG_ASSERT(m_alloc_checked == TRUE);
9217       return m_ptr[s];
9218     }
9219 
9220   private:
allocate_memory(TABLE * const table,size_t const total_length)9221     void allocate_memory(TABLE *const table, size_t const total_length)
9222     {
9223       if (table->s->blob_fields == 0)
9224       {
9225         /*
9226           The maximum length of a packed record is less than this
9227           length. We use this value instead of the supplied length
9228           when allocating memory for records, since we don't know how
9229           the memory will be used in future allocations.
9230 
9231           Since table->s->reclength is for unpacked records, we have
9232           to add two bytes for each field, which can potentially be
9233           added to hold the length of a packed field.
9234         */
9235         size_t const maxlen= table->s->reclength + 2 * table->s->fields;
9236 
9237         /*
9238           Allocate memory for two records if memory hasn't been
9239           allocated. We allocate memory for two records so that it can
9240           be used when processing update rows as well.
9241         */
9242         if (table->write_row_record == 0)
9243           table->write_row_record=
9244             (uchar *) alloc_root(&table->mem_root, 2 * maxlen);
9245         m_memory= table->write_row_record;
9246         m_release_memory_on_destruction= FALSE;
9247       }
9248       else
9249       {
9250         m_memory= (uchar *) my_malloc(total_length, MYF(MY_WME));
9251         m_release_memory_on_destruction= TRUE;
9252       }
9253     }
9254 
9255 #ifndef DBUG_OFF
9256     mutable bool m_alloc_checked;
9257 #endif
9258     bool m_release_memory_on_destruction;
9259     uchar *m_memory;
9260     uchar *m_ptr[2];
9261   };
9262 
9263 CPP_UNNAMED_NS_END
9264 
binlog_write_row(TABLE * table,bool is_trans,uchar const * record,const uchar * extra_row_info)9265 int THD::binlog_write_row(TABLE* table, bool is_trans,
9266                           uchar const *record,
9267                           const uchar* extra_row_info)
9268 {
9269   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
9270 
9271   /*
9272     Pack records into format for transfer. We are allocating more
9273     memory than needed, but that doesn't matter.
9274   */
9275   Row_data_memory memory(table, max_row_length(table, record));
9276   if (!memory.has_memory())
9277     return HA_ERR_OUT_OF_MEM;
9278 
9279   uchar *row_data= memory.slot(0);
9280 
9281   size_t const len= pack_row(table, table->write_set, row_data, record);
9282 
9283   Rows_log_event* const ev=
9284     binlog_prepare_pending_rows_event(table, server_id, len, is_trans,
9285                                       static_cast<Write_rows_log_event*>(0),
9286                                       extra_row_info);
9287 
9288   if (unlikely(ev == 0))
9289     return HA_ERR_OUT_OF_MEM;
9290 
9291   return ev->add_row_data(row_data, len);
9292 }
9293 
binlog_update_row(TABLE * table,bool is_trans,const uchar * before_record,const uchar * after_record,const uchar * extra_row_info)9294 int THD::binlog_update_row(TABLE* table, bool is_trans,
9295                            const uchar *before_record,
9296                            const uchar *after_record,
9297                            const uchar* extra_row_info)
9298 {
9299   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
9300   int error= 0;
9301 
9302   /**
9303     Save a reference to the original read and write set bitmaps.
9304     We will need this to restore the bitmaps at the end.
9305    */
9306   MY_BITMAP *old_read_set= table->read_set;
9307   MY_BITMAP *old_write_set= table->write_set;
9308 
9309   /**
9310      This will remove spurious fields required during execution but
9311      not needed for binlogging. This is done according to the:
9312      binlog-row-image option.
9313    */
9314   binlog_prepare_row_images(table);
9315 
9316   size_t const before_maxlen = max_row_length(table, before_record);
9317   size_t const after_maxlen  = max_row_length(table, after_record);
9318 
9319   Row_data_memory row_data(table, before_maxlen, after_maxlen);
9320   if (!row_data.has_memory())
9321     return HA_ERR_OUT_OF_MEM;
9322 
9323   uchar *before_row= row_data.slot(0);
9324   uchar *after_row= row_data.slot(1);
9325 
9326   size_t const before_size= pack_row(table, table->read_set, before_row,
9327                                         before_record);
9328   size_t const after_size= pack_row(table, table->write_set, after_row,
9329                                        after_record);
9330 
9331   /*
9332     Don't print debug messages when running valgrind since they can
9333     trigger false warnings.
9334    */
9335 #ifndef HAVE_purify
9336   DBUG_DUMP("before_record", before_record, table->s->reclength);
9337   DBUG_DUMP("after_record",  after_record, table->s->reclength);
9338   DBUG_DUMP("before_row",    before_row, before_size);
9339   DBUG_DUMP("after_row",     after_row, after_size);
9340 #endif
9341 
9342   Rows_log_event* const ev=
9343     binlog_prepare_pending_rows_event(table, server_id,
9344 				      before_size + after_size, is_trans,
9345 				      static_cast<Update_rows_log_event*>(0),
9346                                       extra_row_info);
9347 
9348   if (unlikely(ev == 0))
9349     return HA_ERR_OUT_OF_MEM;
9350 
9351   error= ev->add_row_data(before_row, before_size) ||
9352          ev->add_row_data(after_row, after_size);
9353 
9354   /* restore read/write set for the rest of execution */
9355   table->column_bitmaps_set_no_signal(old_read_set,
9356                                       old_write_set);
9357 
9358   return error;
9359 }
9360 
binlog_delete_row(TABLE * table,bool is_trans,uchar const * record,const uchar * extra_row_info)9361 int THD::binlog_delete_row(TABLE* table, bool is_trans,
9362                            uchar const *record,
9363                            const uchar* extra_row_info)
9364 {
9365   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
9366   int error= 0;
9367 
9368   /**
9369     Save a reference to the original read and write set bitmaps.
9370     We will need this to restore the bitmaps at the end.
9371    */
9372   MY_BITMAP *old_read_set= table->read_set;
9373   MY_BITMAP *old_write_set= table->write_set;
9374 
9375   /**
9376      This will remove spurious fields required during execution but
9377      not needed for binlogging. This is done according to the:
9378      binlog-row-image option.
9379    */
9380   binlog_prepare_row_images(table);
9381 
9382   /*
9383      Pack records into format for transfer. We are allocating more
9384      memory than needed, but that doesn't matter.
9385   */
9386   Row_data_memory memory(table, max_row_length(table, record));
9387   if (unlikely(!memory.has_memory()))
9388     return HA_ERR_OUT_OF_MEM;
9389 
9390   uchar *row_data= memory.slot(0);
9391 
9392   DBUG_DUMP("table->read_set", (uchar*) table->read_set->bitmap, (table->s->fields + 7) / 8);
9393   size_t const len= pack_row(table, table->read_set, row_data, record);
9394 
9395   Rows_log_event* const ev=
9396     binlog_prepare_pending_rows_event(table, server_id, len, is_trans,
9397 				      static_cast<Delete_rows_log_event*>(0),
9398                                       extra_row_info);
9399 
9400   if (unlikely(ev == 0))
9401     return HA_ERR_OUT_OF_MEM;
9402 
9403   error= ev->add_row_data(row_data, len);
9404 
9405   /* restore read/write set for the rest of execution */
9406   table->column_bitmaps_set_no_signal(old_read_set,
9407                                       old_write_set);
9408 
9409   return error;
9410 }
9411 
binlog_prepare_row_images(TABLE * table)9412 void THD::binlog_prepare_row_images(TABLE *table)
9413 {
9414   DBUG_ENTER("THD::binlog_prepare_row_images");
9415   /**
9416     Remove from read_set spurious columns. The write_set has been
9417     handled before in table->mark_columns_needed_for_update.
9418    */
9419 
9420   DBUG_PRINT_BITSET("debug", "table->read_set (before preparing): %s", table->read_set);
9421   THD *thd= table->in_use;
9422 
9423   /**
9424     if there is a primary key in the table (ie, user declared PK or a
9425     non-null unique index) and we dont want to ship the entire image,
9426     and the handler involved supports this.
9427    */
9428   if (table->s->primary_key < MAX_KEY &&
9429       (thd->variables.binlog_row_image < BINLOG_ROW_IMAGE_FULL) &&
9430       !ha_check_storage_engine_flag(table->s->db_type(), HTON_NO_BINLOG_ROW_OPT))
9431   {
9432     /**
9433       Just to be sure that tmp_set is currently not in use as
9434       the read_set already.
9435     */
9436     DBUG_ASSERT(table->read_set != &table->tmp_set);
9437 
9438     bitmap_clear_all(&table->tmp_set);
9439 
9440     switch(thd->variables.binlog_row_image)
9441     {
9442       case BINLOG_ROW_IMAGE_MINIMAL:
9443         /* MINIMAL: Mark only PK */
9444         table->mark_columns_used_by_index_no_reset(table->s->primary_key,
9445                                                    &table->tmp_set);
9446         break;
9447       case BINLOG_ROW_IMAGE_NOBLOB:
9448         /**
9449           NOBLOB: Remove unnecessary BLOB fields from read_set
9450                   (the ones that are not part of PK).
9451          */
9452         bitmap_union(&table->tmp_set, table->read_set);
9453         for (Field **ptr=table->field ; *ptr ; ptr++)
9454         {
9455           Field *field= (*ptr);
9456           if ((field->type() == MYSQL_TYPE_BLOB) &&
9457               !(field->flags & PRI_KEY_FLAG))
9458             bitmap_clear_bit(&table->tmp_set, field->field_index);
9459         }
9460         break;
9461       default:
9462         DBUG_ASSERT(0); // impossible.
9463     }
9464 
9465     /* set the temporary read_set */
9466     table->column_bitmaps_set_no_signal(&table->tmp_set,
9467                                         table->write_set);
9468   }
9469 
9470   DBUG_PRINT_BITSET("debug", "table->read_set (after preparing): %s", table->read_set);
9471   DBUG_VOID_RETURN;
9472 }
9473 
9474 
binlog_flush_pending_rows_event(bool stmt_end,bool is_transactional)9475 int THD::binlog_flush_pending_rows_event(bool stmt_end, bool is_transactional)
9476 {
9477   DBUG_ENTER("THD::binlog_flush_pending_rows_event");
9478   /*
9479     We shall flush the pending event even if we are not in row-based
9480     mode: it might be the case that we left row-based mode before
9481     flushing anything (e.g., if we have explicitly locked tables).
9482    */
9483   if (!mysql_bin_log.is_open())
9484     DBUG_RETURN(0);
9485 
9486   /*
9487     Mark the event as the last event of a statement if the stmt_end
9488     flag is set.
9489   */
9490   int error= 0;
9491   if (Rows_log_event *pending= binlog_get_pending_rows_event(is_transactional))
9492   {
9493     if (stmt_end)
9494     {
9495       pending->set_flags(Rows_log_event::STMT_END_F);
9496       binlog_table_maps= 0;
9497     }
9498 
9499     error= mysql_bin_log.flush_and_set_pending_rows_event(this, 0,
9500                                                           is_transactional);
9501   }
9502 
9503   DBUG_RETURN(error);
9504 }
9505 
9506 
9507 /**
9508    binlog_row_event_extra_data_eq
9509 
9510    Comparator for two binlog row event extra data
9511    pointers.
9512 
9513    It compares their significant bytes.
9514 
9515    Null pointers are acceptable
9516 
9517    @param a
9518      first pointer
9519 
9520    @param b
9521      first pointer
9522 
9523    @return
9524      true if the referenced structures are equal
9525 */
9526 bool
binlog_row_event_extra_data_eq(const uchar * a,const uchar * b)9527 THD::binlog_row_event_extra_data_eq(const uchar* a,
9528                                     const uchar* b)
9529 {
9530   return ((a == b) ||
9531           ((a != NULL) &&
9532            (b != NULL) &&
9533            (a[EXTRA_ROW_INFO_LEN_OFFSET] ==
9534             b[EXTRA_ROW_INFO_LEN_OFFSET]) &&
9535            (memcmp(a, b,
9536                    a[EXTRA_ROW_INFO_LEN_OFFSET]) == 0)));
9537 }
9538 
9539 #if !defined(DBUG_OFF) && !defined(_lint)
9540 static const char *
show_query_type(THD::enum_binlog_query_type qtype)9541 show_query_type(THD::enum_binlog_query_type qtype)
9542 {
9543   switch (qtype) {
9544   case THD::ROW_QUERY_TYPE:
9545     return "ROW";
9546   case THD::STMT_QUERY_TYPE:
9547     return "STMT";
9548   case THD::QUERY_TYPE_COUNT:
9549   default:
9550     DBUG_ASSERT(0 <= qtype && qtype < THD::QUERY_TYPE_COUNT);
9551   }
9552   static char buf[64];
9553   sprintf(buf, "UNKNOWN#%d", qtype);
9554   return buf;
9555 }
9556 #endif
9557 
9558 /**
9559   Auxiliary function to reset the limit unsafety warning suppression.
9560 */
reset_binlog_unsafe_suppression()9561 static void reset_binlog_unsafe_suppression()
9562 {
9563   DBUG_ENTER("reset_binlog_unsafe_suppression");
9564   unsafe_warning_suppression_is_activated= false;
9565   limit_unsafe_warning_count= 0;
9566   limit_unsafe_suppression_start_time= my_getsystime()/10000000;
9567   DBUG_VOID_RETURN;
9568 }
9569 
9570 /**
9571   Auxiliary function to print warning in the error log.
9572 */
print_unsafe_warning_to_log(int unsafe_type,char * buf,char * query)9573 static void print_unsafe_warning_to_log(int unsafe_type, char* buf,
9574                                  char* query)
9575 {
9576   DBUG_ENTER("print_unsafe_warning_in_log");
9577   sprintf(buf, ER(ER_BINLOG_UNSAFE_STATEMENT),
9578           ER(LEX::binlog_stmt_unsafe_errcode[unsafe_type]));
9579   sql_print_warning(ER(ER_MESSAGE_AND_STATEMENT), buf, query);
9580   DBUG_VOID_RETURN;
9581 }
9582 
9583 /**
9584   Auxiliary function to check if the warning for limit unsafety should be
9585   thrown or suppressed. Details of the implementation can be found in the
9586   comments inline.
9587   SYNOPSIS:
9588   @params
9589    buf         - buffer to hold the warning message text
9590    unsafe_type - The type of unsafety.
9591    query       - The actual query statement.
9592 
9593   TODO: Remove this function and implement a general service for all warnings
9594   that would prevent flooding the error log.
9595 */
do_unsafe_limit_checkout(char * buf,int unsafe_type,char * query)9596 static void do_unsafe_limit_checkout(char* buf, int unsafe_type, char* query)
9597 {
9598   ulonglong now;
9599   DBUG_ENTER("do_unsafe_limit_checkout");
9600   DBUG_ASSERT(unsafe_type == LEX::BINLOG_STMT_UNSAFE_LIMIT);
9601   limit_unsafe_warning_count++;
9602   /*
9603     INITIALIZING:
9604     If this is the first time this function is called with log warning
9605     enabled, the monitoring the unsafe warnings should start.
9606   */
9607   if (limit_unsafe_suppression_start_time == 0)
9608   {
9609     limit_unsafe_suppression_start_time= my_getsystime()/10000000;
9610     print_unsafe_warning_to_log(unsafe_type, buf, query);
9611   }
9612   else
9613   {
9614     if (!unsafe_warning_suppression_is_activated)
9615       print_unsafe_warning_to_log(unsafe_type, buf, query);
9616 
9617     if (limit_unsafe_warning_count >=
9618         LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT)
9619     {
9620       now= my_getsystime()/10000000;
9621       if (!unsafe_warning_suppression_is_activated)
9622       {
9623         /*
9624           ACTIVATION:
9625           We got LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT warnings in
9626           less than LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT we activate the
9627           suppression.
9628         */
9629         if ((now-limit_unsafe_suppression_start_time) <=
9630                        LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT)
9631         {
9632           unsafe_warning_suppression_is_activated= true;
9633           DBUG_PRINT("info",("A warning flood has been detected and the limit \
9634 unsafety warning suppression has been activated."));
9635         }
9636         else
9637         {
9638           /*
9639            there is no flooding till now, therefore we restart the monitoring
9640           */
9641           limit_unsafe_suppression_start_time= my_getsystime()/10000000;
9642           limit_unsafe_warning_count= 0;
9643         }
9644       }
9645       else
9646       {
9647         /*
9648           Print the suppression note and the unsafe warning.
9649         */
9650         sql_print_information("The following warning was suppressed %d times \
9651 during the last %d seconds in the error log",
9652                               limit_unsafe_warning_count,
9653                               (int)
9654                               (now-limit_unsafe_suppression_start_time));
9655         print_unsafe_warning_to_log(unsafe_type, buf, query);
9656         /*
9657           DEACTIVATION: We got LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT
9658           warnings in more than  LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT, the
9659           suppression should be deactivated.
9660         */
9661         if ((now - limit_unsafe_suppression_start_time) >
9662             LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT)
9663         {
9664           reset_binlog_unsafe_suppression();
9665           DBUG_PRINT("info",("The limit unsafety warning supression has been \
9666 deactivated"));
9667         }
9668       }
9669       limit_unsafe_warning_count= 0;
9670     }
9671   }
9672   DBUG_VOID_RETURN;
9673 }
9674 
9675 /**
9676   Auxiliary method used by @c binlog_query() to raise warnings.
9677 
9678   The type of warning and the type of unsafeness is stored in
9679   THD::binlog_unsafe_warning_flags.
9680 */
issue_unsafe_warnings()9681 void THD::issue_unsafe_warnings()
9682 {
9683   char buf[MYSQL_ERRMSG_SIZE * 2];
9684   DBUG_ENTER("issue_unsafe_warnings");
9685   /*
9686     Ensure that binlog_unsafe_warning_flags is big enough to hold all
9687     bits.  This is actually a constant expression.
9688   */
9689   DBUG_ASSERT(LEX::BINLOG_STMT_UNSAFE_COUNT <=
9690               sizeof(binlog_unsafe_warning_flags) * CHAR_BIT);
9691 
9692   uint32 unsafe_type_flags= binlog_unsafe_warning_flags;
9693 
9694   /*
9695     For each unsafe_type, check if the statement is unsafe in this way
9696     and issue a warning.
9697   */
9698   for (int unsafe_type=0;
9699        unsafe_type < LEX::BINLOG_STMT_UNSAFE_COUNT;
9700        unsafe_type++)
9701   {
9702     if ((unsafe_type_flags & (1 << unsafe_type)) != 0)
9703     {
9704       push_warning_printf(this, Sql_condition::WARN_LEVEL_NOTE,
9705                           ER_BINLOG_UNSAFE_STATEMENT,
9706                           ER(ER_BINLOG_UNSAFE_STATEMENT),
9707                           ER(LEX::binlog_stmt_unsafe_errcode[unsafe_type]));
9708       if (log_warnings)
9709       {
9710         if (unsafe_type == LEX::BINLOG_STMT_UNSAFE_LIMIT)
9711           do_unsafe_limit_checkout( buf, unsafe_type, query());
9712         else //cases other than LIMIT unsafety
9713           print_unsafe_warning_to_log(unsafe_type, buf, query());
9714       }
9715     }
9716   }
9717   DBUG_VOID_RETURN;
9718 }
9719 
/**
  Log the current query.

  The query will be logged in either row format or statement format
  depending on the value of @c current_stmt_binlog_format_row field and
  the value of the @c qtype parameter.

  This function must be called:

  - After all calls to ha_*_row() functions have been issued.

  - After any writes to system tables. Rationale: if system tables
    were written after a call to this function, and the master crashes
    after the call to this function and before writing the system
    tables, then the master and slave get out of sync.

  - Before tables are unlocked and closed.

  @see decide_logging_format

  @param qtype         ROW_QUERY_TYPE if the statement may already have
                       been logged in row format, STMT_QUERY_TYPE if it
                       must be logged in statement format.
  @param query_arg     Text of the query; must not be NULL.
  @param query_len     Length of the query text, passed on to the
                       Query_log_event.
  @param is_trans      Selects the pending rows event to flush and is
                       passed on to the Query_log_event.
  @param direct        Passed on to the Query_log_event.
  @param suppress_use  Passed on to the Query_log_event.
  @param errcode       Passed on to the Query_log_event.

  @retval 0 Success, or nothing had to be written (statement filtered
  out, or already logged in row format).

  @retval nonzero If there is a failure when writing the query (e.g.,
  write failure), then the error code is returned.
*/
int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
                      ulong query_len, bool is_trans, bool direct,
                      bool suppress_use, int errcode)
{
  DBUG_ENTER("THD::binlog_query");
  DBUG_PRINT("enter", ("qtype: %s  query: '%s'",
                       show_query_type(qtype), query_arg));
  DBUG_ASSERT(query_arg && mysql_bin_log.is_open());

  if (get_binlog_local_stmt_filter() == BINLOG_FILTER_SET)
  {
    /*
      The current statement is to be ignored, and not written to
      the binlog. Do not call issue_unsafe_warnings().
    */
    DBUG_RETURN(0);
  }

  /*
    If we are not in prelocked mode, mysql_unlock_tables() will be
    called after this binlog_query(), so we have to flush the pending
    rows event with the STMT_END_F set to unlock all tables at the
    slave side as well.

    If we are in prelocked mode, the flushing will be done inside the
    top-most close_thread_tables().
  */
  if (this->locked_tables_mode <= LTM_LOCK_TABLES)
    if (int error= binlog_flush_pending_rows_event(TRUE, is_trans))
      DBUG_RETURN(error);

  /*
    Warnings for unsafe statements logged in statement format are
    printed in three places instead of in decide_logging_format().
    This is because the warnings should be printed only if the statement
    is actually logged. When executing decide_logging_format(), we cannot
    know for sure if the statement will be logged:

    1 - sp_head::execute_procedure which prints out warnings for calls to
    stored procedures.

    2 - sp_head::execute_function which prints out warnings for calls
    involving functions.

    3 - THD::binlog_query (here) which prints warnings for top level
    statements not covered by the two cases above: i.e., if not inside a
    procedure or a function.

    Besides, we should not try to print these warnings if it is not
    possible to write statements to the binary log as it happens when
    the execution is inside a function, or generally speaking, when
    the variables.option_bits & OPTION_BIN_LOG is false.
  */
  if ((variables.option_bits & OPTION_BIN_LOG) &&
      sp_runtime_ctx == NULL && !binlog_evt_union.do_union)
    issue_unsafe_warnings();

  switch (qtype) {
    /*
      ROW_QUERY_TYPE means that the statement may be logged either in
      row format or in statement format.  If
      current_stmt_binlog_format is row, it means that the
      statement has already been logged in row format and hence shall
      not be logged again.
    */
  case THD::ROW_QUERY_TYPE:
    DBUG_PRINT("debug",
               ("is_current_stmt_binlog_format_row: %d",
                is_current_stmt_binlog_format_row()));
    if (is_current_stmt_binlog_format_row())
      DBUG_RETURN(0);
    /* Fall through: not logged in row format, so log it as a statement. */

    /*
      STMT_QUERY_TYPE means that the query must be logged in statement
      format; it cannot be logged in row format.  This is typically
      used by DDL statements.  It is an error to use this query type
      if current_stmt_binlog_format_row is row.

      @todo Currently there are places that call this method with
      STMT_QUERY_TYPE and current_stmt_binlog_format is row.  Fix those
      places and add assert to ensure correct behavior. /Sven
    */
  case THD::STMT_QUERY_TYPE:
    /*
      The MYSQL_LOG::write() function will set the STMT_END_F flag and
      flush the pending rows event if necessary.
    */
    {
      Query_log_event qinfo(this, query_arg, query_len, is_trans, direct,
                            suppress_use, errcode);
      /*
        Binlog table maps will be irrelevant after a Query_log_event
        (they are just removed on the slave side) so after the query
        log event is written to the binary log, we pretend that no
        table maps were written.
       */
      int error= mysql_bin_log.write_event(&qinfo);
      binlog_table_maps= 0;
      DBUG_RETURN(error);
    }
    /* Not reached: the block above always returns. */
    break;

  case THD::QUERY_TYPE_COUNT:
  default:
    /* Any other value is not a valid query type. */
    DBUG_ASSERT(0 <= qtype && qtype < QUERY_TYPE_COUNT);
  }
  DBUG_RETURN(0);
}
9854 
9855 #endif /* !defined(MYSQL_CLIENT) */
9856 
/*
  Storage engine descriptor of the binlog pseudo storage engine.  Only the
  handlerton interface version is filled in; the plugin declaration for
  "binlog" points at this object.
*/
struct st_mysql_storage_engine binlog_storage_engine=
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
9859 
9860 /** @} */
9861 
/*
  Plugin declaration that registers "binlog" as a storage engine plugin.
  It uses binlog_init as its init hook and declares no deinit hook and no
  status or system variables.
*/
mysql_declare_plugin(binlog)
{
  MYSQL_STORAGE_ENGINE_PLUGIN,
  &binlog_storage_engine,     /* storage engine descriptor       */
  "binlog",
  "MySQL AB",
  "This is a pseudo storage engine to represent the binlog in a transaction",
  PLUGIN_LICENSE_GPL,
  binlog_init, /* Plugin Init */
  NULL, /* Plugin Deinit */
  0x0100 /* 1.0 */,
  NULL,                       /* status variables                */
  NULL,                       /* system variables                */
  NULL,                       /* config options                  */
  0,                          /* NOTE(review): presumably the plugin flags field; confirm against st_mysql_plugin */
}
mysql_declare_plugin_end;
9879