1 /* Copyright (c) 2000, 2018, Oracle and/or its affiliates.
2    Copyright (c) 2009, 2021, MariaDB Corporation.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA */
16 
17 
18 /**
19   @file
20 
21   @brief
22   logging of commands
23 
24   @todo
25     Abort logging when we get an error in reading or writing log files
26 */
27 
28 #include "mariadb.h"		/* NO_EMBEDDED_ACCESS_CHECKS */
29 #include "sql_priv.h"
30 #include "log.h"
31 #include "sql_base.h"                           // open_log_table
32 #include "sql_repl.h"
33 #include "sql_delete.h"                         // mysql_truncate
34 #include "sql_parse.h"                          // command_name
35 #include "sql_time.h"           // calc_time_from_sec, my_time_compare
36 #include "tztime.h"             // my_tz_OFFSET0, struct Time_zone
37 #include "log_event.h"          // Query_log_event
38 #include "rpl_filter.h"
39 #include "rpl_rli.h"
40 #include "sql_audit.h"
41 #include "mysqld.h"
42 
43 #include <my_dir.h>
44 #include <m_ctype.h>				// For test_if_number
45 
46 #include <set_var.h> // for Sys_last_gtid_ptr
47 
48 #ifdef _WIN32
49 #include "message.h"
50 #endif
51 
52 #include "sql_plugin.h"
53 #include "debug_sync.h"
54 #include "sql_show.h"
55 #include "my_pthread.h"
56 #include "semisync_master.h"
57 #include "sp_rcontext.h"
58 #include "sp_head.h"
59 
60 #include "wsrep_mysqld.h"
61 #ifdef WITH_WSREP
62 #include "wsrep_trans_observer.h"
63 #endif /* WITH_WSREP */
64 
/* max size of the log message */
#define MAX_LOG_BUFFER_SIZE 1024
/*
  NOTE(review): presumably the buffer size for a formatted timestamp; it is
  not referenced in the visible part of this file - confirm.
*/
#define MAX_TIME_SIZE 32
/*
  "No position" marker: a my_off_t with every bit set. binlog_cache_data
  stores it in before_stmt_pos when no statement start position is recorded.
*/
#define MY_OFF_T_UNDEF (~(my_off_t)0UL)
/* Truncate cache log files bigger than this */
#define CACHE_FILE_TRUNC_SIZE 65536

/*
  Expands to the name of flag F followed by a space (as a string literal)
  when (V & F) is non-zero, and to the empty string otherwise.
*/
#define FLAGSTR(V,F) ((V)&(F)?#F" ":"")
73 
/*
  Global objects of the logging subsystem.
  NOTE(review): binlog_hton is presumably assigned in binlog_init() - confirm.
*/
handlerton *binlog_hton;
LOGGER logger;

const char *log_bin_index= 0;
const char *log_bin_basename= 0;

/* The binary log object; constructed with the address of sync_binlog_period */
MYSQL_BIN_LOG mysql_bin_log(&sync_binlog_period);

/*
  Forward declarations of file-local helpers. The binlog_* functions take a
  handlerton and are presumably the callbacks installed on binlog_hton.
*/
static bool test_if_number(const char *str,
			   ulong *res, bool allow_wildcards);
static int binlog_init(void *p);
static int binlog_close_connection(handlerton *hton, THD *thd);
static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv);
static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv);
static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
                                                      THD *thd);
static int binlog_commit(handlerton *hton, THD *thd, bool all);
static int binlog_rollback(handlerton *hton, THD *thd, bool all);
static int binlog_prepare(handlerton *hton, THD *thd, bool all);
static int binlog_xa_recover_dummy(handlerton *hton, XID *xid_list, uint len);
static int binlog_commit_by_xid(handlerton *hton, XID *xid);
static int binlog_rollback_by_xid(handlerton *hton, XID *xid);
static int binlog_start_consistent_snapshot(handlerton *hton, THD *thd);
static int binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
                              Log_event *end_ev, bool all, bool using_stmt,
                              bool using_trx);

/* Message text (pointer and length) for a failed write to the binary log */
static const LEX_CSTRING write_error_msg=
    { STRING_WITH_LEN("error writing to the binary log") };

static my_bool opt_optimize_thread_scheduling= TRUE;
ulong binlog_checksum_options;
#ifndef DBUG_OFF
/* Only present in debug builds (DBUG_OFF not defined) */
ulong opt_binlog_dbug_fsync_sleep= 0;
#endif

mysql_mutex_t LOCK_prepare_ordered;
mysql_cond_t COND_prepare_ordered;
mysql_mutex_t LOCK_after_binlog_sync;
mysql_mutex_t LOCK_commit_ordered;

/*
  Backing storage for the status variables listed in
  binlog_status_vars_detail[].
*/
static ulonglong binlog_status_var_num_commits;
static ulonglong binlog_status_var_num_group_commits;
static ulonglong binlog_status_group_commit_trigger_count;
static ulonglong binlog_status_group_commit_trigger_lock_wait;
static ulonglong binlog_status_group_commit_trigger_timeout;
static char binlog_snapshot_file[FN_REFLEN];
static ulonglong binlog_snapshot_position;

/*
  printf-style template for a fatal logging error: %s is the thing that could
  not be used for logging, %d the error number.
*/
static const char *fatal_log_error=
  "Could not use %s for logging (error %d). "
  "Turning logging off for the whole duration of the MariaDB server process. "
  "To turn it on again: fix the cause, shutdown the MariaDB server and "
  "restart it.";
129 
/*
  Table of binlog status variables: each entry gives a name, the address of
  the static variable that holds the value, and its SHOW type. All values are
  SHOW_LONGLONG counters except snapshot_file, which is a character buffer.
  The last entry has a NullS name and value; presumably it is the end marker
  expected by the status-variable code.
*/
static SHOW_VAR binlog_status_vars_detail[]=
{
  {"commits",
    (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
  {"group_commits",
    (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
  {"group_commit_trigger_count",
    (char *)&binlog_status_group_commit_trigger_count, SHOW_LONGLONG},
  {"group_commit_trigger_lock_wait",
    (char *)&binlog_status_group_commit_trigger_lock_wait, SHOW_LONGLONG},
  {"group_commit_trigger_timeout",
    (char *)&binlog_status_group_commit_trigger_timeout, SHOW_LONGLONG},
  {"snapshot_file",
    (char *)&binlog_snapshot_file, SHOW_CHAR},
  {"snapshot_position",
   (char *)&binlog_snapshot_position, SHOW_LONGLONG},
  {NullS, NullS, SHOW_LONG}
};
148 
/*
  Variables for the binlog background thread.
  Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex.
 */
static bool binlog_background_thread_started= false;
static bool binlog_background_thread_stop= false;
static MYSQL_BIN_LOG::xid_count_per_binlog *
    binlog_background_thread_queue= NULL;

/* Forward declaration; the definition is not in the visible part of the file */
static bool start_binlog_background_thread();

/* Global GTID binlog state; init() is called on it by setup_log_handling() */
static rpl_binlog_state rpl_global_gtid_binlog_state;
161 
/**
  Prepare the file-level logging state for use.

  At present this only calls init() on the global GTID binlog state.
*/
void setup_log_handling()
{
  rpl_global_gtid_binlog_state.init();
}
166 
167 
168 /**
169    purge logs, master and slave sides both, related error code
170    converter.
171    Called from @c purge_error_message(), @c MYSQL_BIN_LOG::reset_logs()
172 
173    @param  res  an internal to purging routines error code
174 
175    @return the user level error code ER_*
176 */
purge_log_get_error_code(int res)177 uint purge_log_get_error_code(int res)
178 {
179   uint errcode= 0;
180 
181   switch (res)  {
182   case 0: break;
183   case LOG_INFO_EOF:	errcode= ER_UNKNOWN_TARGET_BINLOG; break;
184   case LOG_INFO_IO:	errcode= ER_IO_ERR_LOG_INDEX_READ; break;
185   case LOG_INFO_INVALID:errcode= ER_BINLOG_PURGE_PROHIBITED; break;
186   case LOG_INFO_SEEK:	errcode= ER_FSEEK_FAIL; break;
187   case LOG_INFO_MEM:	errcode= ER_OUT_OF_RESOURCES; break;
188   case LOG_INFO_FATAL:	errcode= ER_BINLOG_PURGE_FATAL_ERR; break;
189   case LOG_INFO_IN_USE: errcode= ER_LOG_IN_USE; break;
190   case LOG_INFO_EMFILE: errcode= ER_BINLOG_PURGE_EMFILE; break;
191   default:		errcode= ER_LOG_PURGE_UNKNOWN_ERR; break;
192   }
193 
194   return errcode;
195 }
196 
197 /**
198   Silence all errors and warnings reported when performing a write
199   to a log table.
200   Errors and warnings are not reported to the client or SQL exception
201   handlers, so that the presence of logging does not interfere and affect
202   the logic of an application.
203 */
204 class Silence_log_table_errors : public Internal_error_handler
205 {
206   char m_message[MYSQL_ERRMSG_SIZE];
207 public:
Silence_log_table_errors()208   Silence_log_table_errors()
209   {
210     m_message[0]= '\0';
211   }
212 
~Silence_log_table_errors()213   virtual ~Silence_log_table_errors() {}
214 
215   virtual bool handle_condition(THD *thd,
216                                 uint sql_errno,
217                                 const char* sql_state,
218                                 Sql_condition::enum_warning_level *level,
219                                 const char* msg,
220                                 Sql_condition ** cond_hdl);
message() const221   const char *message() const { return m_message; }
222 };
223 
224 bool
handle_condition(THD *,uint,const char *,Sql_condition::enum_warning_level *,const char * msg,Sql_condition ** cond_hdl)225 Silence_log_table_errors::handle_condition(THD *,
226                                            uint,
227                                            const char*,
228                                            Sql_condition::enum_warning_level*,
229                                            const char* msg,
230                                            Sql_condition ** cond_hdl)
231 {
232   *cond_hdl= NULL;
233   strmake_buf(m_message, msg);
234   return TRUE;
235 }
236 
/*
  Error-log printing functions, in the order information, warning, error.
  NOTE(review): presumably indexed by a log-level value - confirm against
  the callers.
*/
sql_print_message_func sql_print_message_handlers[3] =
{
  sql_print_information,
  sql_print_warning,
  sql_print_error
};
243 
244 
245 /**
246   Create the name of the log file
247 
248   @param[OUT] out    a pointer to a new allocated name will go there
249   @param[IN] log_ext The extension for the file (e.g .log)
250   @param[IN] once    whether to use malloc_once or a normal malloc.
251 */
make_default_log_name(char ** out,const char * log_ext,bool once)252 void make_default_log_name(char **out, const char* log_ext, bool once)
253 {
254   char buff[FN_REFLEN+10];
255   fn_format(buff, opt_log_basename, "", log_ext, MYF(MY_REPLACE_EXT));
256   if (once)
257     *out= my_once_strdup(buff, MYF(MY_WME));
258   else
259   {
260     my_free(*out);
261     *out= my_strdup(PSI_INSTRUMENT_ME, buff, MYF(MY_WME));
262   }
263 }
264 
265 
266 /*
267   Helper classes to store non-transactional and transactional data
268   before copying it to the binary log.
269 */
270 class binlog_cache_data
271 {
272 public:
binlog_cache_data()273   binlog_cache_data(): m_pending(0), status(0),
274   before_stmt_pos(MY_OFF_T_UNDEF),
275   incident(FALSE),
276   saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
277   ptr_binlog_cache_disk_use(0)
278   { }
279 
~binlog_cache_data()280   ~binlog_cache_data()
281   {
282     DBUG_ASSERT(empty());
283     close_cached_file(&cache_log);
284   }
285 
286   /*
287     Return 1 if there is no relevant entries in the cache
288 
289     This is:
290     - Cache is empty
291     - There are row or critical (DDL?) events in the cache
292 
293     The status test is needed to avoid writing entries with only
294     a table map entry, which would crash in do_apply_event() on the slave
295     as it assumes that there is always a row entry after a table map.
296   */
empty() const297   bool empty() const
298   {
299     return (pending() == NULL &&
300             (my_b_write_tell(&cache_log) == 0 ||
301              ((status & (LOGGED_ROW_EVENT | LOGGED_CRITICAL)) == 0)));
302   }
303 
pending() const304   Rows_log_event *pending() const
305   {
306     return m_pending;
307   }
308 
set_pending(Rows_log_event * const pending_arg)309   void set_pending(Rows_log_event *const pending_arg)
310   {
311     m_pending= pending_arg;
312   }
313 
set_incident(void)314   void set_incident(void)
315   {
316     incident= TRUE;
317   }
318 
has_incident(void)319   bool has_incident(void)
320   {
321     return(incident);
322   }
323 
reset()324   void reset()
325   {
326     bool cache_was_empty= empty();
327     bool truncate_file= (cache_log.file != -1 &&
328                          my_b_write_tell(&cache_log) > CACHE_FILE_TRUNC_SIZE);
329     truncate(0,1);                              // Forget what's in cache
330     if (!cache_was_empty)
331       compute_statistics();
332     if (truncate_file)
333       my_chsize(cache_log.file, 0, 0, MYF(MY_WME));
334 
335     status= 0;
336     incident= FALSE;
337     before_stmt_pos= MY_OFF_T_UNDEF;
338     DBUG_ASSERT(empty());
339   }
340 
get_byte_position() const341   my_off_t get_byte_position() const
342   {
343     return my_b_tell(&cache_log);
344   }
345 
get_prev_position()346   my_off_t get_prev_position()
347   {
348      return(before_stmt_pos);
349   }
350 
set_prev_position(my_off_t pos)351   void set_prev_position(my_off_t pos)
352   {
353      before_stmt_pos= pos;
354   }
355 
restore_prev_position()356   void restore_prev_position()
357   {
358     truncate(before_stmt_pos);
359   }
360 
restore_savepoint(my_off_t pos)361   void restore_savepoint(my_off_t pos)
362   {
363     truncate(pos);
364     if (pos < before_stmt_pos)
365       before_stmt_pos= MY_OFF_T_UNDEF;
366   }
367 
set_binlog_cache_info(my_off_t param_max_binlog_cache_size,ulong * param_ptr_binlog_cache_use,ulong * param_ptr_binlog_cache_disk_use)368   void set_binlog_cache_info(my_off_t param_max_binlog_cache_size,
369                              ulong *param_ptr_binlog_cache_use,
370                              ulong *param_ptr_binlog_cache_disk_use)
371   {
372     /*
373       The assertions guarantee that the set_binlog_cache_info is
374       called just once and information passed as parameters are
375       never zero.
376 
377       This is done while calling the constructor binlog_cache_mngr.
378       We cannot set information in the constructor binlog_cache_data
379       because the space for binlog_cache_mngr is allocated through
380       a placement new.
381 
382       In the future, we can refactor this and change it to avoid
383       the set_binlog_info.
384     */
385     DBUG_ASSERT(saved_max_binlog_cache_size == 0);
386     DBUG_ASSERT(param_max_binlog_cache_size != 0);
387     DBUG_ASSERT(ptr_binlog_cache_use == 0);
388     DBUG_ASSERT(param_ptr_binlog_cache_use != 0);
389     DBUG_ASSERT(ptr_binlog_cache_disk_use == 0);
390     DBUG_ASSERT(param_ptr_binlog_cache_disk_use != 0);
391 
392     saved_max_binlog_cache_size= param_max_binlog_cache_size;
393     ptr_binlog_cache_use= param_ptr_binlog_cache_use;
394     ptr_binlog_cache_disk_use= param_ptr_binlog_cache_disk_use;
395     cache_log.end_of_file= saved_max_binlog_cache_size;
396   }
397 
add_status(enum_logged_status status_arg)398   void add_status(enum_logged_status status_arg)
399   {
400     status|= status_arg;
401   }
402 
403   /*
404     Cache to store data before copying it to the binary log.
405   */
406   IO_CACHE cache_log;
407 
408 private:
409   /*
410     Pending binrows event. This event is the event where the rows are currently
411     written.
412    */
413   Rows_log_event *m_pending;
414 
415   /*
416     Bit flags for what has been writting to cache. Used to
417     discard logs without any data changes.
418     see enum_logged_status;
419   */
420   uint32 status;
421 
422   /*
423     Binlog position before the start of the current statement.
424   */
425   my_off_t before_stmt_pos;
426 
427   /*
428     This indicates that some events did not get into the cache and most likely
429     it is corrupted.
430   */
431   bool incident;
432 
433   /**
434     This function computes binlog cache and disk usage.
435   */
compute_statistics()436   void compute_statistics()
437   {
438     statistic_increment(*ptr_binlog_cache_use, &LOCK_status);
439     if (cache_log.disk_writes != 0)
440     {
441 #ifdef REAL_STATISTICS
442       statistic_add(*ptr_binlog_cache_disk_use,
443                     cache_log.disk_writes, &LOCK_status);
444 #else
445       statistic_increment(*ptr_binlog_cache_disk_use, &LOCK_status);
446 #endif
447       cache_log.disk_writes= 0;
448     }
449   }
450 
451   /*
452     Stores the values of maximum size of the cache allowed when this cache
453     is configured. This corresponds to either
454       . max_binlog_cache_size or max_binlog_stmt_cache_size.
455   */
456   my_off_t saved_max_binlog_cache_size;
457 
458   /*
459     Stores a pointer to the status variable that keeps track of the in-memory
460     cache usage. This corresponds to either
461       . binlog_cache_use or binlog_stmt_cache_use.
462   */
463   ulong *ptr_binlog_cache_use;
464 
465   /*
466     Stores a pointer to the status variable that keeps track of the disk
467     cache usage. This corresponds to either
468       . binlog_cache_disk_use or binlog_stmt_cache_disk_use.
469   */
470   ulong *ptr_binlog_cache_disk_use;
471 
472   /*
473     It truncates the cache to a certain position. This includes deleting the
474     pending event.
475    */
truncate(my_off_t pos,bool reset_cache=0)476   void truncate(my_off_t pos, bool reset_cache=0)
477   {
478     DBUG_PRINT("info", ("truncating to position %lu", (ulong) pos));
479     cache_log.error=0;
480     if (pending())
481     {
482       delete pending();
483       set_pending(0);
484     }
485     reinit_io_cache(&cache_log, WRITE_CACHE, pos, 0, reset_cache);
486     cache_log.end_of_file= saved_max_binlog_cache_size;
487   }
488 
489   binlog_cache_data& operator=(const binlog_cache_data& info);
490   binlog_cache_data(const binlog_cache_data& info);
491 };
492 
493 
add_status(enum_logged_status status)494 void Log_event_writer::add_status(enum_logged_status status)
495 {
496   if (likely(cache_data))
497     cache_data->add_status(status);
498 }
499 
set_incident()500 void Log_event_writer::set_incident()
501 {
502   cache_data->set_incident();
503 }
504 
505 
506 class binlog_cache_mngr {
507 public:
binlog_cache_mngr(my_off_t param_max_binlog_stmt_cache_size,my_off_t param_max_binlog_cache_size,ulong * param_ptr_binlog_stmt_cache_use,ulong * param_ptr_binlog_stmt_cache_disk_use,ulong * param_ptr_binlog_cache_use,ulong * param_ptr_binlog_cache_disk_use)508   binlog_cache_mngr(my_off_t param_max_binlog_stmt_cache_size,
509                     my_off_t param_max_binlog_cache_size,
510                     ulong *param_ptr_binlog_stmt_cache_use,
511                     ulong *param_ptr_binlog_stmt_cache_disk_use,
512                     ulong *param_ptr_binlog_cache_use,
513                     ulong *param_ptr_binlog_cache_disk_use)
514     : last_commit_pos_offset(0), using_xa(FALSE), xa_xid(0)
515   {
516      stmt_cache.set_binlog_cache_info(param_max_binlog_stmt_cache_size,
517                                       param_ptr_binlog_stmt_cache_use,
518                                       param_ptr_binlog_stmt_cache_disk_use);
519      trx_cache.set_binlog_cache_info(param_max_binlog_cache_size,
520                                      param_ptr_binlog_cache_use,
521                                      param_ptr_binlog_cache_disk_use);
522      last_commit_pos_file[0]= 0;
523   }
524 
reset(bool do_stmt,bool do_trx)525   void reset(bool do_stmt, bool do_trx)
526   {
527     if (do_stmt)
528       stmt_cache.reset();
529     if (do_trx)
530     {
531       trx_cache.reset();
532       using_xa= FALSE;
533       last_commit_pos_file[0]= 0;
534       last_commit_pos_offset= 0;
535     }
536   }
537 
get_binlog_cache_data(bool is_transactional)538   binlog_cache_data* get_binlog_cache_data(bool is_transactional)
539   {
540     return (is_transactional ? &trx_cache : &stmt_cache);
541   }
542 
get_binlog_cache_log(bool is_transactional)543   IO_CACHE* get_binlog_cache_log(bool is_transactional)
544   {
545     return (is_transactional ? &trx_cache.cache_log : &stmt_cache.cache_log);
546   }
547 
548   binlog_cache_data stmt_cache;
549 
550   binlog_cache_data trx_cache;
551 
552   /*
553     Binlog position for current transaction.
554     For START TRANSACTION WITH CONSISTENT SNAPSHOT, this is the binlog
555     position corresponding to the snapshot taken. During (and after) commit,
556     this is set to the binlog position corresponding to just after the
557     commit (so storage engines can store it in their transaction log).
558   */
559   char last_commit_pos_file[FN_REFLEN];
560   my_off_t last_commit_pos_offset;
561 
562   /*
563     Flag set true if this transaction is committed with log_xid() as part of
564     XA, false if not.
565   */
566   bool using_xa;
567   my_xid xa_xid;
568   bool need_unlog;
569   /*
570     Id of binlog that transaction was written to; only needed if need_unlog is
571     true.
572   */
573   ulong binlog_id;
574   /* Set if we get an error during commit that must be returned from unlog(). */
575   bool delayed_error;
576 
577 private:
578 
579   binlog_cache_mngr& operator=(const binlog_cache_mngr& info);
580   binlog_cache_mngr(const binlog_cache_mngr& info);
581 };
582 
is_log_table_enabled(uint log_table_type)583 bool LOGGER::is_log_table_enabled(uint log_table_type)
584 {
585   switch (log_table_type) {
586   case QUERY_LOG_SLOW:
587     return (table_log_handler != NULL) && global_system_variables.sql_log_slow
588             && (log_output_options & LOG_TABLE);
589   case QUERY_LOG_GENERAL:
590     return (table_log_handler != NULL) && opt_log
591             && (log_output_options & LOG_TABLE);
592   default:
593     DBUG_ASSERT(0);
594     return FALSE;                             /* make compiler happy */
595   }
596 }
597 
598 /**
599    Check if a given table is opened log table
600 
601    @param table             Table to check
602    @param check_if_opened   Only fail if it's a log table in use
603    @param error_msg	    String to put in error message if not ok.
604                             No error message if 0
605    @return 0 ok
606    @return # Type of log file
607  */
608 
check_if_log_table(const TABLE_LIST * table,bool check_if_opened,const char * error_msg)609 int check_if_log_table(const TABLE_LIST *table,
610                        bool check_if_opened,
611                        const char *error_msg)
612 {
613   int result= 0;
614   if (table->db.length == 5 &&
615       !my_strcasecmp(table_alias_charset, table->db.str, "mysql"))
616   {
617     const char *table_name= table->table_name.str;
618 
619     if (table->table_name.length == 11 &&
620         !my_strcasecmp(table_alias_charset, table_name, "general_log"))
621     {
622       result= QUERY_LOG_GENERAL;
623       goto end;
624     }
625 
626     if (table->table_name.length == 8 &&
627         !my_strcasecmp(table_alias_charset, table_name, "slow_log"))
628     {
629       result= QUERY_LOG_SLOW;
630       goto end;
631     }
632   }
633   return 0;
634 
635 end:
636   if (!check_if_opened || logger.is_log_table_enabled(result))
637   {
638     if (error_msg)
639       my_error(ER_BAD_LOG_STATEMENT, MYF(0), error_msg);
640     return result;
641   }
642   return 0;
643 }
644 
645 
/* Constructor: nothing to set up */
Log_to_csv_event_handler::Log_to_csv_event_handler()
{
}
649 
650 
/* Destructor: nothing to release here; see cleanup() */
Log_to_csv_event_handler::~Log_to_csv_event_handler()
{
}
654 
655 
/* Mark the log tables as not initialized in the global logger object */
void Log_to_csv_event_handler::cleanup()
{
  logger.is_log_tables_initialized= FALSE;
}
660 
661 /* log event handlers */
662 
/**
  Log command to the general log table

  Log given command to the general log table.

  @param  thd               thread on whose behalf the log table is opened
                            and written
  @param  event_time        command start timestamp
  @param  user_host         the pointer to the string with user@host info
  @param  user_host_len     length of the user_host string. this is computed
                            once and passed to all general log event handlers
  @param  thread_id_arg     Id of the thread, issued a query
  @param  command_type      the type of the command being logged
  @param  command_type_len  the length of the string above
  @param  sql_text          the very text of the query being executed
  @param  sql_text_len      the length of sql_text string
  @param  client_cs         character set used when storing user_host,
                            command_type and sql_text


  @return This function attempts to never call my_error(). This is
  necessary, because general logging happens already after a statement
  status has been sent to the client, so the client can not see the
  error anyway. Besides, the error is not related to the statement
  being executed and is internal, and thus should be handled
  internally (@todo: how?).
  If a write to the table has failed, the function reports a short
  error message with sql_print_error(), unless the thread has been
  killed. The failure is also indicated in the return value.

  @retval  FALSE   OK
  @retval  TRUE    error occurred
*/

bool Log_to_csv_event_handler::
  log_general(THD *thd, my_hrtime_t event_time, const char *user_host, size_t user_host_len, my_thread_id thread_id_arg,
              const char *command_type, size_t command_type_len,
              const char *sql_text, size_t sql_text_len,
              CHARSET_INFO *client_cs)
{
  TABLE_LIST table_list;
  TABLE *table;
  bool result= TRUE;
  /* The need_* flags record which cleanup steps are required at 'err' */
  bool need_close= FALSE;
  bool need_pop= FALSE;
  bool need_rnd_end= FALSE;
  uint field_index;
  Silence_log_table_errors error_handler;
  Open_tables_backup open_tables_backup;
  bool save_time_zone_used;
  DBUG_ENTER("log_general");

  /*
    CSV uses TIME_to_timestamp() internally if table needs to be repaired
    which will set thd->time_zone_used
  */
  save_time_zone_used= thd->time_zone_used;

  table_list.init_one_table(&MYSQL_SCHEMA_NAME, &GENERAL_LOG_NAME, 0,
                            TL_WRITE_CONCURRENT_INSERT);

  /*
    1) open_log_table generates an error if the
    table can not be opened or is corrupted.
    2) "INSERT INTO general_log" can generate warning sometimes.

    Suppress these warnings and errors, they can't be dealt with
    properly anyway.

    QQ: this problem needs to be studied in more detail.
    Comment these 2 lines and run "cast.test" to see what's happening.
  */
  thd->push_internal_handler(& error_handler);
  need_pop= TRUE;

  if (!(table= open_log_table(thd, &table_list, &open_tables_backup)))
    goto err;

  need_close= TRUE;

  if (table->file->extra(HA_EXTRA_MARK_AS_LOG_TABLE) ||
      table->file->ha_rnd_init_with_error(0))
    goto err;

  need_rnd_end= TRUE;

  /* Honor next number columns if present */
  table->next_number_field= table->found_next_number_field;

  /*
    NOTE: we do not call restore_record() here, as all fields are
    filled by the Logger (=> no need to load default ones).
  */

  /*
    table->field[0] (the timestamp column) is set explicitly from
    event_time below, once the column count has been verified.
  */

  /* check that all columns exist */
  if (table->s->fields < 6)
    goto err;

  DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);

  table->field[0]->store_timestamp(
                  hrtime_to_my_time(event_time), hrtime_sec_part(event_time));

  /* do a write */
  if (table->field[1]->store(user_host, user_host_len, client_cs) ||
      table->field[2]->store((longlong) thread_id_arg, TRUE) ||
      table->field[3]->store((longlong) global_system_variables.server_id,
                             TRUE) ||
      table->field[4]->store(command_type, command_type_len, client_cs))
    goto err;

  /*
    A positive return value in store() means truncation.
    Still logging a message in the log in this case.
  */
  table->field[5]->flags|= FIELDFLAG_HEX_ESCAPE;
  if (table->field[5]->store(sql_text, sql_text_len, client_cs) < 0)
    goto err;

  /* mark all fields as not null */
  table->field[1]->set_notnull();
  table->field[2]->set_notnull();
  table->field[3]->set_notnull();
  table->field[4]->set_notnull();
  table->field[5]->set_notnull();

  /* Set any extra columns to their default values */
  for (field_index= 6 ; field_index < table->s->fields ; field_index++)
  {
    table->field[field_index]->set_default();
  }

  if (table->file->ha_write_row(table->record[0]))
    goto err;

  result= FALSE;

  /*
    Common exit for success and failure: 'result' is still TRUE here unless
    the row was written. Only the steps that were actually started are
    undone.
  */
err:
  if (result && !thd->killed)
    sql_print_error("Failed to write to mysql.general_log: %s",
                    error_handler.message());

  if (need_rnd_end)
  {
    table->file->ha_rnd_end();
    table->file->ha_release_auto_increment();
  }
  if (need_pop)
    thd->pop_internal_handler();
  if (need_close)
    close_log_table(thd, &open_tables_backup);

  thd->time_zone_used= save_time_zone_used;
  DBUG_RETURN(result);
}
819 
820 
821 /*
822   Log a query to the slow log table
823 
824   SYNOPSIS
825     log_slow()
826     thd               THD of the query
827     current_time      current timestamp
828     user_host         the pointer to the string with user@host info
829     user_host_len     length of the user_host string. this is computed once
830                       and passed to all general log event handlers
    query_utime       Amount of time the query took to execute (in microseconds)
    lock_utime        Amount of time the query was locked (in microseconds)
833     is_command        The flag, which determines, whether the sql_text is a
834                       query or an administrator command (these are treated
835                       differently by the old logging routines)
836     sql_text          the very text of the query or administrator command
837                       processed
838     sql_text_len      the length of sql_text string
839 
840   DESCRIPTION
841 
842    Log a query to the slow log table
843 
844   RETURN
845     FALSE - OK
846     TRUE - error occurred
847 */
848 
849 bool Log_to_csv_event_handler::
log_slow(THD * thd,my_hrtime_t current_time,const char * user_host,size_t user_host_len,ulonglong query_utime,ulonglong lock_utime,bool is_command,const char * sql_text,size_t sql_text_len)850   log_slow(THD *thd, my_hrtime_t current_time,
851            const char *user_host, size_t user_host_len,
852            ulonglong query_utime, ulonglong lock_utime, bool is_command,
853            const char *sql_text, size_t sql_text_len)
854 {
855   TABLE_LIST table_list;
856   TABLE *table;
857   bool result= TRUE;
858   bool need_close= FALSE;
859   bool need_rnd_end= FALSE;
860   Silence_log_table_errors error_handler;
861   Open_tables_backup open_tables_backup;
862   CHARSET_INFO *client_cs= thd->variables.character_set_client;
863   bool save_time_zone_used;
864   ulong query_time= (ulong) MY_MIN(query_utime/1000000, TIME_MAX_VALUE_SECONDS);
865   ulong lock_time=  (ulong) MY_MIN(lock_utime/1000000, TIME_MAX_VALUE_SECONDS);
866   ulong query_time_micro= (ulong) (query_utime % 1000000);
867   ulong lock_time_micro=  (ulong) (lock_utime % 1000000);
868   DBUG_ENTER("Log_to_csv_event_handler::log_slow");
869 
870   thd->push_internal_handler(& error_handler);
871   /*
872     CSV uses TIME_to_timestamp() internally if table needs to be repaired
873     which will set thd->time_zone_used
874   */
875   save_time_zone_used= thd->time_zone_used;
876 
877   table_list.init_one_table(&MYSQL_SCHEMA_NAME, &SLOW_LOG_NAME, 0,
878                             TL_WRITE_CONCURRENT_INSERT);
879 
880   if (!(table= open_log_table(thd, &table_list, &open_tables_backup)))
881     goto err;
882 
883   need_close= TRUE;
884 
885   if (table->file->extra(HA_EXTRA_MARK_AS_LOG_TABLE) ||
886       table->file->ha_rnd_init_with_error(0))
887     goto err;
888 
889   need_rnd_end= TRUE;
890 
891   /* Honor next number columns if present */
892   table->next_number_field= table->found_next_number_field;
893 
894   restore_record(table, s->default_values);    // Get empty record
895 
896   /* check that all columns exist */
897   if (table->s->fields < 13)
898     goto err;
899 
900   /* store the time and user values */
901   DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
902   table->field[0]->store_timestamp(
903              hrtime_to_my_time(current_time), hrtime_sec_part(current_time));
904   if (table->field[1]->store(user_host, user_host_len, client_cs))
905     goto err;
906 
907   /*
908     A TIME field can not hold the full longlong range; query_time or
909     lock_time may be truncated without warning here, if greater than
910     839 hours (~35 days)
911   */
912   MYSQL_TIME t;
913   t.neg= 0;
914 
915   /* fill in query_time field */
916   calc_time_from_sec(&t, query_time, query_time_micro);
917   if (table->field[2]->store_time(&t))
918     goto err;
919   /* lock_time */
920   calc_time_from_sec(&t, lock_time, lock_time_micro);
921   if (table->field[3]->store_time(&t))
922     goto err;
923   /* rows_sent */
924   if (table->field[4]->store((longlong) thd->get_sent_row_count(), TRUE))
925     goto err;
926   /* rows_examined */
927   if (table->field[5]->store((longlong) thd->get_examined_row_count(), TRUE))
928     goto err;
929 
930   /* fill database field */
931   if (thd->db.str)
932   {
933     if (table->field[6]->store(thd->db.str, thd->db.length, client_cs))
934       goto err;
935     table->field[6]->set_notnull();
936   }
937 
938   if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
939   {
940     if (table->
941         field[7]->store((longlong)
942                         thd->first_successful_insert_id_in_prev_stmt_for_binlog,
943                         TRUE))
944       goto err;
945     table->field[7]->set_notnull();
946   }
947 
948   /*
949     Set value if we do an insert on autoincrement column. Note that for
950     some engines (those for which get_auto_increment() does not leave a
951     table lock until the statement ends), this is just the first value and
952     the next ones used may not be contiguous to it.
953   */
954   if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
955   {
956     if (table->
957         field[8]->store((longlong)
958           thd->auto_inc_intervals_in_cur_stmt_for_binlog.minimum(), TRUE))
959       goto err;
960     table->field[8]->set_notnull();
961   }
962 
963   if (table->field[9]->store((longlong)global_system_variables.server_id, TRUE))
964     goto err;
965   table->field[9]->set_notnull();
966 
967   /*
968     Column sql_text.
969     A positive return value in store() means truncation.
970     Still logging a message in the log in this case.
971   */
972   if (table->field[10]->store(sql_text, sql_text_len, client_cs) < 0)
973     goto err;
974 
975   if (table->field[11]->store((longlong) thd->thread_id, TRUE))
976     goto err;
977 
978   /* Rows_affected */
979   if (table->field[12]->store(thd->get_stmt_da()->is_ok() ?
980                               (longlong) thd->get_stmt_da()->affected_rows() :
981                               0, TRUE))
982     goto err;
983 
984   if (table->file->ha_write_row(table->record[0]))
985     goto err;
986 
987   result= FALSE;
988 
989 err:
990   thd->pop_internal_handler();
991 
992   if (result && !thd->killed)
993     sql_print_error("Failed to write to mysql.slow_log: %s",
994                     error_handler.message());
995 
996   if (need_rnd_end)
997   {
998     table->file->ha_rnd_end();
999     table->file->ha_release_auto_increment();
1000   }
1001   if (need_close)
1002     close_log_table(thd, &open_tables_backup);
1003   thd->time_zone_used= save_time_zone_used;
1004   DBUG_RETURN(result);
1005 }
1006 
1007 int Log_to_csv_event_handler::
activate_log(THD * thd,uint log_table_type)1008   activate_log(THD *thd, uint log_table_type)
1009 {
1010   TABLE_LIST table_list;
1011   TABLE *table;
1012   LEX_CSTRING *UNINIT_VAR(log_name);
1013   int result;
1014   Open_tables_backup open_tables_backup;
1015 
1016   DBUG_ENTER("Log_to_csv_event_handler::activate_log");
1017 
1018   if (log_table_type == QUERY_LOG_GENERAL)
1019   {
1020     log_name= &GENERAL_LOG_NAME;
1021   }
1022   else
1023   {
1024     DBUG_ASSERT(log_table_type == QUERY_LOG_SLOW);
1025 
1026     log_name= &SLOW_LOG_NAME;
1027   }
1028   table_list.init_one_table(&MYSQL_SCHEMA_NAME, log_name, 0, TL_WRITE_CONCURRENT_INSERT);
1029 
1030   table= open_log_table(thd, &table_list, &open_tables_backup);
1031   if (table)
1032   {
1033     result= 0;
1034     close_log_table(thd, &open_tables_backup);
1035   }
1036   else
1037     result= 1;
1038 
1039   DBUG_RETURN(result);
1040 }
1041 
1042 bool Log_to_csv_event_handler::
log_error(enum loglevel level,const char * format,va_list args)1043   log_error(enum loglevel level, const char *format, va_list args)
1044 {
1045   /* No log table is implemented */
1046   DBUG_ASSERT(0);
1047   return FALSE;
1048 }
1049 
1050 bool Log_to_file_event_handler::
log_error(enum loglevel level,const char * format,va_list args)1051   log_error(enum loglevel level, const char *format,
1052             va_list args)
1053 {
1054   return vprint_msg_to_log(level, format, args);
1055 }
1056 
init_pthread_objects()1057 void Log_to_file_event_handler::init_pthread_objects()
1058 {
1059   mysql_log.init_pthread_objects();
1060   mysql_slow_log.init_pthread_objects();
1061 }
1062 
1063 
1064 /** Wrapper around MYSQL_LOG::write() for slow log. */
1065 
1066 bool Log_to_file_event_handler::
log_slow(THD * thd,my_hrtime_t current_time,const char * user_host,size_t user_host_len,ulonglong query_utime,ulonglong lock_utime,bool is_command,const char * sql_text,size_t sql_text_len)1067   log_slow(THD *thd, my_hrtime_t current_time,
1068            const char *user_host, size_t user_host_len,
1069            ulonglong query_utime, ulonglong lock_utime, bool is_command,
1070            const char *sql_text, size_t sql_text_len)
1071 {
1072   Silence_log_table_errors error_handler;
1073   thd->push_internal_handler(&error_handler);
1074   bool retval= mysql_slow_log.write(thd, hrtime_to_my_time(current_time),
1075                                     user_host, user_host_len,
1076                                     query_utime, lock_utime, is_command,
1077                                     sql_text, sql_text_len);
1078   thd->pop_internal_handler();
1079   return retval;
1080 }
1081 
1082 
1083 /**
1084    Wrapper around MYSQL_LOG::write() for general log. We need it since we
1085    want all log event handlers to have the same signature.
1086 */
1087 
1088 bool Log_to_file_event_handler::
log_general(THD * thd,my_hrtime_t event_time,const char * user_host,size_t user_host_len,my_thread_id thread_id_arg,const char * command_type,size_t command_type_len,const char * sql_text,size_t sql_text_len,CHARSET_INFO * client_cs)1089   log_general(THD *thd, my_hrtime_t event_time, const char *user_host, size_t user_host_len, my_thread_id thread_id_arg,
1090               const char *command_type, size_t command_type_len,
1091               const char *sql_text, size_t sql_text_len,
1092               CHARSET_INFO *client_cs)
1093 {
1094   Silence_log_table_errors error_handler;
1095   thd->push_internal_handler(&error_handler);
1096   bool retval= mysql_log.write(hrtime_to_time(event_time), user_host,
1097                                user_host_len,
1098                                thread_id_arg, command_type, command_type_len,
1099                                sql_text, sql_text_len);
1100   thd->pop_internal_handler();
1101   return retval;
1102 }
1103 
1104 
init()1105 bool Log_to_file_event_handler::init()
1106 {
1107   if (!is_initialized)
1108   {
1109     if (global_system_variables.sql_log_slow)
1110       mysql_slow_log.open_slow_log(opt_slow_logname);
1111 
1112     if (opt_log)
1113       mysql_log.open_query_log(opt_logname);
1114 
1115     is_initialized= TRUE;
1116   }
1117 
1118   return FALSE;
1119 }
1120 
1121 
cleanup()1122 void Log_to_file_event_handler::cleanup()
1123 {
1124   mysql_log.cleanup();
1125   mysql_slow_log.cleanup();
1126 }
1127 
flush()1128 void Log_to_file_event_handler::flush()
1129 {
1130   /* reopen log files */
1131   if (opt_log)
1132     mysql_log.reopen_file();
1133   if (global_system_variables.sql_log_slow)
1134     mysql_slow_log.reopen_file();
1135 }
1136 
1137 /*
1138   Log error with all enabled log event handlers
1139 
1140   SYNOPSIS
1141     error_log_print()
1142 
1143     level             The level of the error significance: NOTE,
1144                       WARNING or ERROR.
1145     format            format string for the error message
1146     args              list of arguments for the format string
1147 
1148   RETURN
1149     FALSE - OK
1150     TRUE - error occurred
1151 */
1152 
error_log_print(enum loglevel level,const char * format,va_list args)1153 bool LOGGER::error_log_print(enum loglevel level, const char *format,
1154                              va_list args)
1155 {
1156   bool error= FALSE;
1157   Log_event_handler **current_handler;
1158   THD *thd= current_thd;
1159 
1160   if (likely(thd))
1161     thd->error_printed_to_log= 1;
1162 
1163   /* currently we don't need locking here as there is no error_log table */
1164   for (current_handler= error_log_handler_list ; *current_handler ;)
1165     error= (*current_handler++)->log_error(level, format, args) || error;
1166 
1167   return error;
1168 }
1169 
1170 
cleanup_base()1171 void LOGGER::cleanup_base()
1172 {
1173   DBUG_ASSERT(inited == 1);
1174   mysql_rwlock_destroy(&LOCK_logger);
1175   if (table_log_handler)
1176   {
1177     table_log_handler->cleanup();
1178     delete table_log_handler;
1179     table_log_handler= NULL;
1180   }
1181   if (file_log_handler)
1182     file_log_handler->cleanup();
1183 }
1184 
1185 
cleanup_end()1186 void LOGGER::cleanup_end()
1187 {
1188   DBUG_ASSERT(inited == 1);
1189   if (file_log_handler)
1190   {
1191     delete file_log_handler;
1192     file_log_handler=NULL;
1193   }
1194   inited= 0;
1195 }
1196 
1197 
1198 /**
1199   Perform basic log initialization: create file-based log handler and
1200   init error log.
1201 */
init_base()1202 void LOGGER::init_base()
1203 {
1204   DBUG_ASSERT(inited == 0);
1205   inited= 1;
1206 
1207   /*
1208     Here we create file log handler. We don't do it for the table log handler
1209     here as it cannot be created so early. The reason is THD initialization,
1210     which depends on the system variables (parsed later).
1211   */
1212   if (!file_log_handler)
1213     file_log_handler= new Log_to_file_event_handler;
1214 
1215   /* by default we use traditional error log */
1216   init_error_log(LOG_FILE);
1217 
1218   file_log_handler->init_pthread_objects();
1219   mysql_rwlock_init(key_rwlock_LOCK_logger, &LOCK_logger);
1220 }
1221 
1222 
init_log_tables()1223 void LOGGER::init_log_tables()
1224 {
1225   if (!table_log_handler)
1226     table_log_handler= new Log_to_csv_event_handler;
1227 
1228   if (!is_log_tables_initialized &&
1229       !table_log_handler->init() && !file_log_handler->init())
1230     is_log_tables_initialized= TRUE;
1231 }
1232 
1233 
1234 /**
1235   Close and reopen the slow log (with locks).
1236 
1237   @returns FALSE.
1238 */
flush_slow_log()1239 bool LOGGER::flush_slow_log()
1240 {
1241   /*
1242     Now we lock logger, as nobody should be able to use logging routines while
1243     log tables are closed
1244   */
1245   logger.lock_exclusive();
1246 
1247   /* Reopen slow log file */
1248   if (global_system_variables.sql_log_slow)
1249     file_log_handler->get_mysql_slow_log()->reopen_file();
1250 
1251   /* End of log flush */
1252   logger.unlock();
1253 
1254   return 0;
1255 }
1256 
1257 
1258 /**
1259   Close and reopen the general log (with locks).
1260 
1261   @returns FALSE.
1262 */
flush_general_log()1263 bool LOGGER::flush_general_log()
1264 {
1265   /*
1266     Now we lock logger, as nobody should be able to use logging routines while
1267     log tables are closed
1268   */
1269   logger.lock_exclusive();
1270 
1271   /* Reopen general log file */
1272   if (opt_log)
1273     file_log_handler->get_mysql_log()->reopen_file();
1274 
1275   /* End of log flush */
1276   logger.unlock();
1277 
1278   return 0;
1279 }
1280 
1281 
1282 /*
1283   Log slow query with all enabled log event handlers
1284 
1285   SYNOPSIS
1286     slow_log_print()
1287 
1288     thd                 THD of the query being logged
1289     query               The query being logged
1290     query_length        The length of the query string
1291     current_utime       Current time in microseconds (from undefined start)
1292 
1293   RETURN
1294     FALSE   OK
1295     TRUE    error occurred
1296 */
1297 
slow_log_print(THD * thd,const char * query,size_t query_length,ulonglong current_utime)1298 bool LOGGER::slow_log_print(THD *thd, const char *query, size_t query_length,
1299                             ulonglong current_utime)
1300 
1301 {
1302   bool error= FALSE;
1303   Log_event_handler **current_handler;
1304   bool is_command= FALSE;
1305   char user_host_buff[MAX_USER_HOST_SIZE + 1];
1306   Security_context *sctx= thd->security_ctx;
1307   uint user_host_len= 0;
1308   ulonglong query_utime, lock_utime;
1309 
1310   DBUG_ASSERT(thd->enable_slow_log);
1311   /*
1312     Print the message to the buffer if we have slow log enabled
1313   */
1314 
1315   if (*slow_log_handler_list)
1316   {
1317     /* do not log slow queries from replication threads */
1318     if (!thd->variables.sql_log_slow)
1319       return 0;
1320 
1321     lock_shared();
1322     if (!global_system_variables.sql_log_slow)
1323     {
1324       unlock();
1325       return 0;
1326     }
1327 
1328     /* fill in user_host value: the format is "%s[%s] @ %s [%s]" */
1329     user_host_len= (uint)(strxnmov(user_host_buff, MAX_USER_HOST_SIZE,
1330                              sctx->priv_user, "[",
1331                              sctx->user ? sctx->user : (thd->slave_thread ? "SQL_SLAVE" : ""), "] @ ",
1332                              sctx->host ? sctx->host : "", " [",
1333                              sctx->ip ? sctx->ip : "", "]", NullS) -
1334                     user_host_buff);
1335 
1336     DBUG_ASSERT(thd->start_utime);
1337     DBUG_ASSERT(thd->start_time);
1338     query_utime= (current_utime - thd->start_utime);
1339     lock_utime=  (thd->utime_after_lock - thd->start_utime);
1340     my_hrtime_t current_time= { hrtime_from_time(thd->start_time) +
1341                                 thd->start_time_sec_part + query_utime };
1342 
1343     if (!query || thd->get_command() == COM_STMT_PREPARE)
1344     {
1345       is_command= TRUE;
1346       query= command_name[thd->get_command()].str;
1347       query_length= (uint)command_name[thd->get_command()].length;
1348     }
1349 
1350     for (current_handler= slow_log_handler_list; *current_handler ;)
1351       error= (*current_handler++)->log_slow(thd, current_time,
1352                                             user_host_buff, user_host_len,
1353                                             query_utime, lock_utime, is_command,
1354                                             query, query_length) || error;
1355 
1356     unlock();
1357   }
1358   return error;
1359 }
1360 
general_log_write(THD * thd,enum enum_server_command command,const char * query,size_t query_length)1361 bool LOGGER::general_log_write(THD *thd, enum enum_server_command command,
1362                                const char *query, size_t query_length)
1363 {
1364   bool error= FALSE;
1365   Log_event_handler **current_handler= general_log_handler_list;
1366   char user_host_buff[MAX_USER_HOST_SIZE + 1];
1367   uint user_host_len= 0;
1368   my_hrtime_t current_time;
1369 
1370   DBUG_ASSERT(thd);
1371 
1372   user_host_len= make_user_name(thd, user_host_buff);
1373 
1374   current_time= my_hrtime();
1375 
1376   mysql_audit_general_log(thd, hrtime_to_time(current_time),
1377                           user_host_buff, user_host_len,
1378                           command_name[(uint) command].str,
1379                           (uint)command_name[(uint) command].length,
1380                           query, (uint)query_length);
1381 
1382   if (opt_log && log_command(thd, command))
1383   {
1384     lock_shared();
1385     while (*current_handler)
1386       error|= (*current_handler++)->
1387         log_general(thd, current_time, user_host_buff,
1388                     user_host_len, thd->thread_id,
1389                     command_name[(uint) command].str,
1390                     command_name[(uint) command].length,
1391                     query, query_length,
1392                     thd->variables.character_set_client) || error;
1393     unlock();
1394   }
1395 
1396   return error;
1397 }
1398 
general_log_print(THD * thd,enum enum_server_command command,const char * format,va_list args)1399 bool LOGGER::general_log_print(THD *thd, enum enum_server_command command,
1400                                const char *format, va_list args)
1401 {
1402   size_t message_buff_len= 0;
1403   char message_buff[MAX_LOG_BUFFER_SIZE];
1404 
1405   /* prepare message */
1406   if (format)
1407     message_buff_len= my_vsnprintf(message_buff, sizeof(message_buff),
1408                                    format, args);
1409   else
1410     message_buff[0]= '\0';
1411 
1412   return general_log_write(thd, command, message_buff, message_buff_len);
1413 }
1414 
init_error_log(ulonglong error_log_printer)1415 void LOGGER::init_error_log(ulonglong error_log_printer)
1416 {
1417   if (error_log_printer & LOG_NONE)
1418   {
1419     error_log_handler_list[0]= 0;
1420     return;
1421   }
1422 
1423   switch (error_log_printer) {
1424   case LOG_FILE:
1425     error_log_handler_list[0]= file_log_handler;
1426     error_log_handler_list[1]= 0;
1427     break;
1428     /* these two are disabled for now */
1429   case LOG_TABLE:
1430     DBUG_ASSERT(0);
1431     break;
1432   case LOG_TABLE|LOG_FILE:
1433     DBUG_ASSERT(0);
1434     break;
1435   }
1436 }
1437 
init_slow_log(ulonglong slow_log_printer)1438 void LOGGER::init_slow_log(ulonglong slow_log_printer)
1439 {
1440   if (slow_log_printer & LOG_NONE)
1441   {
1442     slow_log_handler_list[0]= 0;
1443     return;
1444   }
1445 
1446   switch (slow_log_printer) {
1447   case LOG_FILE:
1448     slow_log_handler_list[0]= file_log_handler;
1449     slow_log_handler_list[1]= 0;
1450     break;
1451   case LOG_TABLE:
1452     slow_log_handler_list[0]= table_log_handler;
1453     slow_log_handler_list[1]= 0;
1454     break;
1455   case LOG_TABLE|LOG_FILE:
1456     slow_log_handler_list[0]= file_log_handler;
1457     slow_log_handler_list[1]= table_log_handler;
1458     slow_log_handler_list[2]= 0;
1459     break;
1460   }
1461 }
1462 
init_general_log(ulonglong general_log_printer)1463 void LOGGER::init_general_log(ulonglong general_log_printer)
1464 {
1465   if (general_log_printer & LOG_NONE)
1466   {
1467     general_log_handler_list[0]= 0;
1468     return;
1469   }
1470 
1471   switch (general_log_printer) {
1472   case LOG_FILE:
1473     general_log_handler_list[0]= file_log_handler;
1474     general_log_handler_list[1]= 0;
1475     break;
1476   case LOG_TABLE:
1477     general_log_handler_list[0]= table_log_handler;
1478     general_log_handler_list[1]= 0;
1479     break;
1480   case LOG_TABLE|LOG_FILE:
1481     general_log_handler_list[0]= file_log_handler;
1482     general_log_handler_list[1]= table_log_handler;
1483     general_log_handler_list[2]= 0;
1484     break;
1485   }
1486 }
1487 
1488 
activate_log_handler(THD * thd,uint log_type)1489 bool LOGGER::activate_log_handler(THD* thd, uint log_type)
1490 {
1491   MYSQL_QUERY_LOG *file_log;
1492   bool res= FALSE;
1493   lock_exclusive();
1494   switch (log_type) {
1495   case QUERY_LOG_SLOW:
1496     if (!global_system_variables.sql_log_slow)
1497     {
1498       file_log= file_log_handler->get_mysql_slow_log();
1499 
1500       file_log->open_slow_log(opt_slow_logname);
1501       if (table_log_handler->activate_log(thd, QUERY_LOG_SLOW))
1502       {
1503         /* Error printed by open table in activate_log() */
1504         res= TRUE;
1505         file_log->close(0);
1506       }
1507       else
1508       {
1509         init_slow_log(log_output_options);
1510         global_system_variables.sql_log_slow= TRUE;
1511       }
1512     }
1513     break;
1514   case QUERY_LOG_GENERAL:
1515     if (!opt_log)
1516     {
1517       file_log= file_log_handler->get_mysql_log();
1518 
1519       file_log->open_query_log(opt_logname);
1520       if (table_log_handler->activate_log(thd, QUERY_LOG_GENERAL))
1521       {
1522         /* Error printed by open table in activate_log() */
1523         res= TRUE;
1524         file_log->close(0);
1525       }
1526       else
1527       {
1528         init_general_log(log_output_options);
1529         opt_log= TRUE;
1530       }
1531     }
1532     break;
1533   default:
1534     DBUG_ASSERT(0);
1535   }
1536   unlock();
1537   return res;
1538 }
1539 
1540 
deactivate_log_handler(THD * thd,uint log_type)1541 void LOGGER::deactivate_log_handler(THD *thd, uint log_type)
1542 {
1543   my_bool *tmp_opt= 0;
1544   MYSQL_LOG *UNINIT_VAR(file_log);
1545 
1546   switch (log_type) {
1547   case QUERY_LOG_SLOW:
1548     tmp_opt= &global_system_variables.sql_log_slow;
1549     file_log= file_log_handler->get_mysql_slow_log();
1550     break;
1551   case QUERY_LOG_GENERAL:
1552     tmp_opt= &opt_log;
1553     file_log= file_log_handler->get_mysql_log();
1554     break;
1555   default:
1556     MY_ASSERT_UNREACHABLE();
1557   }
1558 
1559   if (!(*tmp_opt))
1560     return;
1561 
1562   lock_exclusive();
1563   file_log->close(0);
1564   *tmp_opt= FALSE;
1565   unlock();
1566 }
1567 
1568 
1569 /* the parameters are unused for the log tables */
init()1570 bool Log_to_csv_event_handler::init()
1571 {
1572   return 0;
1573 }
1574 
set_handlers(ulonglong error_log_printer,ulonglong slow_log_printer,ulonglong general_log_printer)1575 int LOGGER::set_handlers(ulonglong error_log_printer,
1576                          ulonglong slow_log_printer,
1577                          ulonglong general_log_printer)
1578 {
1579   /* error log table is not supported yet */
1580   DBUG_ASSERT(error_log_printer < LOG_TABLE);
1581 
1582   lock_exclusive();
1583 
1584   if ((slow_log_printer & LOG_TABLE || general_log_printer & LOG_TABLE) &&
1585       !is_log_tables_initialized)
1586   {
1587     slow_log_printer= (slow_log_printer & ~LOG_TABLE) | LOG_FILE;
1588     general_log_printer= (general_log_printer & ~LOG_TABLE) | LOG_FILE;
1589 
1590     sql_print_error("Failed to initialize log tables. "
1591                     "Falling back to the old-fashioned logs");
1592   }
1593 
1594   init_error_log(error_log_printer);
1595   init_slow_log(slow_log_printer);
1596   init_general_log(general_log_printer);
1597 
1598   unlock();
1599 
1600   return 0;
1601 }
1602 
1603  /*
1604   Save position of binary log transaction cache.
1605 
1606   SYNPOSIS
1607     binlog_trans_log_savepos()
1608 
1609     thd      The thread to take the binlog data from
1610     pos      Pointer to variable where the position will be stored
1611 
1612   DESCRIPTION
1613 
1614     Save the current position in the binary log transaction cache into
1615     the variable pointed to by 'pos'
1616  */
1617 
1618 static void
binlog_trans_log_savepos(THD * thd,my_off_t * pos)1619 binlog_trans_log_savepos(THD *thd, my_off_t *pos)
1620 {
1621   DBUG_ENTER("binlog_trans_log_savepos");
1622   DBUG_ASSERT(pos != NULL);
1623   binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();
1624   DBUG_ASSERT((WSREP(thd) && wsrep_emulate_bin_log) || mysql_bin_log.is_open());
1625   *pos= cache_mngr->trx_cache.get_byte_position();
1626   DBUG_PRINT("return", ("*pos: %lu", (ulong) *pos));
1627   DBUG_VOID_RETURN;
1628 }
1629 
1630 
1631 /*
1632   Truncate the binary log transaction cache.
1633 
1634   SYNPOSIS
1635     binlog_trans_log_truncate()
1636 
1637     thd      The thread to take the binlog data from
1638     pos      Position to truncate to
1639 
1640   DESCRIPTION
1641 
1642     Truncate the binary log to the given position. Will not change
1643     anything else.
1644 
1645  */
1646 static void
binlog_trans_log_truncate(THD * thd,my_off_t pos)1647 binlog_trans_log_truncate(THD *thd, my_off_t pos)
1648 {
1649   DBUG_ENTER("binlog_trans_log_truncate");
1650   DBUG_PRINT("enter", ("pos: %lu", (ulong) pos));
1651 
1652   DBUG_ASSERT(thd_get_ha_data(thd, binlog_hton) != NULL);
1653   /* Only true if binlog_trans_log_savepos() wasn't called before */
1654   DBUG_ASSERT(pos != ~(my_off_t) 0);
1655 
1656   binlog_cache_mngr *const cache_mngr=
1657     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
1658   cache_mngr->trx_cache.restore_savepoint(pos);
1659   DBUG_VOID_RETURN;
1660 }
1661 
1662 
1663 /*
1664   this function is mostly a placeholder.
1665   conceptually, binlog initialization (now mostly done in MYSQL_BIN_LOG::open)
1666   should be moved here.
1667 */
1668 
binlog_init(void * p)1669 int binlog_init(void *p)
1670 {
1671   binlog_hton= (handlerton *)p;
1672   binlog_hton->savepoint_offset= sizeof(my_off_t);
1673   binlog_hton->close_connection= binlog_close_connection;
1674   binlog_hton->savepoint_set= binlog_savepoint_set;
1675   binlog_hton->savepoint_rollback= binlog_savepoint_rollback;
1676   binlog_hton->savepoint_rollback_can_release_mdl=
1677                                      binlog_savepoint_rollback_can_release_mdl;
1678   binlog_hton->commit= binlog_commit;
1679   binlog_hton->rollback= binlog_rollback;
1680   binlog_hton->drop_table= [](handlerton *, const char*) { return -1; };
1681   if (WSREP_ON || opt_bin_log)
1682   {
1683     binlog_hton->prepare= binlog_prepare;
1684     binlog_hton->start_consistent_snapshot= binlog_start_consistent_snapshot;
1685     binlog_hton->commit_by_xid= binlog_commit_by_xid;
1686     binlog_hton->rollback_by_xid= binlog_rollback_by_xid;
1687     // recover needs to be set to make xa{commit,rollback}_handlerton effective
1688     binlog_hton->recover= binlog_xa_recover_dummy;
1689   }
1690   binlog_hton->flags= HTON_NOT_USER_SELECTABLE | HTON_HIDDEN | HTON_NO_ROLLBACK;
1691   return 0;
1692 }
1693 
1694 #ifdef WITH_WSREP
1695 #include "wsrep_binlog.h"
1696 #endif /* WITH_WSREP */
binlog_close_connection(handlerton * hton,THD * thd)1697 static int binlog_close_connection(handlerton *hton, THD *thd)
1698 {
1699   DBUG_ENTER("binlog_close_connection");
1700   binlog_cache_mngr *const cache_mngr=
1701     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
1702 #ifdef WITH_WSREP
1703   if (WSREP(thd) && cache_mngr && !cache_mngr->trx_cache.empty()) {
1704     IO_CACHE* cache= cache_mngr->get_binlog_cache_log(true);
1705     uchar *buf;
1706     size_t len=0;
1707     wsrep_write_cache_buf(cache, &buf, &len);
1708     WSREP_WARN("binlog trx cache not empty (%zu bytes) @ connection close %lld",
1709                len, (longlong) thd->thread_id);
1710     if (len > 0) wsrep_dump_rbr_buf(thd, buf, len);
1711 
1712     cache = cache_mngr->get_binlog_cache_log(false);
1713     wsrep_write_cache_buf(cache, &buf, &len);
1714     WSREP_WARN("binlog stmt cache not empty (%zu bytes) @ connection close %lld",
1715                len, (longlong) thd->thread_id);
1716     if (len > 0) wsrep_dump_rbr_buf(thd, buf, len);
1717   }
1718 #endif /* WITH_WSREP */
1719   DBUG_ASSERT(cache_mngr->trx_cache.empty());
1720   DBUG_ASSERT(cache_mngr->stmt_cache.empty());
1721   cache_mngr->~binlog_cache_mngr();
1722   my_free(cache_mngr);
1723   DBUG_RETURN(0);
1724 }
1725 
1726 /*
1727   This function flushes a cache upon commit/rollback.
1728 
1729   SYNOPSIS
1730     binlog_flush_cache()
1731 
1732     thd        The thread whose transaction should be ended
1733     cache_mngr Pointer to the binlog_cache_mngr to use
1734     all        True if the entire transaction should be ended, false if
1735                only the statement transaction should be ended.
1736     end_ev     The end event to use (COMMIT, ROLLBACK, or commit XID)
1737     using_stmt True if the statement cache should be flushed
1738     using_trx  True if the transaction cache should be flushed
1739 
1740   DESCRIPTION
1741 
1742     End the currently transaction or statement. The transaction can be either
1743     a real transaction or a statement transaction.
1744 
1745     This can be to commit a transaction, with a COMMIT query event or an XA
1746     commit XID event. But it can also be to rollback a transaction with a
1747     ROLLBACK query event, used for rolling back transactions which also
1748     contain updates to non-transactional tables. Or it can be a flush of
1749     a statement cache.
1750  */
1751 
1752 static int
binlog_flush_cache(THD * thd,binlog_cache_mngr * cache_mngr,Log_event * end_ev,bool all,bool using_stmt,bool using_trx)1753 binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
1754                    Log_event *end_ev, bool all, bool using_stmt,
1755                    bool using_trx)
1756 {
1757   int error= 0;
1758   DBUG_ENTER("binlog_flush_cache");
1759   DBUG_PRINT("enter", ("end_ev: %p", end_ev));
1760 
1761   if ((using_stmt && !cache_mngr->stmt_cache.empty()) ||
1762       (using_trx && !cache_mngr->trx_cache.empty())   ||
1763       thd->transaction->xid_state.is_explicit_XA())
1764   {
1765     if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE))
1766       DBUG_RETURN(1);
1767     if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE))
1768       DBUG_RETURN(1);
1769 
1770     /*
1771       Doing a commit or a rollback including non-transactional tables,
1772       i.e., ending a transaction where we might write the transaction
1773       cache to the binary log.
1774 
1775       We can always end the statement when ending a transaction since
1776       transactions are not allowed inside stored functions.  If they
1777       were, we would have to ensure that we're not ending a statement
1778       inside a stored function.
1779     */
1780     error= mysql_bin_log.write_transaction_to_binlog(thd, cache_mngr,
1781                                                      end_ev, all,
1782                                                      using_stmt, using_trx);
1783   }
1784   else
1785   {
1786     /*
1787       This can happen in row-format binlog with something like
1788           BEGIN; INSERT INTO nontrans_table; INSERT IGNORE INTO trans_table;
1789       The nontrans_table is written directly into the binlog before commit,
1790       and if the trans_table is ignored there will be no rows to write when
1791       we get here.
1792 
1793       So there is no work to do. Therefore, we will not increment any XID
1794       count, so we must not decrement any XID count in unlog().
1795     */
1796     cache_mngr->need_unlog= 0;
1797   }
1798   cache_mngr->reset(using_stmt, using_trx);
1799 
1800   DBUG_ASSERT(!using_stmt || cache_mngr->stmt_cache.empty());
1801   DBUG_ASSERT(!using_trx || cache_mngr->trx_cache.empty());
1802   DBUG_RETURN(error);
1803 }
1804 
1805 
1806 /**
1807   This function flushes the stmt-cache upon commit.
1808 
1809   @param thd                The thread whose transaction should be flushed
1810   @param cache_mngr         Pointer to the cache manager
1811 
1812   @return
1813     nonzero if an error pops up when flushing the cache.
1814 */
1815 static inline int
binlog_commit_flush_stmt_cache(THD * thd,bool all,binlog_cache_mngr * cache_mngr)1816 binlog_commit_flush_stmt_cache(THD *thd, bool all,
1817                                binlog_cache_mngr *cache_mngr)
1818 {
1819   DBUG_ENTER("binlog_commit_flush_stmt_cache");
1820 #ifdef WITH_WSREP
1821   if (thd->wsrep_mysql_replicated > 0)
1822   {
1823     DBUG_ASSERT(WSREP(thd));
1824     WSREP_DEBUG("avoiding binlog_commit_flush_trx_cache: %d",
1825                 thd->wsrep_mysql_replicated);
1826     return 0;
1827   }
1828 #endif
1829 
1830   Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
1831                           FALSE, TRUE, TRUE, 0);
1832   DBUG_RETURN(binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, FALSE));
1833 }
1834 
1835 
serialize_with_xid(XID * xid,char * buf,const char * query,size_t q_len)1836 inline size_t serialize_with_xid(XID *xid, char *buf,
1837                                  const char *query, size_t q_len)
1838 {
1839   memcpy(buf, query, q_len);
1840 
1841   return
1842     q_len + strlen(static_cast<event_xid_t*>(xid)->serialize(buf + q_len));
1843 }
1844 
1845 
1846 /**
1847   This function flushes the trx-cache upon commit.
1848 
1849   @param thd                The thread whose transaction should be flushed
1850   @param cache_mngr         Pointer to the cache manager
1851 
1852   @return
1853     nonzero if an error pops up when flushing the cache.
1854 */
1855 static inline int
binlog_commit_flush_trx_cache(THD * thd,bool all,binlog_cache_mngr * cache_mngr)1856 binlog_commit_flush_trx_cache(THD *thd, bool all, binlog_cache_mngr *cache_mngr)
1857 {
1858   DBUG_ENTER("binlog_commit_flush_trx_cache");
1859 
1860   const char query[]= "XA COMMIT ";
1861   const size_t q_len= sizeof(query) - 1; // do not count trailing 0
1862   char buf[q_len + ser_buf_size]= "COMMIT";
1863   size_t buflen= sizeof("COMMIT") - 1;
1864 
1865   if (thd->lex->sql_command == SQLCOM_XA_COMMIT &&
1866       thd->lex->xa_opt != XA_ONE_PHASE)
1867   {
1868     DBUG_ASSERT(thd->transaction->xid_state.is_explicit_XA());
1869     DBUG_ASSERT(thd->transaction->xid_state.get_state_code() ==
1870                 XA_PREPARED);
1871 
1872     buflen= serialize_with_xid(thd->transaction->xid_state.get_xid(),
1873                                buf, query, q_len);
1874   }
1875   Query_log_event end_evt(thd, buf, buflen, TRUE, TRUE, TRUE, 0);
1876 
1877   DBUG_RETURN(binlog_flush_cache(thd, cache_mngr, &end_evt, all, FALSE, TRUE));
1878 }
1879 
1880 
1881 /**
1882   This function flushes the trx-cache upon rollback.
1883 
1884   @param thd                The thread whose transaction should be flushed
1885   @param cache_mngr         Pointer to the cache manager
1886 
1887   @return
1888     nonzero if an error pops up when flushing the cache.
1889 */
1890 static inline int
binlog_rollback_flush_trx_cache(THD * thd,bool all,binlog_cache_mngr * cache_mngr)1891 binlog_rollback_flush_trx_cache(THD *thd, bool all,
1892                                 binlog_cache_mngr *cache_mngr)
1893 {
1894   const char query[]= "XA ROLLBACK ";
1895   const size_t q_len= sizeof(query) - 1; // do not count trailing 0
1896   char buf[q_len + ser_buf_size]= "ROLLBACK";
1897   size_t buflen= sizeof("ROLLBACK") - 1;
1898 
1899   if (thd->transaction->xid_state.is_explicit_XA())
1900   {
1901     /* for not prepared use plain ROLLBACK */
1902     if (thd->transaction->xid_state.get_state_code() == XA_PREPARED)
1903       buflen= serialize_with_xid(thd->transaction->xid_state.get_xid(),
1904                                  buf, query, q_len);
1905   }
1906   Query_log_event end_evt(thd, buf, buflen, TRUE, TRUE, TRUE, 0);
1907 
1908   return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, FALSE, TRUE));
1909 }
1910 
1911 /**
1912   This function flushes the trx-cache upon commit.
1913 
1914   @param thd                The thread whose transaction should be flushed
1915   @param cache_mngr         Pointer to the cache manager
1916   @param xid                Transaction Id
1917 
1918   @return
1919     nonzero if an error pops up when flushing the cache.
1920 */
1921 static inline int
binlog_commit_flush_xid_caches(THD * thd,binlog_cache_mngr * cache_mngr,bool all,my_xid xid)1922 binlog_commit_flush_xid_caches(THD *thd, binlog_cache_mngr *cache_mngr,
1923                                bool all, my_xid xid)
1924 {
1925   DBUG_ASSERT(xid); // replaced former treatment of ONE-PHASE XA
1926 
1927   Xid_log_event end_evt(thd, xid, TRUE);
1928   return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, TRUE));
1929 }
1930 
1931 /**
1932   This function truncates the transactional cache upon committing or rolling
1933   back either a transaction or a statement.
1934 
1935   @param thd        The thread whose transaction should be flushed
1936   @param cache_mngr Pointer to the cache data to be flushed
1937   @param all        @c true means truncate the transaction, otherwise the
1938                     statement must be truncated.
1939 
1940   @return
1941     nonzero if an error pops up when truncating the transactional cache.
1942 */
1943 static int
binlog_truncate_trx_cache(THD * thd,binlog_cache_mngr * cache_mngr,bool all)1944 binlog_truncate_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
1945 {
1946   DBUG_ENTER("binlog_truncate_trx_cache");
1947   int error=0;
1948   /*
1949     This function handles transactional changes and as such this flag
1950     equals to true.
1951   */
1952   bool const is_transactional= TRUE;
1953 
1954   DBUG_PRINT("info", ("thd->options={ %s %s}, transaction: %s",
1955                       FLAGSTR(thd->variables.option_bits, OPTION_NOT_AUTOCOMMIT),
1956                       FLAGSTR(thd->variables.option_bits, OPTION_BEGIN),
1957                       all ? "all" : "stmt"));
1958 
1959   thd->binlog_remove_pending_rows_event(TRUE, is_transactional);
1960   /*
1961     If rolling back an entire transaction or a single statement not
1962     inside a transaction, we reset the transaction cache.
1963   */
1964   if (ending_trans(thd, all))
1965   {
1966     if (cache_mngr->trx_cache.has_incident())
1967       error= mysql_bin_log.write_incident(thd);
1968 
1969     thd->reset_binlog_for_next_statement();
1970 
1971     cache_mngr->reset(false, true);
1972   }
1973   /*
1974     If rolling back a statement in a transaction, we truncate the
1975     transaction cache to remove the statement.
1976   */
1977   else
1978     cache_mngr->trx_cache.restore_prev_position();
1979 
1980   DBUG_ASSERT(thd->binlog_get_pending_rows_event(is_transactional) == NULL);
1981   DBUG_RETURN(error);
1982 }
1983 
1984 
is_preparing_xa(THD * thd)1985 inline bool is_preparing_xa(THD *thd)
1986 {
1987   return
1988     thd->transaction->xid_state.is_explicit_XA() &&
1989     thd->lex->sql_command == SQLCOM_XA_PREPARE;
1990 }
1991 
1992 
binlog_prepare(handlerton * hton,THD * thd,bool all)1993 static int binlog_prepare(handlerton *hton, THD *thd, bool all)
1994 {
1995   /* Do nothing unless the transaction is a user XA. */
1996   return is_preparing_xa(thd) ? binlog_commit(NULL, thd, all) : 0;
1997 }
1998 
1999 
binlog_xa_recover_dummy(handlerton * hton,XID * xid_list,uint len)2000 static int binlog_xa_recover_dummy(handlerton *hton __attribute__((unused)),
2001                              XID *xid_list __attribute__((unused)),
2002                              uint len __attribute__((unused)))
2003 {
2004   /* Does nothing. */
2005   return 0;
2006 }
2007 
2008 
binlog_commit_by_xid(handlerton * hton,XID * xid)2009 static int binlog_commit_by_xid(handlerton *hton, XID *xid)
2010 {
2011   THD *thd= current_thd;
2012 
2013   (void) thd->binlog_setup_trx_data();
2014 
2015   DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_COMMIT);
2016 
2017   return binlog_commit(hton, thd, TRUE);
2018 }
2019 
2020 
binlog_rollback_by_xid(handlerton * hton,XID * xid)2021 static int binlog_rollback_by_xid(handlerton *hton, XID *xid)
2022 {
2023   THD *thd= current_thd;
2024 
2025   (void) thd->binlog_setup_trx_data();
2026 
2027   DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_ROLLBACK ||
2028               (thd->transaction->xid_state.get_state_code() == XA_ROLLBACK_ONLY));
2029   return binlog_rollback(hton, thd, TRUE);
2030 }
2031 
2032 
is_prepared_xa(THD * thd)2033 inline bool is_prepared_xa(THD *thd)
2034 {
2035   return thd->transaction->xid_state.is_explicit_XA() &&
2036     thd->transaction->xid_state.get_state_code() == XA_PREPARED;
2037 }
2038 
2039 
2040 /*
2041   We flush the cache wrapped in a beging/rollback if:
2042     . aborting a single or multi-statement transaction and;
2043     . the OPTION_KEEP_LOG is active or;
2044     . the format is STMT and a non-trans table was updated or;
2045     . the format is MIXED and a temporary non-trans table was
2046       updated or;
2047     . the format is MIXED, non-trans table was updated and
2048       aborting a single statement transaction;
2049 */
trans_cannot_safely_rollback(THD * thd,bool all)2050 static bool trans_cannot_safely_rollback(THD *thd, bool all)
2051 {
2052   DBUG_ASSERT(ending_trans(thd, all));
2053 
2054   return ((thd->variables.option_bits & OPTION_KEEP_LOG) ||
2055           (trans_has_updated_non_trans_table(thd) &&
2056            thd->wsrep_binlog_format() == BINLOG_FORMAT_STMT) ||
2057           (thd->transaction->all.has_modified_non_trans_temp_table() &&
2058            thd->wsrep_binlog_format() == BINLOG_FORMAT_MIXED) ||
2059           (trans_has_updated_non_trans_table(thd) &&
2060            ending_single_stmt_trans(thd,all) &&
2061            thd->wsrep_binlog_format() == BINLOG_FORMAT_MIXED) ||
2062           is_prepared_xa(thd));
2063 }
2064 
2065 
2066 /**
2067   Specific log flusher invoked through log_xa_prepare().
2068 */
binlog_commit_flush_xa_prepare(THD * thd,bool all,binlog_cache_mngr * cache_mngr)2069 static int binlog_commit_flush_xa_prepare(THD *thd, bool all,
2070                                           binlog_cache_mngr *cache_mngr)
2071 {
2072   XID *xid= thd->transaction->xid_state.get_xid();
2073   {
2074     // todo assert wsrep_simulate || is_open()
2075 
2076     /*
2077       Log the XA END event first.
2078       We don't do that in trans_xa_end() as XA COMMIT ONE PHASE
2079       is logged as simple BEGIN/COMMIT so the XA END should
2080       not get to the log.
2081     */
2082     const char query[]= "XA END ";
2083     const size_t q_len= sizeof(query) - 1; // do not count trailing 0
2084     char buf[q_len + ser_buf_size];
2085     size_t buflen;
2086     binlog_cache_data *cache_data;
2087     IO_CACHE *file;
2088 
2089     memcpy(buf, query, q_len);
2090     buflen= q_len +
2091       strlen(static_cast<event_xid_t*>(xid)->serialize(buf + q_len));
2092     cache_data= cache_mngr->get_binlog_cache_data(true);
2093     file= &cache_data->cache_log;
2094     thd->lex->sql_command= SQLCOM_XA_END;
2095     Query_log_event xa_end(thd, buf, buflen, true, false, true, 0);
2096     if (mysql_bin_log.write_event(&xa_end, cache_data, file))
2097       return 1;
2098     thd->lex->sql_command= SQLCOM_XA_PREPARE;
2099   }
2100 
2101   cache_mngr->using_xa= FALSE;
2102   XA_prepare_log_event end_evt(thd, xid, FALSE);
2103 
2104   return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, TRUE));
2105 }
2106 
2107 
/**
  This function is called once after each statement.

  It has the responsibility to flush the caches to the binary log on commits.

  The statement cache is flushed first if it is not empty. The transactional
  cache is flushed only when the transaction is ending (see ending_trans())
  and no error happened so far; otherwise its content keeps accumulating.

  @param hton  The binlog handlerton. Not referenced by this function;
               binlog_prepare() passes NULL.
  @param thd   The client thread that executes the transaction.
  @param all   This is @c true if this is a real transaction commit, and
               @c false otherwise.

  @return
    0 on success or when there is nothing to do, otherwise the error
    reported by the cache flush helpers.

  @see handlerton::commit
*/
static int binlog_commit(handlerton *hton, THD *thd, bool all)
{
  int error= 0;
  PSI_stage_info org_stage;
  DBUG_ENTER("binlog_commit");

  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  /*
    No cache manager attached to this thread: nothing was cached, so there
    is nothing to flush. Debug builds check that, outside of WSREP, this
    does not happen for XA PREPARE or XA COMMIT ... ONE PHASE.
  */
  if (!cache_mngr)
  {
    DBUG_ASSERT(WSREP(thd) ||
                (thd->lex->sql_command != SQLCOM_XA_PREPARE &&
                !(thd->lex->sql_command == SQLCOM_XA_COMMIT &&
                  thd->lex->xa_opt == XA_ONE_PHASE)));

    DBUG_RETURN(0);
  }
  /*
    This is true if we are doing an alter table that is replicated as
    CREATE TABLE ... SELECT
  */
  if (thd->variables.option_bits & OPTION_BIN_COMMIT_OFF)
    DBUG_RETURN(0);

  DBUG_PRINT("debug",
             ("all: %d, in_transaction: %s, all.modified_non_trans_table: %s, stmt.modified_non_trans_table: %s",
              all,
              YESNO(thd->in_multi_stmt_transaction_mode()),
              YESNO(thd->transaction->all.modified_non_trans_table),
              YESNO(thd->transaction->stmt.modified_non_trans_table)));


  /*
    Remember the current stage; every return path below restores it with
    THD_STAGE_INFO(thd, org_stage).
  */
  thd->backup_stage(&org_stage);
  THD_STAGE_INFO(thd, stage_binlog_write);
  if (!cache_mngr->stmt_cache.empty())
  {
    error= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
  }

  /*
    Nothing in the transactional cache and no prepared XA transaction to
    terminate: only reset the transactional cache and return whatever the
    statement cache flush reported.
  */
  if (cache_mngr->trx_cache.empty() &&
      thd->transaction->xid_state.get_state_code() != XA_PREPARED)
  {
    /*
      we're here because cache_log was flushed in MYSQL_BIN_LOG::log_xid()
    */
    cache_mngr->reset(false, true);
    THD_STAGE_INFO(thd, org_stage);
    DBUG_RETURN(error);
  }

  /*
    We commit the transaction if:
     - We are not in a transaction and committing a statement, or
     - We are in a transaction and a full transaction is committed.
    Otherwise, we accumulate the changes.
  */
  if (likely(!error) && ending_trans(thd, all))
  {
    /* XA PREPARE has its own terminating events, see the helper */
    error= is_preparing_xa(thd) ?
      binlog_commit_flush_xa_prepare(thd, all, cache_mngr) :
      binlog_commit_flush_trx_cache (thd, all, cache_mngr);
  }
  /*
    This is part of the stmt rollback.
    NOTE(review): presumably this invalidates the position that
    trx_cache.restore_prev_position() would go back to - confirm.
  */
  if (!all)
    cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);

  THD_STAGE_INFO(thd, org_stage);
  DBUG_RETURN(error);
}
2192 
/**
  This function is called when a transaction or a statement is rolled back.

  The statement cache is handled first (incident written or cache flushed).
  The transactional cache is then either flushed with a terminating
  ROLLBACK event, truncated, or left alone, depending on the conditions
  documented inline below.

  @param hton  The binlog handlerton. Not referenced by this function.
  @param thd   The client thread that executes the transaction.
  @param all   This is @c true if this is a real transaction rollback, and
               @c false otherwise.

  @return
    0 on success or when there is nothing to do, nonzero if writing an
    incident, flushing or truncating a cache reported an error.

  @see handlerton::rollback
*/
static int binlog_rollback(handlerton *hton, THD *thd, bool all)
{
  DBUG_ENTER("binlog_rollback");

  int error= 0;
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  /*
    No cache manager attached to this thread: nothing to roll back in the
    binlog. Debug builds check this only happens under WSREP and never for
    XA ROLLBACK.
  */
  if (!cache_mngr)
  {
    DBUG_ASSERT(WSREP(thd));
    DBUG_ASSERT(thd->lex->sql_command != SQLCOM_XA_ROLLBACK);

    DBUG_RETURN(0);
  }

  DBUG_PRINT("debug", ("all: %s, all.modified_non_trans_table: %s, stmt.modified_non_trans_table: %s",
                       YESNO(all),
                       YESNO(thd->transaction->all.modified_non_trans_table),
                       YESNO(thd->transaction->stmt.modified_non_trans_table)));

  /*
    If an incident event is set we do not flush the content of the statement
    cache because it may be corrupted.
  */
  if (cache_mngr->stmt_cache.has_incident())
  {
    error |= static_cast<int>(mysql_bin_log.write_incident(thd));
    cache_mngr->reset(true, false);
  }
  else if (!cache_mngr->stmt_cache.empty())
  {
    error |= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
  }

  /*
    Nothing in the transactional cache and no prepared XA transaction to
    terminate: reset the transactional cache and the per-statement binlog
    state, then return what the statement cache handling reported.
  */
  if (cache_mngr->trx_cache.empty() &&
      thd->transaction->xid_state.get_state_code() != XA_PREPARED)
  {
    /*
      we're here because cache_log was flushed in MYSQL_BIN_LOG::log_xid()
    */
    cache_mngr->reset(false, true);
    thd->reset_binlog_for_next_statement();
    DBUG_RETURN(error);
  }
  if (!wsrep_emulate_bin_log && mysql_bin_log.check_write_error(thd))
  {
    /*
      "all == true" means that a "rollback statement" triggered the error and
      this function was called. However, this must not happen as a rollback
      is written directly to the binary log. And in auto-commit mode, a single
      statement that is rolled back has the flag all == false.
    */
    DBUG_ASSERT(!all);
    /*
      We reach this point if the effect of a statement did not properly get into
      a cache and need to be rolled back.
    */
    error |= binlog_truncate_trx_cache(thd, cache_mngr, all);
  }
  else if (likely(!error))
  {
    /*
      The transaction is ending but its cached events cannot simply be
      dropped (see trans_cannot_safely_rollback()): write them out,
      terminated by a ROLLBACK event.
    */
    if (ending_trans(thd, all) && trans_cannot_safely_rollback(thd, all))
      error= binlog_rollback_flush_trx_cache(thd, all, cache_mngr);
    /*
      Truncate the cache if:
        . aborting a single or multi-statement transaction or;
        . the current statement created or dropped a temporary table
          while having actual STATEMENT format;
        . the format is not STMT or no non-trans table was
          updated and;
        . the format is not MIXED or no temporary non-trans table
          was updated.
    */
    else if (ending_trans(thd, all) ||
             (!(thd->transaction->stmt.has_created_dropped_temp_table() &&
                !thd->is_current_stmt_binlog_format_row()) &&
              (!stmt_has_updated_non_trans_table(thd) ||
               thd->wsrep_binlog_format() != BINLOG_FORMAT_STMT) &&
              (!thd->transaction->stmt.has_modified_non_trans_temp_table() ||
               thd->wsrep_binlog_format() != BINLOG_FORMAT_MIXED)))
      error= binlog_truncate_trx_cache(thd, cache_mngr, all);
  }

  /*
    This is part of the stmt rollback.
    NOTE(review): presumably this invalidates the position that
    trx_cache.restore_prev_position() would go back to - confirm.
  */
  if (!all)
    cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
  thd->reset_binlog_for_next_statement();

  DBUG_RETURN(error);
}
2296 
2297 
binlog_reset_cache(THD * thd)2298 void binlog_reset_cache(THD *thd)
2299 {
2300   binlog_cache_mngr *const cache_mngr= opt_bin_log ?
2301     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton) : 0;
2302   DBUG_ENTER("binlog_reset_cache");
2303   if (cache_mngr)
2304   {
2305     thd->binlog_remove_pending_rows_event(TRUE, TRUE);
2306     cache_mngr->reset(true, true);
2307   }
2308   DBUG_VOID_RETURN;
2309 }
2310 
2311 
set_write_error(THD * thd,bool is_transactional)2312 void MYSQL_BIN_LOG::set_write_error(THD *thd, bool is_transactional)
2313 {
2314   DBUG_ENTER("MYSQL_BIN_LOG::set_write_error");
2315 
2316   write_error= 1;
2317 
2318   if (unlikely(check_write_error(thd)))
2319     DBUG_VOID_RETURN;
2320 
2321   if (my_errno == EFBIG)
2322   {
2323     if (is_transactional)
2324     {
2325       my_message(ER_TRANS_CACHE_FULL, ER_THD(thd, ER_TRANS_CACHE_FULL), MYF(0));
2326     }
2327     else
2328     {
2329       my_message(ER_STMT_CACHE_FULL, ER_THD(thd, ER_STMT_CACHE_FULL), MYF(0));
2330     }
2331   }
2332   else
2333   {
2334     my_error(ER_ERROR_ON_WRITE, MYF(0), name, errno);
2335   }
2336 #ifdef WITH_WSREP
2337   /* If wsrep transaction is active and binlog emulation is on,
2338      binlog write error may leave transaction without any registered
2339      htons. This makes wsrep rollback hooks to be skipped and the
2340      transaction will remain alive in wsrep world after rollback.
2341      Register binlog hton here to ensure that rollback happens in full. */
2342   if (WSREP_EMULATE_BINLOG(thd))
2343   {
2344     if (is_transactional)
2345       trans_register_ha(thd, TRUE, binlog_hton, 0);
2346     trans_register_ha(thd, FALSE, binlog_hton, 0);
2347   }
2348 #endif /* WITH_WSREP */
2349   DBUG_VOID_RETURN;
2350 }
2351 
check_write_error(THD * thd)2352 bool MYSQL_BIN_LOG::check_write_error(THD *thd)
2353 {
2354   DBUG_ENTER("MYSQL_BIN_LOG::check_write_error");
2355 
2356   bool checked= FALSE;
2357 
2358   if (likely(!thd->is_error()))
2359     DBUG_RETURN(checked);
2360 
2361   switch (thd->get_stmt_da()->sql_errno())
2362   {
2363     case ER_TRANS_CACHE_FULL:
2364     case ER_STMT_CACHE_FULL:
2365     case ER_ERROR_ON_WRITE:
2366     case ER_BINLOG_LOGGING_IMPOSSIBLE:
2367       checked= TRUE;
2368     break;
2369   }
2370 
2371   DBUG_RETURN(checked);
2372 }
2373 
2374 
2375 /**
2376   @note
2377   How do we handle this (unlikely but legal) case:
2378   @verbatim
2379     [transaction] + [update to non-trans table] + [rollback to savepoint] ?
2380   @endverbatim
2381   The problem occurs when a savepoint is before the update to the
2382   non-transactional table. Then when there's a rollback to the savepoint, if we
2383   simply truncate the binlog cache, we lose the part of the binlog cache where
2384   the update is. If we want to not lose it, we need to write the SAVEPOINT
2385   command and the ROLLBACK TO SAVEPOINT command to the binlog cache. The latter
2386   is easy: it's just write at the end of the binlog cache, but the former
2387   should be *inserted* to the place where the user called SAVEPOINT. The
2388   solution is that when the user calls SAVEPOINT, we write it to the binlog
2389   cache (so no need to later insert it). As transactions are never intermixed
2390   in the binary log (i.e. they are serialized), we won't have conflicts with
2391   savepoint names when using mysqlbinlog or in the slave SQL thread.
2392   Then when ROLLBACK TO SAVEPOINT is called, if we updated some
2393   non-transactional table, we don't truncate the binlog cache but instead write
2394   ROLLBACK TO SAVEPOINT to it; otherwise we truncate the binlog cache (which
2395   will chop the SAVEPOINT command from the binlog cache, which is good as in
2396   that case there is no need to have it in the binlog).
2397 */
2398 
binlog_savepoint_set(handlerton * hton,THD * thd,void * sv)2399 static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv)
2400 {
2401   int error= 1;
2402   DBUG_ENTER("binlog_savepoint_set");
2403 
2404   char buf[1024];
2405 
2406   String log_query(buf, sizeof(buf), &my_charset_bin);
2407   if (log_query.copy(STRING_WITH_LEN("SAVEPOINT "), &my_charset_bin) ||
2408       append_identifier(thd, &log_query, &thd->lex->ident))
2409     DBUG_RETURN(1);
2410   int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
2411   Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(),
2412                         TRUE, FALSE, TRUE, errcode);
2413   /*
2414     We cannot record the position before writing the statement
2415     because a rollback to a savepoint (.e.g. consider it "S") would
2416     prevent the savepoint statement (i.e. "SAVEPOINT S") from being
2417     written to the binary log despite the fact that the server could
2418     still issue other rollback statements to the same savepoint (i.e.
2419     "S").
2420     Given that the savepoint is valid until the server releases it,
2421     ie, until the transaction commits or it is released explicitly,
2422     we need to log it anyway so that we don't have "ROLLBACK TO S"
2423     or "RELEASE S" without the preceding "SAVEPOINT S" in the binary
2424     log.
2425   */
2426   if (likely(!(error= mysql_bin_log.write(&qinfo))))
2427     binlog_trans_log_savepos(thd, (my_off_t*) sv);
2428 
2429   DBUG_RETURN(error);
2430 }
2431 
binlog_savepoint_rollback(handlerton * hton,THD * thd,void * sv)2432 static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv)
2433 {
2434   DBUG_ENTER("binlog_savepoint_rollback");
2435 
2436   /*
2437     Write ROLLBACK TO SAVEPOINT to the binlog cache if we have updated some
2438     non-transactional table. Otherwise, truncate the binlog cache starting
2439     from the SAVEPOINT command.
2440   */
2441 #ifdef WITH_WSREP
2442   /* for streaming replication, we  must replicate savepoint rollback so that
2443      slaves can maintain SR transactions
2444    */
2445   if (unlikely(thd->wsrep_trx().is_streaming() ||
2446                (trans_has_updated_non_trans_table(thd)) ||
2447                (thd->variables.option_bits & OPTION_KEEP_LOG)))
2448 #else
2449   if (unlikely(trans_has_updated_non_trans_table(thd) ||
2450                (thd->variables.option_bits & OPTION_KEEP_LOG)))
2451 #endif /* WITH_WSREP */
2452   {
2453     char buf[1024];
2454     String log_query(buf, sizeof(buf), &my_charset_bin);
2455     if (log_query.copy(STRING_WITH_LEN("ROLLBACK TO "), &my_charset_bin) ||
2456         append_identifier(thd, &log_query, &thd->lex->ident))
2457       DBUG_RETURN(1);
2458     int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
2459     Query_log_event qinfo(thd, log_query.ptr(), log_query.length(),
2460                           TRUE, FALSE, TRUE, errcode);
2461     DBUG_RETURN(mysql_bin_log.write(&qinfo));
2462   }
2463 
2464   binlog_trans_log_truncate(thd, *(my_off_t*)sv);
2465 
2466   /*
2467     When a SAVEPOINT is executed inside a stored function/trigger we force the
2468     pending event to be flushed with a STMT_END_F flag and reset binlog
2469     as well to ensure that following DMLs will have a clean state to start
2470     with. ROLLBACK inside a stored routine has to finalize possibly existing
2471     current row-based pending event with cleaning up table maps. That ensures
2472     that following DMLs will have a clean state to start with.
2473    */
2474   if (thd->in_sub_stmt)
2475     thd->reset_binlog_for_next_statement();
2476 
2477   DBUG_RETURN(0);
2478 }
2479 
2480 
2481 /**
2482   Check whether binlog state allows to safely release MDL locks after
2483   rollback to savepoint.
2484 
2485   @param hton  The binlog handlerton.
2486   @param thd   The client thread that executes the transaction.
2487 
2488   @return true  - It is safe to release MDL locks.
2489           false - If it is not.
2490 */
binlog_savepoint_rollback_can_release_mdl(handlerton * hton,THD * thd)2491 static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
2492                                                       THD *thd)
2493 {
2494   DBUG_ENTER("binlog_savepoint_rollback_can_release_mdl");
2495   /*
2496     If we have not updated any non-transactional tables rollback
2497     to savepoint will simply truncate binlog cache starting from
2498     SAVEPOINT command. So it should be safe to release MDL acquired
2499     after SAVEPOINT command in this case.
2500   */
2501   DBUG_RETURN(!trans_cannot_safely_rollback(thd, true));
2502 }
2503 
2504 
check_binlog_magic(IO_CACHE * log,const char ** errmsg)2505 int check_binlog_magic(IO_CACHE* log, const char** errmsg)
2506 {
2507   uchar magic[4];
2508   DBUG_ASSERT(my_b_tell(log) == 0);
2509 
2510   if (my_b_read(log, magic, sizeof(magic)))
2511   {
2512     *errmsg = "I/O error reading the header from the binary log";
2513     sql_print_error("%s, errno=%d, io cache code=%d", *errmsg, my_errno,
2514 		    log->error);
2515     return 1;
2516   }
2517   if (bcmp(magic, BINLOG_MAGIC, sizeof(magic)))
2518   {
2519     *errmsg = "Binlog has bad magic number;  It's not a binary log file that can be used by this version of MySQL";
2520     return 1;
2521   }
2522   return 0;
2523 }
2524 
2525 
open_binlog(IO_CACHE * log,const char * log_file_name,const char ** errmsg)2526 File open_binlog(IO_CACHE *log, const char *log_file_name, const char **errmsg)
2527 {
2528   File file;
2529   DBUG_ENTER("open_binlog");
2530 
2531   if ((file= mysql_file_open(key_file_binlog,
2532                              log_file_name, O_RDONLY | O_BINARY | O_SHARE,
2533                              MYF(MY_WME))) < 0)
2534   {
2535     sql_print_error("Failed to open log (file '%s', errno %d)",
2536                     log_file_name, my_errno);
2537     *errmsg = "Could not open log file";
2538     goto err;
2539   }
2540   if (init_io_cache_ext(log, file, (size_t)binlog_file_cache_size, READ_CACHE,
2541             0, 0, MYF(MY_WME|MY_DONT_CHECK_FILESIZE), key_file_binlog_cache))
2542   {
2543     sql_print_error("Failed to create a cache on log (file '%s')",
2544                     log_file_name);
2545     *errmsg = "Could not open log file";
2546     goto err;
2547   }
2548   if (check_binlog_magic(log,errmsg))
2549     goto err;
2550   DBUG_RETURN(file);
2551 
2552 err:
2553   if (file >= 0)
2554   {
2555     mysql_file_close(file, MYF(0));
2556     end_io_cache(log);
2557   }
2558   DBUG_RETURN(-1);
2559 }
2560 
2561 #ifdef _WIN32
2562 static int eventSource = 0;
2563 
setup_windows_event_source()2564 static void setup_windows_event_source()
2565 {
2566   HKEY    hRegKey= NULL;
2567   DWORD   dwError= 0;
2568   TCHAR   szPath[MAX_PATH];
2569   DWORD dwTypes;
2570 
2571   if (eventSource)               // Ensure that we are only called once
2572     return;
2573   eventSource= 1;
2574 
2575   // Create the event source registry key
2576   dwError= RegCreateKey(HKEY_LOCAL_MACHINE,
2577                           "SYSTEM\\CurrentControlSet\\Services\\EventLog\\Application\\MariaDB",
2578                           &hRegKey);
2579 
2580   /* Name of the PE module that contains the message resource */
2581   GetModuleFileName(NULL, szPath, MAX_PATH);
2582 
2583   /* Register EventMessageFile */
2584   dwError = RegSetValueEx(hRegKey, "EventMessageFile", 0, REG_EXPAND_SZ,
2585                           (PBYTE) szPath, (DWORD) (strlen(szPath) + 1));
2586 
2587   /* Register supported event types */
2588   dwTypes= (EVENTLOG_ERROR_TYPE | EVENTLOG_WARNING_TYPE |
2589             EVENTLOG_INFORMATION_TYPE);
2590   dwError= RegSetValueEx(hRegKey, "TypesSupported", 0, REG_DWORD,
2591                          (LPBYTE) &dwTypes, sizeof dwTypes);
2592 
2593   RegCloseKey(hRegKey);
2594 }
2595 
2596 #endif /* _WIN32 */
2597 
2598 
2599 /**
2600   Find a unique filename for 'filename.#'.
2601 
2602   Set '#' to the number next to the maximum found in the most
2603   recent log file extension.
2604 
2605   This function will return nonzero if: (i) the generated name
2606   exceeds FN_REFLEN; (ii) if the number of extensions is exhausted;
2607   or (iii) some other error happened while examining the filesystem.
2608 
2609   @param name                   Base name of file
2610   @param min_log_number_to_use  minimum log number to choose. Set by
2611                                 CHANGE MASTER .. TO
2612   @param last_used_log_number   If 0, find log number based on files.
2613                                 If not 0, then use *last_used_log_number +1
2614                                 Will be update to new generated number
2615   @return
2616     0       ok
2617     nonzero if not possible to get unique filename.
2618 */
2619 
find_uniq_filename(char * name,ulong min_log_number_to_use,ulong * last_used_log_number)2620 static int find_uniq_filename(char *name, ulong min_log_number_to_use,
2621                               ulong *last_used_log_number)
2622 {
2623   uint                  i;
2624   char                  buff[FN_REFLEN], ext_buf[FN_REFLEN];
2625   struct st_my_dir     *dir_info;
2626   struct fileinfo *file_info;
2627   ulong                 max_found= 0, next= 0, number= 0;
2628   size_t		buf_length, length;
2629   char			*start, *end;
2630   int                   error= 0;
2631   DBUG_ENTER("find_uniq_filename");
2632 
2633   length= dirname_part(buff, name, &buf_length);
2634   start=  name + length;
2635   end=    strend(start);
2636 
2637   *end='.';
2638   length= (size_t) (end - start + 1);
2639 
2640   /* The following matches the code for my_dir () below */
2641   DBUG_EXECUTE_IF("error_unique_log_filename",
2642                   {
2643                     strmov(end,".1");
2644                     DBUG_RETURN(1);
2645                   });
2646 
2647   if (*last_used_log_number)
2648     max_found= *last_used_log_number;
2649   else
2650   {
2651     if (unlikely(!(dir_info= my_dir(buff, MYF(MY_DONT_SORT)))))
2652     {						// This shouldn't happen
2653       strmov(end,".1");				// use name+1
2654       DBUG_RETURN(1);
2655     }
2656     file_info= dir_info->dir_entry;
2657     max_found= min_log_number_to_use ? min_log_number_to_use-1 : 0;
2658     for (i= dir_info->number_of_files ; i-- ; file_info++)
2659     {
2660       if (strncmp(file_info->name, start, length) == 0 &&
2661           test_if_number(file_info->name+length, &number,0))
2662       {
2663         set_if_bigger(max_found, number);
2664       }
2665     }
2666     my_dirend(dir_info);
2667   }
2668 
2669   /* check if reached the maximum possible extension number */
2670   if (max_found >= MAX_LOG_UNIQUE_FN_EXT)
2671   {
2672     sql_print_error("Log filename extension number exhausted: %06lu. \
2673 Please fix this by archiving old logs and \
2674 updating the index files.", max_found);
2675     error= 1;
2676     goto end;
2677   }
2678 
2679   next= max_found + 1;
2680   if (sprintf(ext_buf, "%06lu", next)<0)
2681   {
2682     error= 1;
2683     goto end;
2684   }
2685   *end++='.';
2686 
2687   /*
2688     Check if the generated extension size + the file name exceeds the
2689     buffer size used. If one did not check this, then the filename might be
2690     truncated, resulting in error.
2691    */
2692   if (((strlen(ext_buf) + (end - name)) >= FN_REFLEN))
2693   {
2694     sql_print_error("Log filename too large: %s%s (%zu). \
2695 Please fix this by archiving old logs and updating the \
2696 index files.", name, ext_buf, (strlen(ext_buf) + (end - name)));
2697     error= 1;
2698     goto end;
2699   }
2700 
2701   if (sprintf(end, "%06lu", next)<0)
2702   {
2703     error= 1;
2704     goto end;
2705   }
2706   *last_used_log_number= next;
2707 
2708   /* print warning if reaching the end of available extensions. */
2709   if ((next > (MAX_LOG_UNIQUE_FN_EXT - LOG_WARN_UNIQUE_FN_EXT_LEFT)))
2710     sql_print_warning("Next log extension: %lu. \
2711 Remaining log filename extensions: %lu. \
2712 Please consider archiving some logs.", next, (MAX_LOG_UNIQUE_FN_EXT - next));
2713 
2714 end:
2715   DBUG_RETURN(error);
2716 }
2717 
2718 
init_and_set_log_file_name(const char * log_name,const char * new_name,ulong next_log_number,enum_log_type log_type_arg,enum cache_type io_cache_type_arg)2719 bool MYSQL_LOG::init_and_set_log_file_name(const char *log_name,
2720                                            const char *new_name,
2721                                            ulong next_log_number,
2722                                            enum_log_type log_type_arg,
2723                                            enum cache_type io_cache_type_arg)
2724 {
2725   log_type= log_type_arg;
2726   io_cache_type= io_cache_type_arg;
2727 
2728   if (new_name)
2729   {
2730     strmov(log_file_name, new_name);
2731   }
2732   else if (!new_name && generate_new_name(log_file_name, log_name,
2733                                           next_log_number))
2734     return TRUE;
2735 
2736   return FALSE;
2737 }
2738 
2739 
2740 /*
2741   Open a (new) log file.
2742 
2743   SYNOPSIS
2744     open()
2745 
2746     log_name            The name of the log to open
2747     log_type_arg        The type of the log. E.g. LOG_NORMAL
2748     new_name            The new name for the logfile. This is only needed
2749                         when the method is used to open the binlog file.
2750     io_cache_type_arg   The type of the IO_CACHE to use for this log file
2751 
2752   DESCRIPTION
2753     Open the logfile, init IO_CACHE and write startup messages
2754     (in case of general and slow query logs).
2755 
2756   RETURN VALUES
2757     0   ok
2758     1   error
2759 */
2760 
open(PSI_file_key log_file_key,const char * log_name,enum_log_type log_type_arg,const char * new_name,ulong next_log_number,enum cache_type io_cache_type_arg)2761 bool MYSQL_LOG::open(
2762 #ifdef HAVE_PSI_INTERFACE
2763                      PSI_file_key log_file_key,
2764 #endif
2765                      const char *log_name, enum_log_type log_type_arg,
2766                      const char *new_name, ulong next_log_number,
2767                      enum cache_type io_cache_type_arg)
2768 {
2769   char buff[FN_REFLEN];
2770   MY_STAT f_stat;
2771   File file= -1;
2772   my_off_t seek_offset;
2773   bool is_fifo = false;
2774   int open_flags= O_CREAT | O_BINARY | O_CLOEXEC;
2775   DBUG_ENTER("MYSQL_LOG::open");
2776   DBUG_PRINT("enter", ("log_type: %d", (int) log_type_arg));
2777 
2778   write_error= 0;
2779 
2780   if (!(name= my_strdup(key_memory_MYSQL_LOG_name, log_name, MYF(MY_WME))))
2781   {
2782     name= (char *)log_name; // for the error message
2783     goto err;
2784   }
2785 
2786   /*
2787     log_type is LOG_UNKNOWN if we should not generate a new name
2788     This is only used when called from MYSQL_BINARY_LOG::open, which
2789     has already updated log_file_name.
2790    */
2791   if (log_type_arg != LOG_UNKNOWN &&
2792       init_and_set_log_file_name(name, new_name, next_log_number,
2793                                  log_type_arg, io_cache_type_arg))
2794     goto err;
2795 
2796   is_fifo = my_stat(log_file_name, &f_stat, MYF(0)) &&
2797             MY_S_ISFIFO(f_stat.st_mode);
2798 
2799   if (io_cache_type == SEQ_READ_APPEND)
2800     open_flags |= O_RDWR | O_APPEND;
2801   else
2802     open_flags |= O_WRONLY | (log_type == LOG_BIN ? 0 : O_APPEND);
2803 
2804   if (is_fifo)
2805     open_flags |= O_NONBLOCK;
2806 
2807   db[0]= 0;
2808 
2809 #ifdef HAVE_PSI_INTERFACE
2810   /* Keep the key for reopen */
2811   m_log_file_key= log_file_key;
2812 #endif
2813 
2814   if ((file= mysql_file_open(log_file_key, log_file_name, open_flags,
2815                              MYF(MY_WME))) < 0)
2816     goto err;
2817 
2818   if (is_fifo)
2819     seek_offset= 0;
2820   else if ((seek_offset= mysql_file_tell(file, MYF(MY_WME))))
2821     goto err;
2822 
2823   if (init_io_cache(&log_file, file, (log_type == LOG_NORMAL ? IO_SIZE :
2824                                       LOG_BIN_IO_SIZE),
2825                     io_cache_type, seek_offset, 0,
2826                     MYF(MY_WME | MY_NABP |
2827                         ((log_type == LOG_BIN) ? MY_WAIT_IF_FULL : 0))))
2828     goto err;
2829 
2830   if (log_type == LOG_NORMAL)
2831   {
2832     char *end;
2833     size_t len=my_snprintf(buff, sizeof(buff), "%s, Version: %s (%s). "
2834 #ifdef EMBEDDED_LIBRARY
2835                         "embedded library\n",
2836                         my_progname, server_version, MYSQL_COMPILATION_COMMENT
2837 #elif defined(_WIN32)
2838 			"started with:\nTCP Port: %d, Named Pipe: %s\n",
2839                         my_progname, server_version, MYSQL_COMPILATION_COMMENT,
2840                         mysqld_port, mysqld_unix_port
2841 #else
2842 			"started with:\nTcp port: %d  Unix socket: %s\n",
2843                         my_progname, server_version, MYSQL_COMPILATION_COMMENT,
2844                         mysqld_port, mysqld_unix_port
2845 #endif
2846                        );
2847     end= strnmov(buff + len, "Time\t\t    Id Command\tArgument\n",
2848                  sizeof(buff) - len);
2849     if (my_b_write(&log_file, (uchar*) buff, (uint) (end-buff)) ||
2850 	flush_io_cache(&log_file))
2851       goto err;
2852   }
2853 
2854   log_state= LOG_OPENED;
2855   DBUG_RETURN(0);
2856 
2857 err:
2858   sql_print_error(fatal_log_error, name, errno);
2859   if (file >= 0)
2860     mysql_file_close(file, MYF(0));
2861   end_io_cache(&log_file);
2862   my_free(name);
2863   name= NULL;
2864   log_state= LOG_CLOSED;
2865   DBUG_RETURN(1);
2866 }
2867 
MYSQL_LOG()2868 MYSQL_LOG::MYSQL_LOG()
2869   : name(0), write_error(FALSE), inited(FALSE), log_type(LOG_UNKNOWN),
2870     log_state(LOG_CLOSED)
2871 {
2872   /*
2873     We don't want to initialize LOCK_Log here as such initialization depends on
2874     safe_mutex (when using safe_mutex) which depends on MY_INIT(), which is
2875     called only in main(). Doing initialization here would make it happen
2876     before main().
2877   */
2878   bzero((char*) &log_file, sizeof(log_file));
2879 }
2880 
init_pthread_objects()2881 void MYSQL_LOG::init_pthread_objects()
2882 {
2883   DBUG_ASSERT(inited == 0);
2884   inited= 1;
2885   mysql_mutex_init(key_LOG_LOCK_log, &LOCK_log, MY_MUTEX_INIT_SLOW);
2886 }
2887 
2888 /*
2889   Close the log file
2890 
2891   SYNOPSIS
2892     close()
2893     exiting     Bitmask. LOG_CLOSE_TO_BE_OPENED is used if we intend to call
2894                 open at once after close. LOG_CLOSE_DELAYED_CLOSE is used for
2895                 binlog rotation, to delay actual close of the old file until
2896                 we have successfully created the new file.
2897 
2898   NOTES
2899     One can do an open on the object at once after doing a close.
2900     The internal structures are not freed until cleanup() is called
2901 */
2902 
close(uint exiting)2903 void MYSQL_LOG::close(uint exiting)
2904 {					// One can't set log_type here!
2905   DBUG_ENTER("MYSQL_LOG::close");
2906   DBUG_PRINT("enter",("exiting: %d", (int) exiting));
2907   if (log_state == LOG_OPENED)
2908   {
2909     end_io_cache(&log_file);
2910 
2911     if (log_type == LOG_BIN && mysql_file_sync(log_file.file, MYF(MY_WME)) && ! write_error)
2912     {
2913       write_error= 1;
2914       sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), name, errno);
2915     }
2916 
2917     if (!(exiting & LOG_CLOSE_DELAYED_CLOSE) &&
2918         mysql_file_close(log_file.file, MYF(MY_WME)) && ! write_error)
2919     {
2920       write_error= 1;
2921       sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), name, errno);
2922     }
2923   }
2924 
2925   log_state= (exiting & LOG_CLOSE_TO_BE_OPENED) ? LOG_TO_BE_OPENED : LOG_CLOSED;
2926   my_free(name);
2927   name= NULL;
2928   DBUG_VOID_RETURN;
2929 }
2930 
2931 /** This is called only once. */
2932 
cleanup()2933 void MYSQL_LOG::cleanup()
2934 {
2935   DBUG_ENTER("cleanup");
2936   if (inited)
2937   {
2938     inited= 0;
2939     mysql_mutex_destroy(&LOCK_log);
2940     close(0);
2941   }
2942   DBUG_VOID_RETURN;
2943 }
2944 
2945 
generate_new_name(char * new_name,const char * log_name,ulong next_log_number)2946 int MYSQL_LOG::generate_new_name(char *new_name, const char *log_name,
2947                                  ulong next_log_number)
2948 {
2949   fn_format(new_name, log_name, mysql_data_home, "", 4);
2950   return 0;
2951 }
2952 
generate_new_name(char * new_name,const char * log_name,ulong next_log_number)2953 int MYSQL_BIN_LOG::generate_new_name(char *new_name, const char *log_name,
2954                                      ulong next_log_number)
2955 {
2956   fn_format(new_name, log_name, mysql_data_home, "", 4);
2957   if (!fn_ext(log_name)[0])
2958   {
2959     if (DBUG_EVALUATE_IF("binlog_inject_new_name_error", TRUE, FALSE) ||
2960         unlikely(find_uniq_filename(new_name, next_log_number,
2961                                     &last_used_log_number)))
2962     {
2963       THD *thd= current_thd;
2964       if (unlikely(thd))
2965         my_error(ER_NO_UNIQUE_LOGFILE, MYF(ME_FATAL), log_name);
2966       sql_print_error(ER_DEFAULT(ER_NO_UNIQUE_LOGFILE), log_name);
2967       return 1;
2968     }
2969   }
2970   return 0;
2971 }
2972 
2973 
2974 /*
2975   Reopen the log file
2976 
2977   SYNOPSIS
2978     reopen_file()
2979 
2980   DESCRIPTION
2981     Reopen the log file. The method is used during FLUSH LOGS
2982     and locks LOCK_log mutex
2983 */
2984 
2985 
reopen_file()2986 void MYSQL_QUERY_LOG::reopen_file()
2987 {
2988   char *save_name;
2989   DBUG_ENTER("MYSQL_LOG::reopen_file");
2990 
2991   mysql_mutex_lock(&LOCK_log);
2992   if (!is_open())
2993   {
2994     DBUG_PRINT("info",("log is closed"));
2995     mysql_mutex_unlock(&LOCK_log);
2996     DBUG_VOID_RETURN;
2997   }
2998 
2999   save_name= name;
3000   name= 0;				// Don't free name
3001   close(LOG_CLOSE_TO_BE_OPENED);
3002 
3003   /*
3004      Note that at this point, log_state != LOG_CLOSED (important for is_open()).
3005   */
3006 
3007   open(
3008 #ifdef HAVE_PSI_INTERFACE
3009        m_log_file_key,
3010 #endif
3011        save_name, log_type, 0, 0, io_cache_type);
3012   my_free(save_name);
3013 
3014   mysql_mutex_unlock(&LOCK_log);
3015 
3016   DBUG_VOID_RETURN;
3017 }
3018 
3019 
3020 /*
3021   Write a command to traditional general log file
3022 
3023   SYNOPSIS
3024     write()
3025 
3026     event_time        command start timestamp
3027     user_host         the pointer to the string with user@host info
3028     user_host_len     length of the user_host string. this is computed once
3029                       and passed to all general log  event handlers
3030     thread_id         Id of the thread, issued a query
3031     command_type      the type of the command being logged
3032     command_type_len  the length of the string above
3033     sql_text          the very text of the query being executed
3034     sql_text_len      the length of sql_text string
3035 
3036   DESCRIPTION
3037 
   Log the given command to the normal (non-rotatable) log file
3039 
3040   RETURN
    FALSE - OK
3042     TRUE - error occurred
3043 */
3044 
write(time_t event_time,const char * user_host,size_t user_host_len,my_thread_id thread_id_arg,const char * command_type,size_t command_type_len,const char * sql_text,size_t sql_text_len)3045 bool MYSQL_QUERY_LOG::write(time_t event_time, const char *user_host, size_t user_host_len, my_thread_id thread_id_arg,
3046                             const char *command_type, size_t command_type_len,
3047                             const char *sql_text, size_t sql_text_len)
3048 {
3049   char buff[32];
3050   char local_time_buff[MAX_TIME_SIZE];
3051   struct tm start;
3052   size_t time_buff_len= 0;
3053 
3054   mysql_mutex_lock(&LOCK_log);
3055 
3056   /* Test if someone closed between the is_open test and lock */
3057   if (is_open())
3058   {
3059     /* for testing output of timestamp and thread id */
3060     DBUG_EXECUTE_IF("reset_log_last_time", last_time= 0;);
3061 
3062     /* Note that my_b_write() assumes it knows the length for this */
3063     if (event_time != last_time)
3064     {
3065       last_time= event_time;
3066 
3067       localtime_r(&event_time, &start);
3068 
3069       time_buff_len= my_snprintf(local_time_buff, MAX_TIME_SIZE,
3070                                  "%02d%02d%02d %2d:%02d:%02d\t",
3071                                  start.tm_year % 100, start.tm_mon + 1,
3072                                  start.tm_mday, start.tm_hour,
3073                                  start.tm_min, start.tm_sec);
3074 
3075       if (my_b_write(&log_file, (uchar*) local_time_buff, time_buff_len))
3076         goto err;
3077     }
3078     else
3079       if (my_b_write(&log_file, (uchar*) "\t\t" ,2) < 0)
3080         goto err;
3081 
3082     /* command_type, thread_id */
3083     size_t length= my_snprintf(buff, 32, "%6llu ", thread_id_arg);
3084 
3085     if (my_b_write(&log_file, (uchar*) buff, length))
3086       goto err;
3087 
3088     if (my_b_write(&log_file, (uchar*) command_type, command_type_len))
3089       goto err;
3090 
3091     if (my_b_write(&log_file, (uchar*) "\t", 1))
3092       goto err;
3093 
3094     /* sql_text */
3095     if (my_b_write(&log_file, (uchar*) sql_text, sql_text_len))
3096       goto err;
3097 
3098     if (my_b_write(&log_file, (uchar*) "\n", 1) ||
3099         flush_io_cache(&log_file))
3100       goto err;
3101   }
3102 
3103   mysql_mutex_unlock(&LOCK_log);
3104   return FALSE;
3105 err:
3106 
3107   if (!write_error)
3108   {
3109     write_error= 1;
3110     sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), name, errno);
3111   }
3112   mysql_mutex_unlock(&LOCK_log);
3113   return TRUE;
3114 }
3115 
3116 
3117 /*
3118   Log a query to the traditional slow log file
3119 
3120   SYNOPSIS
3121     write()
3122 
3123     thd               THD of the query
3124     current_time      current timestamp
3125     user_host         the pointer to the string with user@host info
3126     user_host_len     length of the user_host string. this is computed once
3127                       and passed to all general log event handlers
3128     query_utime       Amount of time the query took to execute (in microseconds)
3129     lock_utime        Amount of time the query was locked (in microseconds)
3130     is_command        The flag, which determines, whether the sql_text is a
3131                       query or an administrator command.
3132     sql_text          the very text of the query or administrator command
3133                       processed
3134     sql_text_len      the length of sql_text string
3135 
3136   DESCRIPTION
3137 
3138    Log a query to the slow log file.
3139 
3140   RETURN
3141     FALSE - OK
3142     TRUE - error occurred
3143 */
3144 
write(THD * thd,time_t current_time,const char * user_host,size_t user_host_len,ulonglong query_utime,ulonglong lock_utime,bool is_command,const char * sql_text,size_t sql_text_len)3145 bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time,
3146                             const char *user_host, size_t user_host_len, ulonglong query_utime,
3147                             ulonglong lock_utime, bool is_command,
3148                             const char *sql_text, size_t sql_text_len)
3149 {
3150   bool error= 0;
3151   char llbuff[22];
3152   DBUG_ENTER("MYSQL_QUERY_LOG::write");
3153 
3154   mysql_mutex_lock(&LOCK_log);
3155   if (is_open())
3156   {						// Safety against reopen
3157     char buff[80], *end;
3158     char query_time_buff[22+7], lock_time_buff[22+7];
3159     size_t buff_len;
3160     end= buff;
3161 
3162     if (!(specialflag & SPECIAL_SHORT_LOG_FORMAT))
3163     {
3164       if (current_time != last_time)
3165       {
3166         last_time= current_time;
3167         struct tm start;
3168         localtime_r(&current_time, &start);
3169 
3170         buff_len= my_snprintf(buff, sizeof buff,
3171                               "# Time: %02d%02d%02d %2d:%02d:%02d\n",
3172                               start.tm_year % 100, start.tm_mon + 1,
3173                               start.tm_mday, start.tm_hour,
3174                               start.tm_min, start.tm_sec);
3175 
3176         /* Note that my_b_write() assumes it knows the length for this */
3177         if (my_b_write(&log_file, (uchar*) buff, buff_len))
3178           goto err;
3179       }
3180       const uchar uh[]= "# User@Host: ";
3181       if (my_b_write(&log_file, uh, sizeof(uh) - 1) ||
3182           my_b_write(&log_file, (uchar*) user_host, user_host_len) ||
3183           my_b_write(&log_file, (uchar*) "\n", 1))
3184         goto err;
3185 
3186     /* For slow query log */
3187     sprintf(query_time_buff, "%.6f", ulonglong2double(query_utime)/1000000.0);
3188     sprintf(lock_time_buff,  "%.6f", ulonglong2double(lock_utime)/1000000.0);
3189     if (my_b_printf(&log_file,
3190                     "# Thread_id: %lu  Schema: %s  QC_hit: %s\n"
3191                     "# Query_time: %s  Lock_time: %s  Rows_sent: %lu  Rows_examined: %lu\n"
3192                     "# Rows_affected: %lu  Bytes_sent: %lu\n",
3193                     (ulong) thd->thread_id, thd->get_db(),
3194                     ((thd->query_plan_flags & QPLAN_QC) ? "Yes" : "No"),
3195                     query_time_buff, lock_time_buff,
3196                     (ulong) thd->get_sent_row_count(),
3197                     (ulong) thd->get_examined_row_count(),
3198                     (ulong) thd->get_affected_rows(),
3199                     (ulong) (thd->status_var.bytes_sent - thd->bytes_sent_old)))
3200       goto err;
3201 
3202     if ((thd->variables.log_slow_verbosity & LOG_SLOW_VERBOSITY_QUERY_PLAN)
3203         && thd->tmp_tables_used &&
3204         my_b_printf(&log_file,
3205                     "# Tmp_tables: %lu  Tmp_disk_tables: %lu  "
3206                     "Tmp_table_sizes: %s\n",
3207                     (ulong) thd->tmp_tables_used,
3208                     (ulong) thd->tmp_tables_disk_used,
3209                     llstr(thd->tmp_tables_size, llbuff)))
3210       goto err;
3211 
3212     if (thd->spcont &&
3213         my_b_printf(&log_file, "# Stored_routine: %s\n",
3214                     ErrConvDQName(thd->spcont->m_sp).ptr()))
3215       goto err;
3216 
3217      if ((thd->variables.log_slow_verbosity & LOG_SLOW_VERBOSITY_QUERY_PLAN) &&
3218          (thd->query_plan_flags &
3219           (QPLAN_FULL_SCAN | QPLAN_FULL_JOIN | QPLAN_TMP_TABLE |
3220            QPLAN_TMP_DISK | QPLAN_FILESORT | QPLAN_FILESORT_DISK |
3221            QPLAN_FILESORT_PRIORITY_QUEUE)) &&
3222          my_b_printf(&log_file,
3223                      "# Full_scan: %s  Full_join: %s  "
3224                      "Tmp_table: %s  Tmp_table_on_disk: %s\n"
3225                      "# Filesort: %s  Filesort_on_disk: %s  Merge_passes: %lu  "
3226                      "Priority_queue: %s\n",
3227                      ((thd->query_plan_flags & QPLAN_FULL_SCAN) ? "Yes" : "No"),
3228                      ((thd->query_plan_flags & QPLAN_FULL_JOIN) ? "Yes" : "No"),
3229                      (thd->tmp_tables_used ? "Yes" : "No"),
3230                      (thd->tmp_tables_disk_used ? "Yes" : "No"),
3231                      ((thd->query_plan_flags & QPLAN_FILESORT) ? "Yes" : "No"),
3232                      ((thd->query_plan_flags & QPLAN_FILESORT_DISK) ?
3233                       "Yes" : "No"),
3234                      thd->query_plan_fsort_passes,
3235                      ((thd->query_plan_flags & QPLAN_FILESORT_PRIORITY_QUEUE) ?
3236                        "Yes" : "No")
3237                      ))
3238       goto err;
3239     if (thd->variables.log_slow_verbosity & LOG_SLOW_VERBOSITY_EXPLAIN &&
3240         thd->lex->explain)
3241     {
3242       StringBuffer<128> buf;
3243       DBUG_ASSERT(!thd->free_list);
3244       if (!print_explain_for_slow_log(thd->lex, thd, &buf))
3245         if (my_b_printf(&log_file, "%s", buf.c_ptr_safe()))
3246           goto err;
3247       thd->free_items();
3248     }
3249     if (thd->db.str && strcmp(thd->db.str, db))
3250     {						// Database changed
3251       if (my_b_printf(&log_file,"use %s;\n",thd->db.str))
3252         goto err;
3253       strmov(db,thd->db.str);
3254     }
3255     if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
3256     {
3257       end=strmov(end, ",last_insert_id=");
3258       end=longlong10_to_str((longlong)
3259                             thd->first_successful_insert_id_in_prev_stmt_for_binlog,
3260                             end, -10);
3261     }
3262     // Save value if we do an insert.
3263     if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
3264     {
3265       if (!(specialflag & SPECIAL_SHORT_LOG_FORMAT))
3266       {
3267         end=strmov(end,",insert_id=");
3268         end=longlong10_to_str((longlong)
3269                               thd->auto_inc_intervals_in_cur_stmt_for_binlog.minimum(),
3270                               end, -10);
3271       }
3272     }
3273 
3274     /*
3275       This info used to show up randomly, depending on whether the query
3276       checked the query start time or not. now we always write current
3277       timestamp to the slow log
3278     */
3279     end= strmov(end, ",timestamp=");
3280     end= int10_to_str((long) current_time, end, 10);
3281 
3282     if (end != buff)
3283     {
3284       *end++=';';
3285       *end='\n';
3286       if (my_b_write(&log_file, (uchar*) "SET ", 4) ||
3287           my_b_write(&log_file, (uchar*) buff + 1, (uint) (end-buff)))
3288         goto err;
3289     }
3290     if (is_command)
3291     {
3292       end= strxmov(buff, "# administrator command: ", NullS);
3293       buff_len= (ulong) (end - buff);
3294       DBUG_EXECUTE_IF("simulate_slow_log_write_error",
3295                       {DBUG_SET("+d,simulate_file_write_error");});
3296       if(my_b_write(&log_file, (uchar*) buff, buff_len))
3297         goto err;
3298     }
3299     if (my_b_write(&log_file, (uchar*) sql_text, sql_text_len) ||
3300         my_b_write(&log_file, (uchar*) ";\n",2) ||
3301         flush_io_cache(&log_file))
3302       goto err;
3303 
3304     }
3305   }
3306 end:
3307   mysql_mutex_unlock(&LOCK_log);
3308   DBUG_RETURN(error);
3309 
3310 err:
3311   error= 1;
3312   if (!write_error)
3313   {
3314     write_error= 1;
3315     sql_print_error(ER_THD(thd, ER_ERROR_ON_WRITE), name, errno);
3316   }
3317   goto end;
3318 }
3319 
3320 
3321 /**
3322   @todo
3323   The following should be using fn_format();  We just need to
3324   first change fn_format() to cut the file name if it's too long.
3325 */
generate_name(const char * log_name,const char * suffix,bool strip_ext,char * buff)3326 const char *MYSQL_LOG::generate_name(const char *log_name,
3327                                      const char *suffix,
3328                                      bool strip_ext, char *buff)
3329 {
3330   if (!log_name || !log_name[0])
3331   {
3332     strmake(buff, pidfile_name, FN_REFLEN - strlen(suffix) - 1);
3333     return (const char *)
3334       fn_format(buff, buff, "", suffix, MYF(MY_REPLACE_EXT|MY_REPLACE_DIR));
3335   }
3336   // get rid of extension if the log is binary to avoid problems
3337   if (strip_ext)
3338   {
3339     char *p= fn_ext(log_name);
3340     uint length= (uint) (p - log_name);
3341     strmake(buff, log_name, MY_MIN(length, FN_REFLEN-1));
3342     return (const char*)buff;
3343   }
3344   return log_name;
3345 }
3346 
3347 
3348 /*
3349   Print some additional information about addition/removal of
3350   XID list entries.
3351   TODO: Remove once MDEV-9510 is fixed.
3352 */
/*
  WSREP_XID_LIST_ENTRY(X, Y): when wsrep_debug is set, log the format
  string X with the binlog name and binlog id of list entry Y.

  Both variants expand to a single do { } while (0) statement, so the
  macro can be used safely as the body of an unbraced if/else.
*/
#ifdef WITH_WSREP
#define WSREP_XID_LIST_ENTRY(X, Y)                            \
  do                                                          \
  {                                                           \
    if (wsrep_debug)                                          \
    {                                                         \
      char buf[FN_REFLEN];                                    \
      strmake(buf, (Y)->binlog_name, (Y)->binlog_name_len);   \
      WSREP_DEBUG(X, buf, (Y)->binlog_id);                    \
    }                                                         \
  } while (0)
#else
#define WSREP_XID_LIST_ENTRY(X, Y) do { } while(0)
#endif
3364 
MYSQL_BIN_LOG(uint * sync_period)3365 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
3366   :reset_master_pending(0), mark_xid_done_waiting(0),
3367    bytes_written(0), last_used_log_number(0),
3368    file_id(1), open_count(1),
3369    group_commit_queue(0), group_commit_queue_busy(FALSE),
3370    num_commits(0), num_group_commits(0),
3371    group_commit_trigger_count(0), group_commit_trigger_timeout(0),
3372    group_commit_trigger_lock_wait(0),
3373    sync_period_ptr(sync_period), sync_counter(0),
3374    state_file_deleted(false), binlog_state_recover_done(false),
3375    is_relay_log(0), relay_signal_cnt(0),
3376    checksum_alg_reset(BINLOG_CHECKSUM_ALG_UNDEF),
3377    relay_log_checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
3378    description_event_for_exec(0), description_event_for_queue(0),
3379    current_binlog_id(0), reset_master_count(0)
3380 {
3381   /*
3382     We don't want to initialize locks here as such initialization depends on
3383     safe_mutex (when using safe_mutex) which depends on MY_INIT(), which is
3384     called only in main(). Doing initialization here would make it happen
3385     before main().
3386   */
3387   index_file_name[0] = 0;
3388   bzero((char*) &index_file, sizeof(index_file));
3389   bzero((char*) &purge_index_file, sizeof(purge_index_file));
3390 }
3391 
stop_background_thread()3392 void MYSQL_BIN_LOG::stop_background_thread()
3393 {
3394   if (binlog_background_thread_started)
3395   {
3396     mysql_mutex_lock(&LOCK_binlog_background_thread);
3397     binlog_background_thread_stop= true;
3398     mysql_cond_signal(&COND_binlog_background_thread);
3399     while (binlog_background_thread_stop)
3400       mysql_cond_wait(&COND_binlog_background_thread_end,
3401                       &LOCK_binlog_background_thread);
3402     mysql_mutex_unlock(&LOCK_binlog_background_thread);
3403     binlog_background_thread_started= false;
3404   }
3405 }
3406 
3407 /* this is called only once */
3408 
cleanup()3409 void MYSQL_BIN_LOG::cleanup()
3410 {
3411   DBUG_ENTER("cleanup");
3412   if (inited)
3413   {
3414     xid_count_per_binlog *b;
3415 
3416     /* Wait for the binlog background thread to stop. */
3417     if (!is_relay_log)
3418       stop_background_thread();
3419 
3420     inited= 0;
3421     mysql_mutex_lock(&LOCK_log);
3422     close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT);
3423     mysql_mutex_unlock(&LOCK_log);
3424     delete description_event_for_queue;
3425     delete description_event_for_exec;
3426 
3427     while ((b= binlog_xid_count_list.get()))
3428     {
3429       /*
3430         There should be no pending XIDs at shutdown, and only one entry (for
3431         the active binlog file) in the list.
3432       */
3433       DBUG_ASSERT(b->xid_count == 0);
3434       DBUG_ASSERT(!binlog_xid_count_list.head());
3435       WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::cleanup(): Removing xid_list_entry "
3436                            "for %s (%lu)", b);
3437       delete b;
3438     }
3439 
3440     mysql_mutex_destroy(&LOCK_log);
3441     mysql_mutex_destroy(&LOCK_index);
3442     mysql_mutex_destroy(&LOCK_xid_list);
3443     mysql_mutex_destroy(&LOCK_binlog_background_thread);
3444     mysql_mutex_destroy(&LOCK_binlog_end_pos);
3445     mysql_cond_destroy(&COND_relay_log_updated);
3446     mysql_cond_destroy(&COND_bin_log_updated);
3447     mysql_cond_destroy(&COND_queue_busy);
3448     mysql_cond_destroy(&COND_xid_list);
3449     mysql_cond_destroy(&COND_binlog_background_thread);
3450     mysql_cond_destroy(&COND_binlog_background_thread_end);
3451   }
3452 
3453   /*
3454     Free data for global binlog state.
3455     We can't do that automatically as we need to do this before
3456     safemalloc is shut down
3457   */
3458   if (!is_relay_log)
3459     rpl_global_gtid_binlog_state.free();
3460   DBUG_VOID_RETURN;
3461 }
3462 
3463 
3464 /* Init binlog-specific vars */
init(ulong max_size_arg)3465 void MYSQL_BIN_LOG::init(ulong max_size_arg)
3466 {
3467   DBUG_ENTER("MYSQL_BIN_LOG::init");
3468   max_size= max_size_arg;
3469   DBUG_PRINT("info",("max_size: %lu", max_size));
3470   DBUG_VOID_RETURN;
3471 }
3472 
3473 
init_pthread_objects()3474 void MYSQL_BIN_LOG::init_pthread_objects()
3475 {
3476   MYSQL_LOG::init_pthread_objects();
3477   mysql_mutex_init(m_key_LOCK_index, &LOCK_index, MY_MUTEX_INIT_SLOW);
3478   mysql_mutex_setflags(&LOCK_index, MYF_NO_DEADLOCK_DETECTION);
3479   mysql_mutex_init(key_BINLOG_LOCK_xid_list,
3480                    &LOCK_xid_list, MY_MUTEX_INIT_FAST);
3481   mysql_cond_init(m_key_relay_log_update, &COND_relay_log_updated, 0);
3482   mysql_cond_init(m_key_bin_log_update, &COND_bin_log_updated, 0);
3483   mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
3484   mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0);
3485 
3486   mysql_mutex_init(key_BINLOG_LOCK_binlog_background_thread,
3487                    &LOCK_binlog_background_thread, MY_MUTEX_INIT_FAST);
3488   mysql_cond_init(key_BINLOG_COND_binlog_background_thread,
3489                   &COND_binlog_background_thread, 0);
3490   mysql_cond_init(key_BINLOG_COND_binlog_background_thread_end,
3491                   &COND_binlog_background_thread_end, 0);
3492 
3493   mysql_mutex_init(m_key_LOCK_binlog_end_pos, &LOCK_binlog_end_pos,
3494                    MY_MUTEX_INIT_SLOW);
3495 }
3496 
3497 
open_index_file(const char * index_file_name_arg,const char * log_name,bool need_mutex)3498 bool MYSQL_BIN_LOG::open_index_file(const char *index_file_name_arg,
3499                                     const char *log_name, bool need_mutex)
3500 {
3501   File index_file_nr= -1;
3502   DBUG_ASSERT(!my_b_inited(&index_file));
3503 
3504   /*
3505     First open of this class instance
3506     Create an index file that will hold all file names uses for logging.
3507     Add new entries to the end of it.
3508   */
3509   myf opt= MY_UNPACK_FILENAME;
3510   if (!index_file_name_arg)
3511   {
3512     index_file_name_arg= log_name;    // Use same basename for index file
3513     opt= MY_UNPACK_FILENAME | MY_REPLACE_EXT;
3514   }
3515   fn_format(index_file_name, index_file_name_arg, mysql_data_home,
3516             ".index", opt);
3517   if ((index_file_nr= mysql_file_open(m_key_file_log_index,
3518                                       index_file_name,
3519                                       O_RDWR | O_CREAT | O_BINARY | O_CLOEXEC,
3520                                       MYF(MY_WME))) < 0 ||
3521        mysql_file_sync(index_file_nr, MYF(MY_WME)) ||
3522        init_io_cache_ext(&index_file, index_file_nr,
3523                      IO_SIZE, WRITE_CACHE,
3524                      mysql_file_seek(index_file_nr, 0L, MY_SEEK_END, MYF(0)),
3525                                      0, MYF(MY_WME | MY_WAIT_IF_FULL),
3526                                      m_key_file_log_index_cache) ||
3527       DBUG_EVALUATE_IF("fault_injection_openning_index", 1, 0))
3528   {
3529     /*
3530       TODO: all operations creating/deleting the index file or a log, should
3531       call my_sync_dir() or my_sync_dir_by_file() to be durable.
3532       TODO: file creation should be done with mysql_file_create()
3533       not mysql_file_open().
3534     */
3535     if (index_file_nr >= 0)
3536       mysql_file_close(index_file_nr, MYF(0));
3537     return TRUE;
3538   }
3539 
3540 #ifdef HAVE_REPLICATION
3541   /*
3542     Sync the index by purging any binary log file that is not registered.
3543     In other words, either purge binary log files that were removed from
3544     the index but not purged from the file system due to a crash or purge
3545     any binary log file that was created but not register in the index
3546     due to a crash.
3547   */
3548 
3549   if (set_purge_index_file_name(index_file_name_arg) ||
3550       open_purge_index_file(FALSE) ||
3551       purge_index_entry(NULL, NULL, need_mutex) ||
3552       close_purge_index_file() ||
3553       DBUG_EVALUATE_IF("fault_injection_recovering_index", 1, 0))
3554   {
3555     sql_print_error("MYSQL_BIN_LOG::open_index_file failed to sync the index "
3556                     "file.");
3557     return TRUE;
3558   }
3559 #endif
3560 
3561   return FALSE;
3562 }
3563 
3564 
3565 /**
3566   Open a (new) binlog file.
3567 
3568   - Open the log file and the index file. Register the new
3569   file name in it
3570   - When calling this when the file is in use, you must have a locks
3571   on LOCK_log and LOCK_index.
3572 
3573   @retval
3574     0	ok
3575   @retval
3576     1	error
3577 */
3578 
open(const char * log_name,const char * new_name,ulong next_log_number,enum cache_type io_cache_type_arg,ulong max_size_arg,bool null_created_arg,bool need_mutex)3579 bool MYSQL_BIN_LOG::open(const char *log_name,
3580                          const char *new_name,
3581                          ulong next_log_number,
3582                          enum cache_type io_cache_type_arg,
3583                          ulong max_size_arg,
3584                          bool null_created_arg,
3585                          bool need_mutex)
3586 {
3587   File file= -1;
3588   xid_count_per_binlog *new_xid_list_entry= NULL, *b;
3589   DBUG_ENTER("MYSQL_BIN_LOG::open");
3590 
3591   mysql_mutex_assert_owner(&LOCK_log);
3592 
3593   if (!is_relay_log)
3594   {
3595     if (!binlog_state_recover_done)
3596     {
3597       binlog_state_recover_done= true;
3598       if (do_binlog_recovery(opt_bin_logname, false))
3599         DBUG_RETURN(1);
3600     }
3601 
3602     if (!binlog_background_thread_started &&
3603         start_binlog_background_thread())
3604       DBUG_RETURN(1);
3605   }
3606 
3607   /* We need to calculate new log file name for purge to delete old */
3608   if (init_and_set_log_file_name(log_name, new_name, next_log_number,
3609                                  LOG_BIN, io_cache_type_arg))
3610   {
3611     sql_print_error("MYSQL_BIN_LOG::open failed to generate new file name.");
3612     if (!is_relay_log)
3613       goto err;
3614     DBUG_RETURN(1);
3615   }
3616 
3617 #ifdef HAVE_REPLICATION
3618   if (open_purge_index_file(TRUE) ||
3619       register_create_index_entry(log_file_name) ||
3620       sync_purge_index_file() ||
3621       DBUG_EVALUATE_IF("fault_injection_registering_index", 1, 0))
3622   {
3623     /**
3624         TODO:
3625         Although this was introduced to appease valgrind when
3626         injecting emulated faults using
3627         fault_injection_registering_index it may be good to consider
3628         what actually happens when open_purge_index_file succeeds but
3629         register or sync fails.
3630 
3631         Perhaps we might need the code below in MYSQL_LOG_BIN::cleanup
3632         for "real life" purposes as well?
3633      */
3634     DBUG_EXECUTE_IF("fault_injection_registering_index", {
3635       if (my_b_inited(&purge_index_file))
3636       {
3637         end_io_cache(&purge_index_file);
3638         my_close(purge_index_file.file, MYF(0));
3639       }
3640     });
3641 
3642     sql_print_error("MYSQL_BIN_LOG::open failed to sync the index file.");
3643     DBUG_RETURN(1);
3644   }
3645   DBUG_EXECUTE_IF("crash_create_non_critical_before_update_index", DBUG_SUICIDE(););
3646 #endif
3647 
3648   write_error= 0;
3649 
3650   /* open the main log file */
3651   if (MYSQL_LOG::open(
3652 #ifdef HAVE_PSI_INTERFACE
3653                       m_key_file_log,
3654 #endif
3655                       log_name,
3656                       LOG_UNKNOWN, /* Don't generate new name */
3657                       0, 0, io_cache_type_arg))
3658   {
3659 #ifdef HAVE_REPLICATION
3660     close_purge_index_file();
3661 #endif
3662     DBUG_RETURN(1);                            /* all warnings issued */
3663   }
3664 
3665   init(max_size_arg);
3666 
3667   open_count++;
3668 
3669   DBUG_ASSERT(log_type == LOG_BIN);
3670 
3671   {
3672     bool write_file_name_to_index_file=0;
3673 
3674     if (!my_b_filelength(&log_file))
3675     {
3676       /*
3677 	The binary log file was empty (probably newly created)
3678 	This is the normal case and happens when the user doesn't specify
3679 	an extension for the binary log files.
3680 	In this case we write a standard header to it.
3681       */
3682       if (my_b_safe_write(&log_file, BINLOG_MAGIC,
3683 			  BIN_LOG_HEADER_SIZE))
3684         goto err;
3685       bytes_written+= BIN_LOG_HEADER_SIZE;
3686       write_file_name_to_index_file= 1;
3687     }
3688 
3689     {
3690       /*
3691         In 4.x we put Start event only in the first binlog. But from 5.0 we
3692         want a Start event even if this is not the very first binlog.
3693       */
3694       Format_description_log_event s(BINLOG_VERSION);
3695       /*
3696         don't set LOG_EVENT_BINLOG_IN_USE_F for SEQ_READ_APPEND io_cache
3697         as we won't be able to reset it later
3698       */
3699       if (io_cache_type == WRITE_CACHE)
3700         s.flags |= LOG_EVENT_BINLOG_IN_USE_F;
3701 
3702       if (is_relay_log)
3703       {
3704         if (relay_log_checksum_alg == BINLOG_CHECKSUM_ALG_UNDEF)
3705           relay_log_checksum_alg=
3706             opt_slave_sql_verify_checksum ? (enum_binlog_checksum_alg) binlog_checksum_options
3707                                           : BINLOG_CHECKSUM_ALG_OFF;
3708         s.checksum_alg= relay_log_checksum_alg;
3709         s.set_relay_log_event();
3710       }
3711       else
3712         s.checksum_alg= (enum_binlog_checksum_alg)binlog_checksum_options;
3713 
3714       crypto.scheme = 0;
3715       DBUG_ASSERT(s.checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
3716       if (!s.is_valid())
3717         goto err;
3718       s.dont_set_created= null_created_arg;
3719       if (write_event(&s))
3720         goto err;
3721       bytes_written+= s.data_written;
3722 
3723       if (encrypt_binlog)
3724       {
3725         uint key_version= encryption_key_get_latest_version(ENCRYPTION_KEY_SYSTEM_DATA);
3726         if (key_version == ENCRYPTION_KEY_VERSION_INVALID)
3727         {
3728           sql_print_error("Failed to enable encryption of binary logs");
3729           goto err;
3730         }
3731 
3732         if (key_version != ENCRYPTION_KEY_NOT_ENCRYPTED)
3733         {
3734           if (my_random_bytes(crypto.nonce, sizeof(crypto.nonce)))
3735             goto err;
3736 
3737           Start_encryption_log_event sele(1, key_version, crypto.nonce);
3738           sele.checksum_alg= s.checksum_alg;
3739           if (write_event(&sele))
3740             goto err;
3741 
3742           // Start_encryption_log_event is written, enable the encryption
3743           if (crypto.init(sele.crypto_scheme, key_version))
3744             goto err;
3745         }
3746       }
3747 
3748       if (!is_relay_log)
3749       {
3750         char buf[FN_REFLEN];
3751 
3752         /*
3753           Output a Gtid_list_log_event at the start of the binlog file.
3754 
3755           This is used to quickly determine which GTIDs are found in binlog
3756           files earlier than this one, and which are found in this (or later)
3757           binlogs.
3758 
3759           The list gives a mapping from (domain_id, server_id) -> seq_no (so
3760           this means that there is at most one entry for every unique pair
3761           (domain_id, server_id) in the list). It indicates that this seq_no is
3762           the last one found in an earlier binlog file for this (domain_id,
3763           server_id) combination - so any higher seq_no should be search for
3764           from this binlog file, or a later one.
3765 
3766           This allows to locate the binlog file containing a given GTID by
3767           scanning backwards, reading just the Gtid_list_log_event at the
3768           start of each file, and scanning only the relevant binlog file when
3769           found, not all binlog files.
3770 
3771           The existence of a given entry (domain_id, server_id, seq_no)
3772           guarantees only that this seq_no will not be found in this or any
3773           later binlog file. It does not guarantee that it can be found it an
3774           earlier binlog file, for example the file may have been purged.
3775 
3776           If there is no entry for a given (domain_id, server_id) pair, then
3777           it means that no such GTID exists in any earlier binlog. It is
3778           permissible to remove such pair from future Gtid_list_log_events
3779           if all previous binlog files containing such GTIDs have been purged
3780           (though such optimization is not performed at the time of this
3781           writing). So if there is no entry for given GTID it means that such
3782           GTID should be search for in this or later binlog file, same as if
3783           there had been an entry (domain_id, server_id, 0).
3784         */
3785 
3786         Gtid_list_log_event gl_ev(&rpl_global_gtid_binlog_state, 0);
3787         if (write_event(&gl_ev))
3788           goto err;
3789 
3790         /* Output a binlog checkpoint event at the start of the binlog file. */
3791 
3792         /*
3793           Construct an entry in the binlog_xid_count_list for the new binlog
3794           file (we will not link it into the list until we know the new file
3795           is successfully created; otherwise we would have to remove it again
3796           if creation failed, which gets tricky since other threads may have
3797           seen the entry in the meantime - and we do not want to hold
3798           LOCK_xid_list for long periods of time).
3799 
3800           Write the current binlog checkpoint into the log, so XA recovery will
3801           know from where to start recovery.
3802         */
3803         size_t off= dirname_length(log_file_name);
3804         uint len= static_cast<uint>(strlen(log_file_name) - off);
3805         new_xid_list_entry= new xid_count_per_binlog(log_file_name+off, len);
3806         if (!new_xid_list_entry)
3807           goto err;
3808 
3809         /*
3810           Find the name for the Initial binlog checkpoint.
3811 
3812           Normally this will just be the first entry, as we delete entries
3813           when their count drops to zero. But we scan the list to handle any
3814           corner case, eg. for the first binlog file opened after startup, the
3815           list will be empty.
3816         */
3817         mysql_mutex_lock(&LOCK_xid_list);
3818         I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
3819         while ((b= it++) && b->xid_count == 0)
3820           ;
3821         mysql_mutex_unlock(&LOCK_xid_list);
3822         if (!b)
3823           b= new_xid_list_entry;
3824         if (b->binlog_name)
3825           strmake(buf, b->binlog_name, b->binlog_name_len);
3826         else
3827           goto err;
3828         Binlog_checkpoint_log_event ev(buf, len);
3829         DBUG_EXECUTE_IF("crash_before_write_checkpoint_event",
3830                         flush_io_cache(&log_file);
3831                         mysql_file_sync(log_file.file, MYF(MY_WME));
3832                         DBUG_SUICIDE(););
3833         if (write_event(&ev))
3834           goto err;
3835         bytes_written+= ev.data_written;
3836       }
3837     }
3838     if (description_event_for_queue &&
3839         description_event_for_queue->binlog_version>=4)
3840     {
3841       /*
3842         This is a relay log written to by the I/O slave thread.
3843         Write the event so that others can later know the format of this relay
3844         log.
3845         Note that this event is very close to the original event from the
3846         master (it has binlog version of the master, event types of the
3847         master), so this is suitable to parse the next relay log's event. It
3848         has been produced by
3849         Format_description_log_event::Format_description_log_event(char* buf,).
3850         Why don't we want to write the description_event_for_queue if this
3851         event is for format<4 (3.23 or 4.x): this is because in that case, the
3852         description_event_for_queue describes the data received from the
3853         master, but not the data written to the relay log (*conversion*),
3854         which is in format 4 (slave's).
3855       */
3856       /*
3857         Set 'created' to 0, so that in next relay logs this event does not
3858         trigger cleaning actions on the slave in
3859         Format_description_log_event::apply_event_impl().
3860       */
3861       description_event_for_queue->created= 0;
3862       /* Don't set log_pos in event header */
3863       description_event_for_queue->set_artificial_event();
3864 
3865       if (write_event(description_event_for_queue))
3866         goto err;
3867       bytes_written+= description_event_for_queue->data_written;
3868     }
3869     if (flush_io_cache(&log_file) ||
3870         mysql_file_sync(log_file.file, MYF(MY_WME|MY_SYNC_FILESIZE)))
3871       goto err;
3872 
3873     my_off_t offset= my_b_tell(&log_file);
3874 
3875     if (!is_relay_log)
3876     {
3877       /* update binlog_end_pos so that it can be read by after sync hook */
3878       reset_binlog_end_pos(log_file_name, offset);
3879 
3880       mysql_mutex_lock(&LOCK_commit_ordered);
3881       strmake_buf(last_commit_pos_file, log_file_name);
3882       last_commit_pos_offset= offset;
3883       mysql_mutex_unlock(&LOCK_commit_ordered);
3884     }
3885 
3886     if (write_file_name_to_index_file)
3887     {
3888 #ifdef HAVE_REPLICATION
3889 #ifdef ENABLED_DEBUG_SYNC
3890       if (current_thd)
3891         DEBUG_SYNC(current_thd, "binlog_open_before_update_index");
3892 #endif
3893       DBUG_EXECUTE_IF("crash_create_critical_before_update_index", DBUG_SUICIDE(););
3894 #endif
3895 
3896       DBUG_ASSERT(my_b_inited(&index_file) != 0);
3897       reinit_io_cache(&index_file, WRITE_CACHE,
3898                       my_b_filelength(&index_file), 0, 0);
3899       /*
3900         As this is a new log file, we write the file name to the index
3901         file. As every time we write to the index file, we sync it.
3902       */
3903       if (DBUG_EVALUATE_IF("fault_injection_updating_index", 1, 0) ||
3904           my_b_write(&index_file, (uchar*) log_file_name,
3905                      strlen(log_file_name)) ||
3906           my_b_write(&index_file, (uchar*) "\n", 1) ||
3907           flush_io_cache(&index_file) ||
3908           mysql_file_sync(index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE)))
3909         goto err;
3910 
3911 #ifdef HAVE_REPLICATION
3912       DBUG_EXECUTE_IF("crash_create_after_update_index", DBUG_SUICIDE(););
3913 #endif
3914     }
3915   }
3916 
3917   if (!is_relay_log)
3918   {
3919     /*
3920       Now the file was created successfully, so we can link in the entry for
3921       the new binlog file in binlog_xid_count_list.
3922     */
3923     mysql_mutex_lock(&LOCK_xid_list);
3924     ++current_binlog_id;
3925     new_xid_list_entry->binlog_id= current_binlog_id;
3926     /* Remove any initial entries with no pending XIDs.  */
3927     while ((b= binlog_xid_count_list.head()) && b->xid_count == 0)
3928     {
3929       WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::open(): Removing xid_list_entry for "
3930                            "%s (%lu)", b);
3931       delete binlog_xid_count_list.get();
3932     }
3933     mysql_cond_broadcast(&COND_xid_list);
3934     WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::open(): Adding new xid_list_entry for "
3935                          "%s (%lu)", new_xid_list_entry);
3936     binlog_xid_count_list.push_back(new_xid_list_entry);
3937     mysql_mutex_unlock(&LOCK_xid_list);
3938 
3939     /*
3940       Now that we have synced a new binlog file with an initial Gtid_list
3941       event, it is safe to delete the binlog state file. We will write out
3942       a new, updated file at shutdown, and if we crash before we can recover
3943       the state from the newly written binlog file.
3944 
3945       Since the state file will contain out-of-date data as soon as the first
3946       new GTID is binlogged, it is better to remove it, to avoid any risk of
3947       accidentally reading incorrect data later.
3948     */
3949     if (!state_file_deleted)
3950     {
3951       char buf[FN_REFLEN];
3952       fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
3953                 MY_UNPACK_FILENAME);
3954       my_delete(buf, MY_SYNC_DIR);
3955       state_file_deleted= true;
3956     }
3957   }
3958 
3959   log_state= LOG_OPENED;
3960 
3961 #ifdef HAVE_REPLICATION
3962   close_purge_index_file();
3963 #endif
3964 
3965   /* Notify the io thread that binlog is rotated to a new file */
3966   if (is_relay_log)
3967     signal_relay_log_update();
3968   else
3969     update_binlog_end_pos();
3970   DBUG_RETURN(0);
3971 
3972 err:
3973   int tmp_errno= errno;
3974 #ifdef HAVE_REPLICATION
3975   if (is_inited_purge_index_file())
3976     purge_index_entry(NULL, NULL, need_mutex);
3977   close_purge_index_file();
3978 #endif
3979   sql_print_error(fatal_log_error, (name) ? name : log_name, tmp_errno);
3980   if (new_xid_list_entry)
3981     delete new_xid_list_entry;
3982   if (file >= 0)
3983     mysql_file_close(file, MYF(0));
3984   close(LOG_CLOSE_INDEX);
3985   DBUG_RETURN(1);
3986 }
3987 
3988 
get_current_log(LOG_INFO * linfo)3989 int MYSQL_BIN_LOG::get_current_log(LOG_INFO* linfo)
3990 {
3991   mysql_mutex_lock(&LOCK_log);
3992   int ret = raw_get_current_log(linfo);
3993   mysql_mutex_unlock(&LOCK_log);
3994   return ret;
3995 }
3996 
raw_get_current_log(LOG_INFO * linfo)3997 int MYSQL_BIN_LOG::raw_get_current_log(LOG_INFO* linfo)
3998 {
3999   mysql_mutex_assert_owner(&LOCK_log);
4000   strmake_buf(linfo->log_file_name, log_file_name);
4001   linfo->pos = my_b_tell(&log_file);
4002   return 0;
4003 }
4004 
/**
  Move all data in a file-name index file up towards the start of the file.

    We do the copy outside of the IO_CACHE as the cache buffers would just
    make things slower and more complicated.
    In most cases the copy loop should only do one read.

  @param index_file			File to move
  @param offset			Move everything from here to beginning

  @note
    File will be truncated to be 'offset' shorter or filled up with newlines

  @retval
    0	ok
  @retval
    1	error
*/
4021 
4022 #ifdef HAVE_REPLICATION
4023 
copy_up_file_and_fill(IO_CACHE * index_file,my_off_t offset)4024 static bool copy_up_file_and_fill(IO_CACHE *index_file, my_off_t offset)
4025 {
4026   int bytes_read;
4027   my_off_t init_offset= offset;
4028   File file= index_file->file;
4029   uchar io_buf[IO_SIZE*2];
4030   DBUG_ENTER("copy_up_file_and_fill");
4031 
4032   for (;; offset+= bytes_read)
4033   {
4034     mysql_file_seek(file, offset, MY_SEEK_SET, MYF(0));
4035     if ((bytes_read= (int) mysql_file_read(file, io_buf, sizeof(io_buf),
4036                                            MYF(MY_WME)))
4037 	< 0)
4038       goto err;
4039     if (!bytes_read)
4040       break;					// end of file
4041     mysql_file_seek(file, offset-init_offset, MY_SEEK_SET, MYF(0));
4042     if (mysql_file_write(file, io_buf, bytes_read,
4043                          MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
4044       goto err;
4045   }
4046   /* The following will either truncate the file or fill the end with \n' */
4047   if (mysql_file_chsize(file, offset - init_offset, '\n', MYF(MY_WME)) ||
4048       mysql_file_sync(file, MYF(MY_WME|MY_SYNC_FILESIZE)))
4049     goto err;
4050 
4051   /* Reset data in old index cache */
4052   reinit_io_cache(index_file, READ_CACHE, (my_off_t) 0, 0, 1);
4053   DBUG_RETURN(0);
4054 
4055 err:
4056   DBUG_RETURN(1);
4057 }
4058 
4059 #endif /* HAVE_REPLICATION */
4060 
4061 /**
4062   Find the position in the log-index-file for the given log name.
4063 
4064   @param linfo		Store here the found log file name and position to
4065                        the NEXT log file name in the index file.
4066   @param log_name	Filename to find in the index file.
4067                        Is a null pointer if we want to read the first entry
4068   @param need_lock	Set this to 1 if the parent doesn't already have a
4069                        lock on LOCK_index
4070 
4071   @note
4072     On systems without the truncate function the file will end with one or
4073     more empty lines.  These will be ignored when reading the file.
4074 
4075   @retval
4076     0			ok
4077   @retval
4078     LOG_INFO_EOF	        End of log-index-file found
4079   @retval
4080     LOG_INFO_IO		Got IO error while reading file
4081 */
4082 
find_log_pos(LOG_INFO * linfo,const char * log_name,bool need_lock)4083 int MYSQL_BIN_LOG::find_log_pos(LOG_INFO *linfo, const char *log_name,
4084 			    bool need_lock)
4085 {
4086   int error= 0;
4087   char *full_fname= linfo->log_file_name;
4088   char full_log_name[FN_REFLEN], fname[FN_REFLEN];
4089   uint log_name_len= 0, fname_len= 0;
4090   DBUG_ENTER("find_log_pos");
4091   full_log_name[0]= full_fname[0]= 0;
4092 
4093   /*
4094     Mutex needed because we need to make sure the file pointer does not
4095     move from under our feet
4096   */
4097   if (need_lock)
4098     mysql_mutex_lock(&LOCK_index);
4099   mysql_mutex_assert_owner(&LOCK_index);
4100 
4101   // extend relative paths for log_name to be searched
4102   if (log_name)
4103   {
4104     if(normalize_binlog_name(full_log_name, log_name, is_relay_log))
4105     {
4106       error= LOG_INFO_EOF;
4107       goto end;
4108     }
4109   }
4110 
4111   log_name_len= log_name ? (uint) strlen(full_log_name) : 0;
4112   DBUG_PRINT("enter", ("log_name: %s, full_log_name: %s",
4113                        log_name ? log_name : "NULL", full_log_name));
4114 
4115   /* As the file is flushed, we can't get an error here */
4116   (void) reinit_io_cache(&index_file, READ_CACHE, (my_off_t) 0, 0, 0);
4117 
4118   for (;;)
4119   {
4120     size_t length;
4121     my_off_t offset= my_b_tell(&index_file);
4122 
4123     DBUG_EXECUTE_IF("simulate_find_log_pos_error",
4124                     error=  LOG_INFO_EOF; break;);
4125     /* If we get 0 or 1 characters, this is the end of the file */
4126     if ((length= my_b_gets(&index_file, fname, FN_REFLEN)) <= 1)
4127     {
4128       /* Did not find the given entry; Return not found or error */
4129       error= !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
4130       break;
4131     }
4132     if (fname[length-1] != '\n')
4133       continue;                                 // Not a log entry
4134     fname[length-1]= 0;                         // Remove end \n
4135 
4136     // extend relative paths and match against full path
4137     if (normalize_binlog_name(full_fname, fname, is_relay_log))
4138     {
4139       error= LOG_INFO_EOF;
4140       break;
4141     }
4142     fname_len= (uint) strlen(full_fname);
4143 
4144     // if the log entry matches, null string matching anything
4145     if (!log_name ||
4146         (log_name_len == fname_len &&
4147 	 !strncmp(full_fname, full_log_name, log_name_len)))
4148     {
4149       DBUG_PRINT("info", ("Found log file entry"));
4150       linfo->index_file_start_offset= offset;
4151       linfo->index_file_offset = my_b_tell(&index_file);
4152       break;
4153     }
4154   }
4155 
4156 end:
4157   if (need_lock)
4158     mysql_mutex_unlock(&LOCK_index);
4159   DBUG_RETURN(error);
4160 }
4161 
4162 
/**
  Find the next log file name in the log-index-file.

  @param
    linfo		Store here the next log file name and position to
			the file name after that.
  @param
    need_lock		Set this to 1 if the parent doesn't already have a
			lock on LOCK_index

  @note
    - Before calling this function, one has to call find_log_pos()
    to set up 'linfo'
    - Mutex needed because we need to make sure the file pointer does not move
    from under our feet

  @retval
    0			ok
  @retval
    LOG_INFO_EOF	        End of log-index-file found
  @retval
    LOG_INFO_IO		Got IO error while reading file
*/
4186 
find_next_log(LOG_INFO * linfo,bool need_lock)4187 int MYSQL_BIN_LOG::find_next_log(LOG_INFO* linfo, bool need_lock)
4188 {
4189   int error= 0;
4190   size_t length;
4191   char fname[FN_REFLEN];
4192   char *full_fname= linfo->log_file_name;
4193 
4194   if (need_lock)
4195     mysql_mutex_lock(&LOCK_index);
4196   mysql_mutex_assert_owner(&LOCK_index);
4197 
4198   /* As the file is flushed, we can't get an error here */
4199   (void) reinit_io_cache(&index_file, READ_CACHE, linfo->index_file_offset, 0,
4200 			 0);
4201 
4202   linfo->index_file_start_offset= linfo->index_file_offset;
4203   if ((length=my_b_gets(&index_file, fname, FN_REFLEN)) <= 1)
4204   {
4205     error = !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
4206     goto err;
4207   }
4208 
4209   if (fname[0] != 0)
4210   {
4211     if(normalize_binlog_name(full_fname, fname, is_relay_log))
4212     {
4213       error= LOG_INFO_EOF;
4214       goto err;
4215     }
4216     length= strlen(full_fname);
4217   }
4218 
4219   full_fname[length-1]= 0;			// kill \n
4220   linfo->index_file_offset= my_b_tell(&index_file);
4221 
4222 err:
4223   if (need_lock)
4224     mysql_mutex_unlock(&LOCK_index);
4225   return error;
4226 }
4227 
4228 
4229 /**
4230   Delete all logs referred to in the index file.
4231 
4232   The new index file will only contain this file.
4233 
4234   @param thd		  Thread id. This can be zero in case of resetting
4235                           relay logs
4236   @param create_new_log   1 if we should start writing to a new log file
4237   @param next_log_number  min number of next log file to use, if possible.
4238 
4239   @note
4240     If not called from slave thread, write start event to new log
4241 
4242   @retval
4243     0	ok
4244   @retval
4245     1   error
4246 */
4247 
reset_logs(THD * thd,bool create_new_log,rpl_gtid * init_state,uint32 init_state_len,ulong next_log_number)4248 bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log,
4249                                rpl_gtid *init_state, uint32 init_state_len,
4250                                ulong next_log_number)
4251 {
4252   LOG_INFO linfo;
4253   bool error=0;
4254   int err;
4255   const char* save_name;
4256   DBUG_ENTER("reset_logs");
4257 
4258   if (!is_relay_log)
4259   {
4260     if (init_state && !is_empty_state())
4261     {
4262       my_error(ER_BINLOG_MUST_BE_EMPTY, MYF(0));
4263       DBUG_RETURN(1);
4264     }
4265 
4266     /*
4267       Mark that a RESET MASTER is in progress.
4268       This ensures that a binlog checkpoint will not try to write binlog
4269       checkpoint events, which would be useless (as we are deleting the binlog
4270       anyway) and could deadlock, as we are holding LOCK_log.
4271 
4272       Wait for any mark_xid_done() calls that might be already running to
4273       complete (mark_xid_done_waiting counter to drop to zero); we need to
4274       do this before we take the LOCK_log to not deadlock.
4275     */
4276     mysql_mutex_lock(&LOCK_xid_list);
4277     reset_master_pending++;
4278     while (mark_xid_done_waiting > 0)
4279       mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
4280     mysql_mutex_unlock(&LOCK_xid_list);
4281   }
4282 
4283   DEBUG_SYNC_C_IF_THD(thd, "reset_logs_after_set_reset_master_pending");
4284   /*
4285     We need to get both locks to be sure that no one is trying to
4286     write to the index log file.
4287   */
4288   mysql_mutex_lock(&LOCK_log);
4289   mysql_mutex_lock(&LOCK_index);
4290 
4291   if (!is_relay_log)
4292   {
4293     /*
4294       We are going to nuke all binary log files.
4295       Without binlog, we cannot XA recover prepared-but-not-committed
4296       transactions in engines. So force a commit checkpoint first.
4297 
4298       Note that we take and immediately
4299       release LOCK_after_binlog_sync/LOCK_commit_ordered. This has
4300       the effect to ensure that any on-going group commit (in
4301       trx_group_commit_leader()) has completed before we request the checkpoint,
4302       due to the chaining of LOCK_log and LOCK_commit_ordered in that function.
4303       (We are holding LOCK_log, so no new group commit can start).
4304 
4305       Without this, it is possible (though perhaps unlikely) that the RESET
4306       MASTER could run in-between the write to the binlog and the
4307       commit_ordered() in the engine of some transaction, and then a crash
4308       later would leave such transaction not recoverable.
4309     */
4310 
4311     mysql_mutex_lock(&LOCK_after_binlog_sync);
4312     mysql_mutex_lock(&LOCK_commit_ordered);
4313     mysql_mutex_unlock(&LOCK_after_binlog_sync);
4314     mysql_mutex_unlock(&LOCK_commit_ordered);
4315 
4316     mark_xids_active(current_binlog_id, 1);
4317     do_checkpoint_request(current_binlog_id);
4318 
4319     /* Now wait for all checkpoint requests and pending unlog() to complete. */
4320     mysql_mutex_lock(&LOCK_xid_list);
4321     for (;;)
4322     {
4323       if (is_xidlist_idle_nolock())
4324         break;
4325       /*
4326         Wait until signalled that one more binlog dropped to zero, then check
4327         again.
4328       */
4329       mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
4330     }
4331 
4332     /*
4333       Now all XIDs are fully flushed to disk, and we are holding LOCK_log so
4334       no new ones will be written. So we can proceed to delete the logs.
4335     */
4336     mysql_mutex_unlock(&LOCK_xid_list);
4337   }
4338 
4339   /* Save variables so that we can reopen the log */
4340   save_name=name;
4341   name=0;					// Protect against free
4342   close(LOG_CLOSE_TO_BE_OPENED);
4343 
4344   last_used_log_number= 0;                      // Reset log number cache
4345 
4346   /*
4347     First delete all old log files and then update the index file.
4348     As we first delete the log files and do not use sort of logging,
4349     a crash may lead to an inconsistent state where the index has
4350     references to non-existent files.
4351 
4352     We need to invert the steps and use the purge_index_file methods
4353     in order to make the operation safe.
4354   */
4355 
4356   if ((err= find_log_pos(&linfo, NullS, 0)) != 0)
4357   {
4358     uint errcode= purge_log_get_error_code(err);
4359     sql_print_error("Failed to locate old binlog or relay log files");
4360     my_message(errcode, ER_THD_OR_DEFAULT(thd, errcode), MYF(0));
4361     error= 1;
4362     goto err;
4363   }
4364 
4365   for (;;)
4366   {
4367     if (unlikely((error= my_delete(linfo.log_file_name, MYF(0)))))
4368     {
4369       if (my_errno == ENOENT)
4370       {
4371         if (thd)
4372           push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4373                               ER_LOG_PURGE_NO_FILE,
4374                               ER_THD(thd, ER_LOG_PURGE_NO_FILE),
4375                               linfo.log_file_name);
4376 
4377         sql_print_information("Failed to delete file '%s'",
4378                               linfo.log_file_name);
4379         my_errno= 0;
4380         error= 0;
4381       }
4382       else
4383       {
4384         if (thd)
4385           push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4386                               ER_BINLOG_PURGE_FATAL_ERR,
4387                               "a problem with deleting %s; "
4388                               "consider examining correspondence "
4389                               "of your binlog index file "
4390                               "to the actual binlog files",
4391                               linfo.log_file_name);
4392         error= 1;
4393         goto err;
4394       }
4395     }
4396     if (find_next_log(&linfo, 0))
4397       break;
4398   }
4399 
4400   if (!is_relay_log)
4401   {
4402     if (init_state)
4403       rpl_global_gtid_binlog_state.load(init_state, init_state_len);
4404     else
4405       rpl_global_gtid_binlog_state.reset();
4406   }
4407 
4408   /* Start logging with a new file */
4409   close(LOG_CLOSE_INDEX | LOG_CLOSE_TO_BE_OPENED);
4410   // Reset (open will update)
4411   if (unlikely((error= my_delete(index_file_name, MYF(0)))))
4412   {
4413     if (my_errno == ENOENT)
4414     {
4415       if (thd)
4416         push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4417                             ER_LOG_PURGE_NO_FILE,
4418                             ER_THD(thd, ER_LOG_PURGE_NO_FILE),
4419                             index_file_name);
4420       sql_print_information("Failed to delete file '%s'",
4421                             index_file_name);
4422       my_errno= 0;
4423       error= 0;
4424     }
4425     else
4426     {
4427       if (thd)
4428         push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4429                             ER_BINLOG_PURGE_FATAL_ERR,
4430                             "a problem with deleting %s; "
4431                             "consider examining correspondence "
4432                             "of your binlog index file "
4433                             "to the actual binlog files",
4434                             index_file_name);
4435       error= 1;
4436       goto err;
4437     }
4438   }
4439   if (create_new_log && !open_index_file(index_file_name, 0, FALSE))
4440     if (unlikely((error= open(save_name, 0, next_log_number,
4441                               io_cache_type, max_size, 0, FALSE))))
4442       goto err;
4443   my_free((void *) save_name);
4444 
4445 err:
4446   if (error == 1)
4447     name= const_cast<char*>(save_name);
4448 
4449   if (!is_relay_log)
4450   {
4451     xid_count_per_binlog *b;
4452     /*
4453       Remove all entries in the xid_count list except the last.
4454       Normally we will just be deleting all the entries that we waited for to
4455       drop to zero above. But if we fail during RESET MASTER for some reason
4456       then we will not have created any new log file, and we may keep the last
4457       of the old entries.
4458     */
4459     mysql_mutex_lock(&LOCK_xid_list);
4460     for (;;)
4461     {
4462       b= binlog_xid_count_list.head();
4463       DBUG_ASSERT(b /* List can never become empty. */);
4464       if (b->binlog_id == current_binlog_id)
4465         break;
4466       DBUG_ASSERT(b->xid_count == 0);
4467       WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::reset_logs(): Removing "
4468                            "xid_list_entry for %s (%lu)", b);
4469       delete binlog_xid_count_list.get();
4470     }
4471     mysql_cond_broadcast(&COND_xid_list);
4472     reset_master_pending--;
4473     reset_master_count++;
4474     mysql_mutex_unlock(&LOCK_xid_list);
4475   }
4476 
4477   mysql_mutex_unlock(&LOCK_index);
4478   mysql_mutex_unlock(&LOCK_log);
4479   DBUG_RETURN(error);
4480 }
4481 
4482 
wait_for_last_checkpoint_event()4483 void MYSQL_BIN_LOG::wait_for_last_checkpoint_event()
4484 {
4485   mysql_mutex_lock(&LOCK_xid_list);
4486   for (;;)
4487   {
4488     if (binlog_xid_count_list.is_last(binlog_xid_count_list.head()))
4489       break;
4490     mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
4491   }
4492   mysql_mutex_unlock(&LOCK_xid_list);
4493 
4494   /*
4495     LOCK_xid_list and LOCK_log are chained, so the LOCK_log will only be
4496     obtained after mark_xid_done() has written the last checkpoint event.
4497   */
4498   mysql_mutex_lock(&LOCK_log);
4499   mysql_mutex_unlock(&LOCK_log);
4500 }
4501 
4502 
4503 /**
4504   Delete relay log files prior to rli->group_relay_log_name
4505   (i.e. all logs which are not involved in a non-finished group
4506   (transaction)), remove them from the index file and start on next
4507   relay log.
4508 
4509   IMPLEMENTATION
4510 
4511   - You must hold rli->data_lock before calling this function, since
4512     it writes group_relay_log_pos and similar fields of
4513     Relay_log_info.
4514   - Protects index file with LOCK_index
4515   - Delete relevant relay log files
4516   - Copy all file names after these ones to the front of the index file
  - If the OS has truncate, truncate the file, else fill it with '\n'
4518   - Read the next file name from the index file and store in rli->linfo
4519 
4520   @param rli	       Relay log information
4521   @param included     If false, all relay logs that are strictly before
4522                       rli->group_relay_log_name are deleted ; if true, the
4523                       latter is deleted too (i.e. all relay logs
4524                       read by the SQL slave thread are deleted).
4525 
4526   @note
    - This is only called from the slave SQL thread when it has read
    all commands from a relay log and wants to switch to a new relay log.
4529     - When this happens, we can be in an active transaction as
4530     a transaction can span over two relay logs
4531     (although it is always written as a single block to the master's binary
4532     log, hence cannot span over two master's binary logs).
4533 
4534   @retval
4535     0			ok
4536   @retval
4537     LOG_INFO_EOF	        End of log-index-file found
4538   @retval
4539     LOG_INFO_SEEK	Could not allocate IO cache
4540   @retval
4541     LOG_INFO_IO		Got IO error while reading file
4542 */
4543 
4544 #ifdef HAVE_REPLICATION
4545 
purge_first_log(Relay_log_info * rli,bool included)4546 int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
4547 {
4548   int error, errcode;
4549   char *to_purge_if_included= NULL;
4550   inuse_relaylog *ir;
4551   ulonglong log_space_reclaimed= 0;
4552   DBUG_ENTER("purge_first_log");
4553 
4554   DBUG_ASSERT(is_open());
4555   DBUG_ASSERT(rli->slave_running == MYSQL_SLAVE_RUN_NOT_CONNECT);
4556   DBUG_ASSERT(!strcmp(rli->linfo.log_file_name,rli->event_relay_log_name));
4557 
4558   mysql_mutex_assert_owner(&rli->data_lock);
4559 
4560   mysql_mutex_lock(&LOCK_index);
4561 
4562   ir= rli->inuse_relaylog_list;
4563   while (ir)
4564   {
4565     inuse_relaylog *next= ir->next;
4566     if (!ir->completed || ir->dequeued_count < ir->queued_count)
4567     {
4568       included= false;
4569       break;
4570     }
4571     if (!included && !strcmp(ir->name, rli->group_relay_log_name))
4572       break;
4573     if (!next)
4574     {
4575       rli->last_inuse_relaylog= NULL;
4576       included= 1;
4577       to_purge_if_included= my_strdup(key_memory_Relay_log_info_group_relay_log_name,
4578                                       ir->name, MYF(0));
4579     }
4580     rli->free_inuse_relaylog(ir);
4581     ir= next;
4582   }
4583   rli->inuse_relaylog_list= ir;
4584   if (ir)
4585     to_purge_if_included= my_strdup(key_memory_Relay_log_info_group_relay_log_name,
4586                                     ir->name, MYF(0));
4587 
4588   /*
4589     Read the next log file name from the index file and pass it back to
4590     the caller.
4591   */
4592   if (unlikely((error=find_log_pos(&rli->linfo, rli->event_relay_log_name,
4593                                    0))) ||
4594       unlikely((error=find_next_log(&rli->linfo, 0))))
4595   {
4596     sql_print_error("next log error: %d  offset: %llu  log: %s included: %d",
4597                     error, rli->linfo.index_file_offset,
4598                     rli->event_relay_log_name, included);
4599     goto err;
4600   }
4601 
4602   /*
4603     Reset rli's coordinates to the current log.
4604   */
4605   rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE;
4606   strmake_buf(rli->event_relay_log_name,rli->linfo.log_file_name);
4607 
4608   /*
4609     If we removed the rli->group_relay_log_name file,
4610     we must update the rli->group* coordinates, otherwise do not touch it as the
4611     group's execution is not finished (e.g. COMMIT not executed)
4612   */
4613   if (included)
4614   {
4615     rli->group_relay_log_pos = BIN_LOG_HEADER_SIZE;
4616     strmake_buf(rli->group_relay_log_name,rli->linfo.log_file_name);
4617     rli->notify_group_relay_log_name_update();
4618   }
4619 
4620   /* Store where we are in the new file for the execution thread */
4621   if (rli->flush())
4622     error= LOG_INFO_IO;
4623 
4624   DBUG_EXECUTE_IF("crash_before_purge_logs", DBUG_SUICIDE(););
4625 
4626   rli->relay_log.purge_logs(to_purge_if_included, included,
4627                             0, 0, &log_space_reclaimed);
4628 
4629   mysql_mutex_lock(&rli->log_space_lock);
4630   rli->log_space_total-= log_space_reclaimed;
4631   mysql_cond_broadcast(&rli->log_space_cond);
4632   mysql_mutex_unlock(&rli->log_space_lock);
4633 
4634   /*
4635    * Need to update the log pos because purge logs has been called
4636    * after fetching initially the log pos at the beginning of the method.
4637    */
4638   if ((errcode= find_log_pos(&rli->linfo, rli->event_relay_log_name, 0)))
4639   {
4640     sql_print_error("next log error: %d  offset: %llu  log: %s included: %d",
4641                     errcode, rli->linfo.index_file_offset,
4642                     rli->group_relay_log_name, included);
4643     goto err;
4644   }
4645 
4646   /* If included was passed, rli->linfo should be the first entry. */
4647   DBUG_ASSERT(!included || rli->linfo.index_file_start_offset == 0);
4648 
4649 err:
4650   my_free(to_purge_if_included);
4651   mysql_mutex_unlock(&LOCK_index);
4652   DBUG_RETURN(error);
4653 }
4654 
4655 /**
4656   Update log index_file.
4657 */
4658 
update_log_index(LOG_INFO * log_info,bool need_update_threads)4659 int MYSQL_BIN_LOG::update_log_index(LOG_INFO* log_info, bool need_update_threads)
4660 {
4661   if (copy_up_file_and_fill(&index_file, log_info->index_file_start_offset))
4662     return LOG_INFO_IO;
4663 
4664   // now update offsets in index file for running threads
4665   if (need_update_threads)
4666     adjust_linfo_offsets(log_info->index_file_start_offset);
4667   return 0;
4668 }
4669 
4670 /**
4671   Remove all logs before the given log from disk and from the index file.
4672 
4673   @param to_log	      Delete all log file name before this file.
4674   @param included            If true, to_log is deleted too.
  @param need_mutex          If true, LOCK_index is acquired for the duration
                             of the call; pass false when the caller already
                             holds it.
  @param need_update_threads If we want to update the log coordinates of
                             all threads. False for relay logs, true otherwise.
  @param reclaimed_space     If not null, this variable is incremented by
                             the amount of log space freed
4680 
4681   @note
4682     If any of the logs before the deleted one is in use,
4683     only purge logs up to this one.
4684 
4685   @retval
4686     0			ok
4687   @retval
4688     LOG_INFO_EOF		to_log not found
4689     LOG_INFO_EMFILE             too many files opened
4690     LOG_INFO_FATAL              if any other than ENOENT error from
4691                                 mysql_file_stat() or mysql_file_delete()
4692 */
4693 
purge_logs(const char * to_log,bool included,bool need_mutex,bool need_update_threads,ulonglong * reclaimed_space)4694 int MYSQL_BIN_LOG::purge_logs(const char *to_log,
4695                               bool included,
4696                               bool need_mutex,
4697                               bool need_update_threads,
4698                               ulonglong *reclaimed_space)
4699 {
4700   int error= 0;
4701   bool exit_loop= 0;
4702   LOG_INFO log_info;
4703   THD *thd= current_thd;
4704   DBUG_ENTER("purge_logs");
4705   DBUG_PRINT("info",("to_log= %s",to_log));
4706 
4707   if (need_mutex)
4708     mysql_mutex_lock(&LOCK_index);
4709   if (unlikely((error=find_log_pos(&log_info, to_log, 0 /*no mutex*/))) )
4710   {
4711     sql_print_error("MYSQL_BIN_LOG::purge_logs was called with file %s not "
4712                     "listed in the index.", to_log);
4713     goto err;
4714   }
4715 
4716   if (unlikely((error= open_purge_index_file(TRUE))))
4717   {
4718     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to sync the index file.");
4719     goto err;
4720   }
4721 
4722   /*
4723     File name exists in index file; delete until we find this file
4724     or a file that is used.
4725   */
4726   if (unlikely((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/))))
4727     goto err;
4728   while ((strcmp(to_log,log_info.log_file_name) || (exit_loop=included)) &&
4729          can_purge_log(log_info.log_file_name))
4730   {
4731     if (unlikely((error= register_purge_index_entry(log_info.log_file_name))))
4732     {
4733       sql_print_error("MYSQL_BIN_LOG::purge_logs failed to copy %s to register file.",
4734                       log_info.log_file_name);
4735       goto err;
4736     }
4737 
4738     if (find_next_log(&log_info, 0) || exit_loop)
4739       break;
4740   }
4741 
4742   DBUG_EXECUTE_IF("crash_purge_before_update_index", DBUG_SUICIDE(););
4743 
4744   if (unlikely((error= sync_purge_index_file())))
4745   {
4746     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to flush register file.");
4747     goto err;
4748   }
4749 
4750   /* We know how many files to delete. Update index file. */
4751   if (unlikely((error=update_log_index(&log_info, need_update_threads))))
4752   {
4753     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to update the index file");
4754     goto err;
4755   }
4756 
4757   DBUG_EXECUTE_IF("crash_purge_critical_after_update_index", DBUG_SUICIDE(););
4758 
4759 err:
4760   /* Read each entry from purge_index_file and delete the file. */
4761   if (is_inited_purge_index_file() &&
4762       (error= purge_index_entry(thd, reclaimed_space, FALSE)))
4763     sql_print_error("MYSQL_BIN_LOG::purge_logs failed to process registered files"
4764                     " that would be purged.");
4765   close_purge_index_file();
4766 
4767   DBUG_EXECUTE_IF("crash_purge_non_critical_after_update_index", DBUG_SUICIDE(););
4768 
4769   if (need_mutex)
4770     mysql_mutex_unlock(&LOCK_index);
4771   DBUG_RETURN(error);
4772 }
4773 
set_purge_index_file_name(const char * base_file_name)4774 int MYSQL_BIN_LOG::set_purge_index_file_name(const char *base_file_name)
4775 {
4776   int error= 0;
4777   DBUG_ENTER("MYSQL_BIN_LOG::set_purge_index_file_name");
4778   if (fn_format(purge_index_file_name, base_file_name, mysql_data_home,
4779                 ".~rec~", MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH |
4780                               MY_REPLACE_EXT)) == NULL)
4781   {
4782     error= 1;
4783     sql_print_error("MYSQL_BIN_LOG::set_purge_index_file_name failed to set "
4784                       "file name.");
4785   }
4786   DBUG_RETURN(error);
4787 }
4788 
open_purge_index_file(bool destroy)4789 int MYSQL_BIN_LOG::open_purge_index_file(bool destroy)
4790 {
4791   int error= 0;
4792   File file= -1;
4793 
4794   DBUG_ENTER("MYSQL_BIN_LOG::open_purge_index_file");
4795 
4796   if (destroy)
4797     close_purge_index_file();
4798 
4799   if (!my_b_inited(&purge_index_file))
4800   {
4801     if ((file= my_open(purge_index_file_name, O_RDWR | O_CREAT | O_BINARY,
4802                        MYF(MY_WME))) < 0  ||
4803         init_io_cache(&purge_index_file, file, IO_SIZE,
4804                       (destroy ? WRITE_CACHE : READ_CACHE),
4805                       0, 0, MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
4806     {
4807       error= 1;
4808       sql_print_error("MYSQL_BIN_LOG::open_purge_index_file failed to open register "
4809                       " file.");
4810     }
4811   }
4812   DBUG_RETURN(error);
4813 }
4814 
close_purge_index_file()4815 int MYSQL_BIN_LOG::close_purge_index_file()
4816 {
4817   int error= 0;
4818 
4819   DBUG_ENTER("MYSQL_BIN_LOG::close_purge_index_file");
4820 
4821   if (my_b_inited(&purge_index_file))
4822   {
4823     end_io_cache(&purge_index_file);
4824     error= my_close(purge_index_file.file, MYF(0));
4825   }
4826   my_delete(purge_index_file_name, MYF(0));
4827   bzero((char*) &purge_index_file, sizeof(purge_index_file));
4828 
4829   DBUG_RETURN(error);
4830 }
4831 
is_inited_purge_index_file()4832 bool MYSQL_BIN_LOG::is_inited_purge_index_file()
4833 {
4834   return my_b_inited(&purge_index_file);
4835 }
4836 
sync_purge_index_file()4837 int MYSQL_BIN_LOG::sync_purge_index_file()
4838 {
4839   int error= 0;
4840   DBUG_ENTER("MYSQL_BIN_LOG::sync_purge_index_file");
4841 
4842   if (unlikely((error= flush_io_cache(&purge_index_file))) ||
4843       unlikely((error= my_sync(purge_index_file.file,
4844                                MYF(MY_WME | MY_SYNC_FILESIZE)))))
4845     DBUG_RETURN(error);
4846 
4847   DBUG_RETURN(error);
4848 }
4849 
register_purge_index_entry(const char * entry)4850 int MYSQL_BIN_LOG::register_purge_index_entry(const char *entry)
4851 {
4852   int error= 0;
4853   DBUG_ENTER("MYSQL_BIN_LOG::register_purge_index_entry");
4854 
4855   if (unlikely((error=my_b_write(&purge_index_file, (const uchar*)entry,
4856                                  strlen(entry)))) ||
4857       unlikely((error=my_b_write(&purge_index_file, (const uchar*)"\n", 1))))
4858     DBUG_RETURN (error);
4859 
4860   DBUG_RETURN(error);
4861 }
4862 
register_create_index_entry(const char * entry)4863 int MYSQL_BIN_LOG::register_create_index_entry(const char *entry)
4864 {
4865   DBUG_ENTER("MYSQL_BIN_LOG::register_create_index_entry");
4866   DBUG_RETURN(register_purge_index_entry(entry));
4867 }
4868 
/**
  Delete the log files whose names are recorded in the purge register file.

  The register file is re-read from the start, one file name per line. For
  each name:
    - if the file cannot be stat'ed because it does not exist, a warning is
      issued and processing continues;
    - any other stat failure aborts with LOG_INFO_FATAL;
    - otherwise the name is looked up in the index file, and the file is
      deleted only when that lookup returns LOG_INFO_EOF.

  @param thd              Thread used for pushing warnings; may be NULL, in
                          which case some messages go to the error log only.
  @param reclaimed_space  If not NULL, incremented by the size of every
                          file that was successfully deleted.
  @param need_mutex       Passed on to find_log_pos() as its locking flag.

  @retval 0                ok
  @retval LOG_INFO_EMFILE  deleting a file failed with EMFILE
  @retval LOG_INFO_FATAL   stat or delete failed with an error other than
                           ENOENT
  @retval other            error from reinit_io_cache(), from reading the
                           register file, or from find_log_pos()
*/
int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space,
                                     bool need_mutex)
{
  DBUG_ENTER("MYSQL_BIN_LOG:purge_index_entry");
  MY_STAT s;
  int error= 0;
  LOG_INFO log_info;
  LOG_INFO check_log_info;

  DBUG_ASSERT(my_b_inited(&purge_index_file));

  /* Switch the register file cache to read mode, positioned at offset 0. */
  if (unlikely((error= reinit_io_cache(&purge_index_file, READ_CACHE, 0, 0,
                                       0))))
  {
    sql_print_error("MYSQL_BIN_LOG::purge_index_entry failed to reinit register file "
                    "for read");
    goto err;
  }

  for (;;)
  {
    size_t length;

    /* A result of 0 or 1 (at most a bare newline) ends the loop. */
    if ((length=my_b_gets(&purge_index_file, log_info.log_file_name,
                          FN_REFLEN)) <= 1)
    {
      if (purge_index_file.error)
      {
        error= purge_index_file.error;
        sql_print_error("MYSQL_BIN_LOG::purge_index_entry error %d reading from "
                        "register file.", error);
        goto err;
      }

      /* Reached EOF */
      break;
    }

    /* Get rid of the trailing '\n' */
    log_info.log_file_name[length-1]= 0;

    if (unlikely(!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s,
                                  MYF(0))))
    {
      if (my_errno == ENOENT)
      {
        /*
          It's not fatal if we can't stat a log file that does not exist;
          If we could not stat, we won't delete.
        */
        if (thd)
        {
          push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                              ER_LOG_PURGE_NO_FILE, ER_THD(thd, ER_LOG_PURGE_NO_FILE),
                              log_info.log_file_name);
        }
        sql_print_information("Failed to execute mysql_file_stat on file '%s'",
			      log_info.log_file_name);
        my_errno= 0;
      }
      else
      {
        /*
          Other than ENOENT are fatal
        */
        if (thd)
        {
          push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                              ER_BINLOG_PURGE_FATAL_ERR,
                              "a problem with getting info on being purged %s; "
                              "consider examining correspondence "
                              "of your binlog index file "
                              "to the actual binlog files",
                              log_info.log_file_name);
        }
        else
        {
          sql_print_information("Failed to delete log file '%s'; "
                                "consider examining correspondence "
                                "of your binlog index file "
                                "to the actual binlog files",
                                log_info.log_file_name);
        }
        error= LOG_INFO_FATAL;
        goto err;
      }
    }
    else
    {
      /*
        The file exists. Look it up in the index file: if the lookup succeeds
        (returns 0) nothing is done with the file; only a LOG_INFO_EOF result
        leads to deletion below. Any other lookup error aborts.
      */
      if (unlikely((error= find_log_pos(&check_log_info,
                                        log_info.log_file_name, need_mutex))))
      {
        if (error != LOG_INFO_EOF)
        {
          if (thd)
          {
            push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                                ER_BINLOG_PURGE_FATAL_ERR,
                                "a problem with deleting %s and "
                                "reading the binlog index file",
                                log_info.log_file_name);
          }
          else
          {
            sql_print_information("Failed to delete file '%s' and "
                                  "read the binlog index file",
                                  log_info.log_file_name);
          }
          goto err;
        }

        /* LOG_INFO_EOF is the expected outcome here, not a failure. */
        error= 0;

        DBUG_PRINT("info",("purging %s",log_info.log_file_name));
        if (!my_delete(log_info.log_file_name, MYF(0)))
        {
          /* Deleted: credit the size obtained from the stat call above. */
          if (reclaimed_space)
            *reclaimed_space+= s.st_size;
        }
        else
        {
          if (my_errno == ENOENT)
          {
            /* The file vanished between stat and delete: warn and go on. */
            if (thd)
            {
              push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                                  ER_LOG_PURGE_NO_FILE, ER_THD(thd, ER_LOG_PURGE_NO_FILE),
                                  log_info.log_file_name);
            }
            sql_print_information("Failed to delete file '%s'",
                                  log_info.log_file_name);
            my_errno= 0;
          }
          else
          {
            if (thd)
            {
              push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                                  ER_BINLOG_PURGE_FATAL_ERR,
                                  "a problem with deleting %s; "
                                  "consider examining correspondence "
                                  "of your binlog index file "
                                  "to the actual binlog files",
                                  log_info.log_file_name);
            }
            else
            {
              sql_print_information("Failed to delete file '%s'; "
                                    "consider examining correspondence "
                                    "of your binlog index file "
                                    "to the actual binlog files",
                                    log_info.log_file_name);
            }
            /*
              NOTE(review): my_errno is tested after the reporting calls
              above; this presumably relies on them not modifying it.
            */
            if (my_errno == EMFILE)
            {
              DBUG_PRINT("info",
                         ("my_errno: %d, set ret = LOG_INFO_EMFILE", my_errno));
              error= LOG_INFO_EMFILE;
              goto err;
            }
            error= LOG_INFO_FATAL;
            goto err;
          }
        }
      }
    }
  }

err:
  DBUG_RETURN(error);
}
5040 
5041 /**
5042   Remove all logs before the given file date from disk and from the
5043   index file.
5044 
  @param purge_time	Delete all log files last modified before this time.
                        (The THD used for warnings is taken from current_thd;
                        it is not a parameter.)
5047 
5048   @note
5049     If any of the logs before the deleted one is in use,
5050     only purge logs up to this one.
5051 
5052   @retval
5053     0				ok
5054   @retval
5055     LOG_INFO_PURGE_NO_ROTATE	Binary file that can't be rotated
5056     LOG_INFO_FATAL              if any other than ENOENT error from
5057                                 mysql_file_stat() or mysql_file_delete()
5058 */
5059 
purge_logs_before_date(time_t purge_time)5060 int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time)
5061 {
5062   int error;
5063   char to_log[FN_REFLEN];
5064   LOG_INFO log_info;
5065   MY_STAT stat_area;
5066   THD *thd= current_thd;
5067   DBUG_ENTER("purge_logs_before_date");
5068 
5069   mysql_mutex_lock(&LOCK_index);
5070   to_log[0]= 0;
5071 
5072   if (unlikely((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/))))
5073     goto err;
5074 
5075   while (strcmp(log_file_name, log_info.log_file_name) &&
5076 	 can_purge_log(log_info.log_file_name))
5077   {
5078     if (!mysql_file_stat(m_key_file_log,
5079                          log_info.log_file_name, &stat_area, MYF(0)))
5080     {
5081       if (my_errno == ENOENT)
5082       {
5083         /*
5084           It's not fatal if we can't stat a log file that does not exist.
5085         */
5086         my_errno= 0;
5087       }
5088       else
5089       {
5090         /*
5091           Other than ENOENT are fatal
5092         */
5093         if (thd)
5094         {
5095           push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
5096                               ER_BINLOG_PURGE_FATAL_ERR,
5097                               "a problem with getting info on being purged %s; "
5098                               "consider examining correspondence "
5099                               "of your binlog index file "
5100                               "to the actual binlog files",
5101                               log_info.log_file_name);
5102         }
5103         else
5104         {
5105           sql_print_information("Failed to delete log file '%s'",
5106                                 log_info.log_file_name);
5107         }
5108         error= LOG_INFO_FATAL;
5109         goto err;
5110       }
5111     }
5112     else
5113     {
5114       if (stat_area.st_mtime < purge_time)
5115         strmake_buf(to_log, log_info.log_file_name);
5116       else
5117         break;
5118     }
5119     if (find_next_log(&log_info, 0))
5120       break;
5121   }
5122 
5123   error= (to_log[0] ? purge_logs(to_log, 1, 0, 1, (ulonglong *) 0) : 0);
5124 
5125 err:
5126   mysql_mutex_unlock(&LOCK_index);
5127   DBUG_RETURN(error);
5128 }
5129 
5130 
5131 bool
can_purge_log(const char * log_file_name_arg)5132 MYSQL_BIN_LOG::can_purge_log(const char *log_file_name_arg)
5133 {
5134   xid_count_per_binlog *b;
5135 
5136   if (is_active(log_file_name_arg))
5137     return false;
5138   mysql_mutex_lock(&LOCK_xid_list);
5139   {
5140     I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
5141     while ((b= it++) &&
5142            0 != strncmp(log_file_name_arg+dirname_length(log_file_name_arg),
5143                         b->binlog_name, b->binlog_name_len))
5144       ;
5145   }
5146   mysql_mutex_unlock(&LOCK_xid_list);
5147   if (b)
5148     return false;
5149   return !log_in_use(log_file_name_arg);
5150 }
5151 #endif /* HAVE_REPLICATION */
5152 
5153 
5154 bool
is_xidlist_idle()5155 MYSQL_BIN_LOG::is_xidlist_idle()
5156 {
5157   bool res;
5158   mysql_mutex_lock(&LOCK_xid_list);
5159   res= is_xidlist_idle_nolock();
5160   mysql_mutex_unlock(&LOCK_xid_list);
5161   return res;
5162 }
5163 
5164 
5165 bool
is_xidlist_idle_nolock()5166 MYSQL_BIN_LOG::is_xidlist_idle_nolock()
5167 {
5168   xid_count_per_binlog *b;
5169 
5170   I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
5171   while ((b= it++))
5172   {
5173     if (b->xid_count > 0)
5174       return false;
5175   }
5176   return true;
5177 }
5178 
5179 /**
5180   Create a new log file name.
5181 
5182   @param buf		buf of at least FN_REFLEN where new name is stored
5183 
5184   @note
    If the file name would be longer than FN_REFLEN it will be truncated
5186 */
5187 
make_log_name(char * buf,const char * log_ident)5188 void MYSQL_BIN_LOG::make_log_name(char* buf, const char* log_ident)
5189 {
5190   size_t dir_len = dirname_length(log_file_name);
5191   if (dir_len >= FN_REFLEN)
5192     dir_len=FN_REFLEN-1;
5193   strnmov(buf, log_file_name, dir_len);
5194   strmake(buf+dir_len, log_ident, FN_REFLEN - dir_len -1);
5195 }
5196 
5197 
5198 /**
5199   Check if we are writing/reading to the given log file.
5200 */
5201 
is_active(const char * log_file_name_arg)5202 bool MYSQL_BIN_LOG::is_active(const char *log_file_name_arg)
5203 {
5204   /**
5205    * there should/must be mysql_mutex_assert_owner(&LOCK_log) here...
5206    * but code violates this! (scary monsters and super creeps!)
5207    *
5208    * example stacktrace:
5209    * #8  MYSQL_BIN_LOG::is_active
5210    * #9  MYSQL_BIN_LOG::can_purge_log
5211    * #10 MYSQL_BIN_LOG::purge_logs
5212    * #11 MYSQL_BIN_LOG::purge_first_log
5213    * #12 next_event
5214    * #13 exec_relay_log_event
5215    *
5216    * I didn't investigate if this is ligit...(i.e if my comment is wrong)
5217    */
5218   return !strcmp(log_file_name, log_file_name_arg);
5219 }
5220 
5221 
5222 /*
  Wrappers around new_file_impl to avoid using an argument
  to control locking. Such an argument 1) is less readable, 2) breaks
  encapsulation, and 3) allows external access to the class without
  a lock (which is not possible with the private new_file_without_locking
  method).
5228 
5229   @retval
5230     nonzero - error
5231 */
5232 
new_file()5233 int MYSQL_BIN_LOG::new_file()
5234 {
5235   int res;
5236   mysql_mutex_lock(&LOCK_log);
5237   res= new_file_impl();
5238   mysql_mutex_unlock(&LOCK_log);
5239   return res;
5240 }
5241 
5242 /*
5243   @retval
5244     nonzero - error
5245  */
new_file_without_locking()5246 int MYSQL_BIN_LOG::new_file_without_locking()
5247 {
5248   return new_file_impl();
5249 }
5250 
5251 
/**
  Start writing to a new log file or reopen the old file.

  The caller must already own LOCK_log.

  @retval
    nonzero - error

  @note
    The new file name is stored last in the index file
*/
5263 
int MYSQL_BIN_LOG::new_file_impl()
{
  int error= 0, close_on_error= FALSE;
  char new_name[FN_REFLEN], *new_name_ptr, *old_name, *file_to_open;
  uint close_flag;
  bool delay_close= false;
  File UNINIT_VAR(old_file);
  DBUG_ENTER("MYSQL_BIN_LOG::new_file_impl");

  DBUG_ASSERT(log_type == LOG_BIN);
  mysql_mutex_assert_owner(&LOCK_log);

  /* A closed log is left alone; error is still 0 here, so this is success */
  if (!is_open())
  {
    DBUG_PRINT("info",("log is closed"));
    DBUG_RETURN(error);
  }

  /*
    LOCK_index is held from here on; every later exit goes through the
    end/end2 labels, which release it before returning.
  */
  mysql_mutex_lock(&LOCK_index);

  /* Reuse old name if not binlog and not update log */
  new_name_ptr= name;

  /*
    If user hasn't specified an extension, generate a new log name
    We have to do this here and not in open as we want to store the
    new file name in the current binary log file.
  */
  if (unlikely((error= generate_new_name(new_name, name, 0))))
  {
    /*
      Unless ENABLE_AND_FIX_HANG is defined, close_on_error stays FALSE,
      so this failure returns the error without closing the log.
    */
#ifdef ENABLE_AND_FIX_HANG
    close_on_error= TRUE;
#endif
    goto end2;
  }
  new_name_ptr=new_name;

  {
    /*
      We log the whole file name for log file as the user may decide
      to change base names at some point.
    */
    Rotate_log_event r(new_name + dirname_length(new_name), 0, LOG_EVENT_OFFSET,
                       is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
    /*
      The current relay-log's closing Rotate event must have checksum
      value computed with an algorithm of the last relay-logged FD event.
    */
    if (is_relay_log)
      r.checksum_alg= relay_log_checksum_alg;
    DBUG_ASSERT(!is_relay_log ||
                relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
    if (DBUG_EVALUATE_IF("fault_injection_new_file_rotate_event",
                         (error= close_on_error= TRUE), FALSE) ||
        (error= write_event(&r)))
    {
      DBUG_EXECUTE_IF("fault_injection_new_file_rotate_event", errno= 2;);
      close_on_error= TRUE;
      /*
        NOTE(review): the error code is ER_ERROR_ON_WRITE while the message
        format is taken from ER_CANT_OPEN_FILE - confirm this is intended.
      */
      my_printf_error(ER_ERROR_ON_WRITE,
                      ER_THD_OR_DEFAULT(current_thd, ER_CANT_OPEN_FILE),
                      MYF(ME_FATAL), name, errno);
      goto end;
    }
    bytes_written+= r.data_written;
  }

  /*
    Update needs to be signalled even if there is no rotate event
    log rotation should give the waiting thread a signal to
    discover EOF and move on to the next log.
  */
  if (unlikely((error= flush_io_cache(&log_file))))
  {
    close_on_error= TRUE;
    goto end;
  }
  update_binlog_end_pos();

  /*
    Detach the name from the object before close(); it is handed to open()
    below as old_name and freed afterwards with my_free().
  */
  old_name=name;
  name=0;				// Don't free name
  close_flag= LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX;
  if (!is_relay_log)
  {
    /*
      We need to keep the old binlog file open (and marked as in-use) until
      the new one is fully created and synced to disk and index. Otherwise we
      leave a window where if we crash, there is no binlog file marked as
      crashed for server restart to detect the need for recovery.
    */
    old_file= log_file.file;
    close_flag|= LOG_CLOSE_DELAYED_CLOSE;
    delay_close= true;
  }
  close(close_flag);
  if (checksum_alg_reset != BINLOG_CHECKSUM_ALG_UNDEF)
  {
    DBUG_ASSERT(!is_relay_log);
    DBUG_ASSERT(binlog_checksum_options != checksum_alg_reset);
    binlog_checksum_options= checksum_alg_reset;
  }
  /*
     Note that at this point, log_state != LOG_CLOSED
     (important for is_open()).
  */

  /*
     new_file() is only used for rotation (in FLUSH LOGS or because size >
     max_binlog_size or max_relay_log_size).
     If this is a binary log, the Format_description_log_event at the
     beginning of the new file should have created=0 (to distinguish with the
     Format_description_log_event written at server startup, which should
     trigger temp tables deletion on slaves.
  */

  /* reopen index binlog file, BUG#34582 */
  /* file_to_open tracks which file is named in the error message below */
  file_to_open= index_file_name;
  error= open_index_file(index_file_name, 0, FALSE);
  if (likely(!error))
  {
    /* reopen the binary log file. */
    file_to_open= new_name_ptr;
    error= open(old_name, new_name_ptr, 0, io_cache_type, max_size, 1, FALSE);
  }

  /* handle reopening errors */
  if (unlikely(error))
  {
    my_error(ER_CANT_OPEN_FILE, MYF(ME_FATAL), file_to_open, error);
    close_on_error= TRUE;
  }

  my_free(old_name);

end:
  /* In case of errors, reuse the last generated log file name */
  if (unlikely(error))
  {
    DBUG_ASSERT(last_used_log_number > 0);
    last_used_log_number--;
  }

end2:
  /*
    delay_close is only set on the non-relay-log path, after old_file has
    been saved; the old descriptor is finally closed here.
  */
  if (delay_close)
  {
    clear_inuse_flag_when_closing(old_file);
    mysql_file_close(old_file, MYF(MY_WME));
  }

  if (unlikely(error && close_on_error)) /* rotate or reopen failed */
  {
    /*
      Close whatever was left opened.

      We are keeping the behavior as it exists today, ie,
      we disable logging and move on (see: BUG#51014).

      TODO: as part of WL#1790 consider other approaches:
       - kill mysql (safety);
       - try multiple locations for opening a log file;
       - switch server to protected/readonly mode
       - ...
    */
    close(LOG_CLOSE_INDEX);
    sql_print_error(fatal_log_error, new_name_ptr, errno);
  }

  mysql_mutex_unlock(&LOCK_index);

  DBUG_RETURN(error);
}
5434 
write_event(Log_event * ev,binlog_cache_data * cache_data,IO_CACHE * file)5435 bool MYSQL_BIN_LOG::write_event(Log_event *ev, binlog_cache_data *cache_data,
5436                                 IO_CACHE *file)
5437 {
5438   Log_event_writer writer(file, 0, &crypto);
5439   if (crypto.scheme && file == &log_file)
5440   {
5441     writer.ctx= alloca(crypto.ctx_size);
5442     writer.set_encrypted_writer();
5443   }
5444   if (cache_data)
5445     cache_data->add_status(ev->logged_status());
5446   return writer.write(ev);
5447 }
5448 
append(Log_event * ev)5449 bool MYSQL_BIN_LOG::append(Log_event *ev)
5450 {
5451   bool res;
5452   mysql_mutex_lock(&LOCK_log);
5453   res= append_no_lock(ev);
5454   mysql_mutex_unlock(&LOCK_log);
5455   return res;
5456 }
5457 
5458 
append_no_lock(Log_event * ev)5459 bool MYSQL_BIN_LOG::append_no_lock(Log_event* ev)
5460 {
5461   bool error = 0;
5462   DBUG_ENTER("MYSQL_BIN_LOG::append");
5463 
5464   mysql_mutex_assert_owner(&LOCK_log);
5465   DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);
5466 
5467   if (write_event(ev))
5468   {
5469     error=1;
5470     goto err;
5471   }
5472   bytes_written+= ev->data_written;
5473   DBUG_PRINT("info",("max_size: %lu",max_size));
5474   if (flush_and_sync(0))
5475     goto err;
5476   if (my_b_append_tell(&log_file) > max_size)
5477     error= new_file_without_locking();
5478 err:
5479   update_binlog_end_pos();
5480   DBUG_RETURN(error);
5481 }
5482 
bool MYSQL_BIN_LOG::write_event_buffer(uchar* buf, uint len)
{
  /* Pessimistic default: every early 'goto err' below reports failure */
  bool error= 1;
  /* Encrypted copy of the event; stays 0 when no crypto scheme is set */
  uchar *ebuf= 0;
  DBUG_ENTER("MYSQL_BIN_LOG::write_event_buffer");

  DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);

  mysql_mutex_assert_owner(&LOCK_log);

  if (crypto.scheme != 0)
  {
    DBUG_ASSERT(crypto.scheme == 1);

    uint elen;
    uchar iv[BINLOG_IV_LENGTH];

    ebuf= (uchar*)my_safe_alloca(len);
    if (!ebuf)
      goto err;

    /* The IV is derived from the current append position in the log */
    crypto.set_iv(iv, (uint32)my_b_append_tell(&log_file));

    /*
      we want to encrypt everything, excluding the event length:
      massage the data before the encryption
    */
    /*
      The first 4 bytes of the event are copied over the length field, so
      the caller's buffer is modified in place. Encryption then covers
      bytes 4..len-1.
    */
    memcpy(buf + EVENT_LEN_OFFSET, buf, 4);

    if (encryption_crypt(buf + 4, len - 4,
                         ebuf + 4, &elen,
                         crypto.key, crypto.key_length, iv, sizeof(iv),
                         ENCRYPTION_FLAG_ENCRYPT | ENCRYPTION_FLAG_NOPAD,
                         ENCRYPTION_KEY_SYSTEM_DATA, crypto.key_version))
      goto err;

    DBUG_ASSERT(elen == len - 4);

    /* massage the data after the encryption */
    /*
      Move the 4 encrypted bytes found at the length slot to the front of
      ebuf, then store the plain event length at EVENT_LEN_OFFSET.
    */
    memcpy(ebuf, ebuf + EVENT_LEN_OFFSET, 4);
    int4store(ebuf + EVENT_LEN_OFFSET, len);

    buf= ebuf;
  }
  if (my_b_append(&log_file, buf, len))
    goto err;
  bytes_written+= len;

  /*
    NOTE(review): error is cleared before flush_and_sync(), so a flush
    failure jumps to err with error == 0 and is reported as success -
    confirm this is intended.
  */
  error= 0;
  DBUG_PRINT("info",("max_size: %lu",max_size));
  if (flush_and_sync(0))
    goto err;
  if (my_b_append_tell(&log_file) > max_size)
    error= new_file_without_locking();
err:
  /* Also reached with ebuf == 0 (no encryption, or allocation failure) */
  my_safe_afree(ebuf, len);
  if (likely(!error))
    update_binlog_end_pos();
  DBUG_RETURN(error);
}
5543 
flush_and_sync(bool * synced)5544 bool MYSQL_BIN_LOG::flush_and_sync(bool *synced)
5545 {
5546   int err=0, fd=log_file.file;
5547   if (synced)
5548     *synced= 0;
5549   mysql_mutex_assert_owner(&LOCK_log);
5550   if (flush_io_cache(&log_file))
5551     return 1;
5552   uint sync_period= get_sync_period();
5553   if (sync_period && ++sync_counter >= sync_period)
5554   {
5555     sync_counter= 0;
5556     err= mysql_file_sync(fd, MYF(MY_WME|MY_SYNC_FILESIZE));
5557     if (synced)
5558       *synced= 1;
5559 #ifndef DBUG_OFF
5560     if (opt_binlog_dbug_fsync_sleep > 0)
5561       my_sleep(opt_binlog_dbug_fsync_sleep);
5562 #endif
5563   }
5564   return err;
5565 }
5566 
start_union_events(THD * thd,query_id_t query_id_param)5567 void MYSQL_BIN_LOG::start_union_events(THD *thd, query_id_t query_id_param)
5568 {
5569   DBUG_ASSERT(!thd->binlog_evt_union.do_union);
5570   thd->binlog_evt_union.do_union= TRUE;
5571   thd->binlog_evt_union.unioned_events= FALSE;
5572   thd->binlog_evt_union.unioned_events_trans= FALSE;
5573   thd->binlog_evt_union.first_query_id= query_id_param;
5574 }
5575 
stop_union_events(THD * thd)5576 void MYSQL_BIN_LOG::stop_union_events(THD *thd)
5577 {
5578   DBUG_ASSERT(thd->binlog_evt_union.do_union);
5579   thd->binlog_evt_union.do_union= FALSE;
5580 }
5581 
is_query_in_union(THD * thd,query_id_t query_id_param)5582 bool MYSQL_BIN_LOG::is_query_in_union(THD *thd, query_id_t query_id_param)
5583 {
5584   return (thd->binlog_evt_union.do_union &&
5585           query_id_param >= thd->binlog_evt_union.first_query_id);
5586 }
5587 
5588 /**
5589   This function checks if a transactional table was updated by the
5590   current transaction.
5591 
5592   @param thd The client thread that executed the current statement.
5593   @return
5594     @c true if a transactional table was updated, @c false otherwise.
5595 */
5596 bool
trans_has_updated_trans_table(const THD * thd)5597 trans_has_updated_trans_table(const THD* thd)
5598 {
5599   binlog_cache_mngr *const cache_mngr=
5600     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
5601 
5602   return (cache_mngr ? !cache_mngr->trx_cache.empty() : 0);
5603 }
5604 
5605 /**
5606   This function checks if a transactional table was updated by the
5607   current statement.
5608 
5609   @param thd The client thread that executed the current statement.
5610   @return
5611     @c true if a transactional table with rollback was updated,
5612     @c false otherwise.
5613 */
5614 bool
stmt_has_updated_trans_table(const THD * thd)5615 stmt_has_updated_trans_table(const THD *thd)
5616 {
5617   Ha_trx_info *ha_info;
5618 
5619   for (ha_info= thd->transaction->stmt.ha_list; ha_info;
5620        ha_info= ha_info->next())
5621   {
5622     if (ha_info->is_trx_read_write() &&
5623         !(ha_info->ht()->flags & HTON_NO_ROLLBACK))
5624       return (TRUE);
5625   }
5626   return (FALSE);
5627 }
5628 
5629 /**
5630   This function checks if either a trx-cache or a non-trx-cache should
5631   be used. If @c bin_log_direct_non_trans_update is active or the format
5632   is either MIXED or ROW, the cache to be used depends on the flag @c
5633   is_transactional.
5634 
5635   On the other hand, if binlog_format is STMT or direct option is
5636   OFF, the trx-cache should be used if and only if the statement is
5637   transactional or the trx-cache is not empty. Otherwise, the
5638   non-trx-cache should be used.
5639 
5640   @param thd              The client thread.
5641   @param is_transactional The changes are related to a trx-table.
5642   @return
5643     @c true if a trx-cache should be used, @c false otherwise.
5644 */
use_trans_cache(const THD * thd,bool is_transactional)5645 bool use_trans_cache(const THD* thd, bool is_transactional)
5646 {
5647   if (is_transactional)
5648     return 1;
5649   binlog_cache_mngr *const cache_mngr=
5650     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
5651 
5652   return ((thd->is_current_stmt_binlog_format_row() ||
5653            thd->variables.binlog_direct_non_trans_update) ? 0 :
5654           !cache_mngr->trx_cache.empty());
5655 }
5656 
5657 /**
5658   This function checks if a transaction, either a multi-statement
5659   or a single statement transaction is about to commit or not.
5660 
5661   @param thd The client thread that executed the current statement.
5662   @param all Committing a transaction (i.e. TRUE) or a statement
5663              (i.e. FALSE).
5664   @return
5665     @c true if committing a transaction, otherwise @c false.
5666 */
ending_trans(THD * thd,const bool all)5667 bool ending_trans(THD* thd, const bool all)
5668 {
5669   return (all || ending_single_stmt_trans(thd, all));
5670 }
5671 
5672 /**
5673   This function checks if a single statement transaction is about
5674   to commit or not.
5675 
5676   @param thd The client thread that executed the current statement.
5677   @param all Committing a transaction (i.e. TRUE) or a statement
5678              (i.e. FALSE).
5679   @return
5680     @c true if committing a single statement transaction, otherwise
5681     @c false.
5682 */
ending_single_stmt_trans(THD * thd,const bool all)5683 bool ending_single_stmt_trans(THD* thd, const bool all)
5684 {
5685   return (!all && !thd->in_multi_stmt_transaction_mode());
5686 }
5687 
5688 /**
5689   This function checks if a non-transactional table was updated by
5690   the current transaction.
5691 
5692   @param thd The client thread that executed the current statement.
5693   @return
5694     @c true if a non-transactional table was updated, @c false
5695     otherwise.
5696 */
trans_has_updated_non_trans_table(const THD * thd)5697 bool trans_has_updated_non_trans_table(const THD* thd)
5698 {
5699   return (thd->transaction->all.modified_non_trans_table ||
5700           thd->transaction->stmt.modified_non_trans_table);
5701 }
5702 
5703 /**
5704   This function checks if a non-transactional table was updated by the
5705   current statement.
5706 
5707   @param thd The client thread that executed the current statement.
5708   @return
5709     @c true if a non-transactional table was updated, @c false otherwise.
5710 */
stmt_has_updated_non_trans_table(const THD * thd)5711 bool stmt_has_updated_non_trans_table(const THD* thd)
5712 {
5713   return (thd->transaction->stmt.modified_non_trans_table);
5714 }
5715 
5716 /*
5717   These functions are placed in this file since they need access to
5718   binlog_hton, which has internal linkage.
5719 */
5720 
binlog_setup_trx_data()5721 binlog_cache_mngr *THD::binlog_setup_trx_data()
5722 {
5723   DBUG_ENTER("THD::binlog_setup_trx_data");
5724   binlog_cache_mngr *cache_mngr=
5725     (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
5726 
5727   if (cache_mngr)
5728     DBUG_RETURN(cache_mngr);                             // Already set up
5729 
5730   cache_mngr= (binlog_cache_mngr*) my_malloc(key_memory_binlog_cache_mngr,
5731                                   sizeof(binlog_cache_mngr), MYF(MY_ZEROFILL));
5732   if (!cache_mngr ||
5733       open_cached_file(&cache_mngr->stmt_cache.cache_log, mysql_tmpdir,
5734                        LOG_PREFIX, (size_t)binlog_stmt_cache_size, MYF(MY_WME)) ||
5735       open_cached_file(&cache_mngr->trx_cache.cache_log, mysql_tmpdir,
5736                        LOG_PREFIX, (size_t)binlog_cache_size, MYF(MY_WME)))
5737   {
5738     my_free(cache_mngr);
5739     DBUG_RETURN(0);                      // Didn't manage to set it up
5740   }
5741   thd_set_ha_data(this, binlog_hton, cache_mngr);
5742 
5743   cache_mngr= new (cache_mngr)
5744               binlog_cache_mngr(max_binlog_stmt_cache_size,
5745                                 max_binlog_cache_size,
5746                                 &binlog_stmt_cache_use,
5747                                 &binlog_stmt_cache_disk_use,
5748                                 &binlog_cache_use,
5749                                 &binlog_cache_disk_use);
5750   DBUG_RETURN(cache_mngr);
5751 }
5752 
5753 /*
5754   Function to start a statement and optionally a transaction for the
5755   binary log.
5756 
5757   SYNOPSIS
5758     binlog_start_trans_and_stmt()
5759 
5760   DESCRIPTION
5761 
5762     This function does three things:
5763     - Start a transaction if not in autocommit mode or if a BEGIN
5764       statement has been seen.
5765 
5766     - Start a statement transaction to allow us to truncate the cache.
5767 
5768     - Save the current binlog position so that we can roll back the
5769       statement by truncating the cache.
5770 
5771       We only update the saved position if the old one was undefined,
5772       the reason is that there are some cases (e.g., for CREATE-SELECT)
5773       where the position is saved twice (e.g., both in
5774       select_create::prepare() and binlog_write_table_map()) , but
5775       we should use the first. This means that calls to this function
5776       can be used to start the statement before the first table map
5777       event, to include some extra events.
5778  */
5779 
void
THD::binlog_start_trans_and_stmt()
{
  binlog_cache_mngr *cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
  DBUG_ENTER("binlog_start_trans_and_stmt");
  DBUG_PRINT("enter", ("cache_mngr: %p  cache_mngr->trx_cache.get_prev_position(): %lu",
                       cache_mngr,
                       (cache_mngr ? (ulong) cache_mngr->trx_cache.get_prev_position() :
                        (ulong) 0)));

  /*
    Only act when there is no cache manager yet or no statement start
    position has been saved; otherwise this call does nothing.
  */
  if (cache_mngr == NULL ||
      cache_mngr->trx_cache.get_prev_position() == MY_OFF_T_UNDEF)
  {
    this->binlog_set_stmt_begin();
    bool mstmt_mode= in_multi_stmt_transaction_mode();
#ifdef WITH_WSREP
    /*
      With wsrep binlog emulation we can skip the rest because the
      binlog cache will not be written into binlog. Note however that
      because of this the hton callbacks will not get called to clean
      up the cache, so this must be done explicitly when the transaction
      terminates.
    */
    if (WSREP_EMULATE_BINLOG_NNULL(this))
    {
      DBUG_VOID_RETURN;
    }
    /* If this event replicates through a master-slave then we need to
       inject manually GTID so it is preserved in the cluster. We are writing
       directly to WSREP buffer and not in IO cache because in case of IO cache
       GTID event will be duplicated in binlog.
       We have to do this only one time in mysql transaction.
       Since this function is called multiple times , We will check for
       ha_info->is_started().
    */
    Ha_trx_info *ha_info;
    /* Slot 1 is used in multi-statement mode, slot 0 otherwise */
    ha_info= this->ha_data[binlog_hton->slot].ha_info + (mstmt_mode ? 1 : 0);

    if (!ha_info->is_started() &&
        (this->variables.gtid_seq_no || this->variables.wsrep_gtid_seq_no) &&
        wsrep_on(this) &&
        (this->wsrep_cs().mode() == wsrep::client_state::m_local))
    {
      uchar *buf= 0;
      size_t len= 0;
      IO_CACHE tmp_io_cache;
      Log_event_writer writer(&tmp_io_cache, 0);
      /* If the temporary cache cannot be opened, no GTID event is injected */
      if(!open_cached_file(&tmp_io_cache, mysql_tmpdir, TEMP_PREFIX,
                          128, MYF(MY_WME)))
      {
        uint64 seqno= this->variables.gtid_seq_no;
        uint32 domain_id= this->variables.gtid_domain_id;
        uint32 server_id= this->variables.server_id;
        /* Without gtid_seq_no, fall back to the wsrep GTID values */
        if (!this->variables.gtid_seq_no && this->variables.wsrep_gtid_seq_no)
        {
          seqno= this->variables.wsrep_gtid_seq_no;
          domain_id= wsrep_gtid_server.domain_id;
          server_id= wsrep_gtid_server.server_id;
        }
        Gtid_log_event gtid_event(this, seqno, domain_id, true,
                                  LOG_EVENT_SUPPRESS_USE_F, true, 0);
        gtid_event.server_id= server_id;
        /*
          NOTE(review): the results of writer.write() and
          wsrep_write_cache_buf() are not checked - confirm that a failure
          here may be ignored.
        */
        writer.write(&gtid_event);
        wsrep_write_cache_buf(&tmp_io_cache, &buf, &len);
        if (len > 0) this->wsrep_cs().append_data(wsrep::const_buffer(buf, len));
        if (buf) my_free(buf);
        close_cached_file(&tmp_io_cache);
      }
    }
#endif
    /* Register the binlog for the full transaction only in multi-stmt mode */
    if (mstmt_mode)
      trans_register_ha(this, TRUE, binlog_hton, 0);
    trans_register_ha(this, FALSE, binlog_hton, 0);
    /*
      Mark statement transaction as read/write. We never start
      a binary log transaction and keep it read-only,
      therefore it's best to mark the transaction read/write just
      at the same time we start it.
      Not necessary to mark the normal transaction read/write
      since the statement-level flag will be propagated automatically
      inside ha_commit_trans.
    */
    ha_data[binlog_hton->slot].ha_info[0].set_trx_read_write();
  }
  DBUG_VOID_RETURN;
}
5866 
binlog_set_stmt_begin()5867 void THD::binlog_set_stmt_begin() {
5868   binlog_cache_mngr *cache_mngr=
5869     (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
5870 
5871   /*
5872     The call to binlog_trans_log_savepos() might create the cache_mngr
5873     structure, if it didn't exist before, so we save the position
5874     into an auto variable and then write it into the transaction
5875     data for the binary log (i.e., cache_mngr).
5876   */
5877   my_off_t pos= 0;
5878   binlog_trans_log_savepos(this, &pos);
5879   cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
5880   cache_mngr->trx_cache.set_prev_position(pos);
5881 }
5882 
5883 static int
binlog_start_consistent_snapshot(handlerton * hton,THD * thd)5884 binlog_start_consistent_snapshot(handlerton *hton, THD *thd)
5885 {
5886   int err= 0;
5887   DBUG_ENTER("binlog_start_consistent_snapshot");
5888 
5889   binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();
5890 
5891   /* Server layer calls us with LOCK_commit_ordered locked, so this is safe. */
5892   mysql_mutex_assert_owner(&LOCK_commit_ordered);
5893   strmake_buf(cache_mngr->last_commit_pos_file, mysql_bin_log.last_commit_pos_file);
5894   cache_mngr->last_commit_pos_offset= mysql_bin_log.last_commit_pos_offset;
5895 
5896   trans_register_ha(thd, TRUE, binlog_hton, 0);
5897 
5898   DBUG_RETURN(err);
5899 }
5900 
5901 
5902 /**
5903    Prepare all tables that are updated for row logging
5904 
5905    Annotate events and table maps are written by binlog_write_table_maps()
5906 */
5907 
binlog_prepare_for_row_logging()5908 void THD::binlog_prepare_for_row_logging()
5909 {
5910   DBUG_ENTER("THD::binlog_prepare_for_row_logging");
5911   for (TABLE *table= open_tables ; table; table= table->next)
5912   {
5913     if (table->query_id == query_id && table->current_lock == F_WRLCK)
5914       table->file->prepare_for_row_logging();
5915   }
5916   DBUG_VOID_RETURN;
5917 }
5918 
/**
   Write annotated row event (the query) if needed
*/
5922 
binlog_write_annotated_row(Log_event_writer * writer)5923 bool THD::binlog_write_annotated_row(Log_event_writer *writer)
5924 {
5925   int error;
5926   DBUG_ENTER("THD::binlog_write_annotated_row");
5927 
5928   if (!(IF_WSREP(!wsrep_fragments_certified_for_stmt(this), true) &&
5929         variables.binlog_annotate_row_events &&
5930         query_length()))
5931     DBUG_RETURN(0);
5932 
5933   Annotate_rows_log_event anno(this, 0, false);
5934   if (unlikely((error= writer->write(&anno))))
5935   {
5936     if (my_errno == EFBIG)
5937       writer->set_incident();
5938     DBUG_RETURN(error);
5939   }
5940   DBUG_RETURN(0);
5941 }
5942 
5943 
5944 /**
5945    Write table map events for all tables that are using row logging.
5946    This includes all tables used by this statement, including tables
5947    used in triggers.
5948 
5949    Also write annotate events and start transactions.
5950    This is using the "tables_with_row_logging" list prepared by
5951    THD::binlog_prepare_for_row_logging
5952 */
5953 
binlog_write_table_maps()5954 bool THD::binlog_write_table_maps()
5955 {
5956   bool with_annotate;
5957   MYSQL_LOCK *locks[2], **locks_end= locks;
5958   DBUG_ENTER("THD::binlog_write_table_maps");
5959 
5960   DBUG_ASSERT(!binlog_table_maps);
5961   DBUG_ASSERT(is_current_stmt_binlog_format_row());
5962 
5963   /* Initialize cache_mngr once per statement */
5964   binlog_start_trans_and_stmt();
5965   with_annotate= 1;                    // Write annotate with first map
5966 
5967   if ((*locks_end= extra_lock))
5968     locks_end++;
5969   if ((*locks_end= lock))
5970     locks_end++;
5971 
5972   for (MYSQL_LOCK **cur_lock= locks ; cur_lock < locks_end ; cur_lock++)
5973   {
5974     TABLE **const end_ptr= (*cur_lock)->table + (*cur_lock)->table_count;
5975     for (TABLE **table_ptr= (*cur_lock)->table;
5976          table_ptr != end_ptr ;
5977          ++table_ptr)
5978     {
5979       TABLE *table= *table_ptr;
5980       bool restore= 0;
5981       /*
5982         We have to also write table maps for tables that have not yet been
5983         used, like for tables in after triggers
5984       */
5985       if (!table->file->row_logging &&
5986           table->query_id != query_id && table->current_lock == F_WRLCK)
5987       {
5988         if (table->file->prepare_for_row_logging())
5989           restore= 1;
5990       }
5991       if (table->file->row_logging)
5992       {
5993         if (binlog_write_table_map(table, with_annotate))
5994           DBUG_RETURN(1);
5995         with_annotate= 0;
5996       }
5997       if (restore)
5998       {
5999         /*
6000           Restore original setting so that it doesn't cause problem for the
6001           next statement
6002         */
6003         table->file->row_logging= table->file->row_logging_init= 0;
6004       }
6005     }
6006   }
6007   binlog_table_maps= 1;                         // Table maps written
6008   DBUG_RETURN(0);
6009 }
6010 
6011 
/**
  This function writes a table map to the binary log.
  Whether the transactional or the non-transactional cache is used is
  taken from table->file->row_logging_has_trans, except that the
  transactional cache is always used while OPTION_GTID_BEGIN is set.

  @param table          a pointer to the table.
  @param with_annotate  If true call binlog_write_annotated_row()

  @return
    nonzero if an error pops up when writing the table map event.
*/
6024 
binlog_write_table_map(TABLE * table,bool with_annotate)6025 bool THD::binlog_write_table_map(TABLE *table, bool with_annotate)
6026 {
6027   int error;
6028   bool is_transactional= table->file->row_logging_has_trans;
6029   DBUG_ENTER("THD::binlog_write_table_map");
6030   DBUG_PRINT("enter", ("table: %p  (%s: #%lu)",
6031                        table, table->s->table_name.str,
6032                        table->s->table_map_id));
6033 
6034   /* Pre-conditions */
6035   DBUG_ASSERT(table->s->table_map_id != ULONG_MAX);
6036 
6037   /* Ensure that all events in a GTID group are in the same cache */
6038   if (variables.option_bits & OPTION_GTID_BEGIN)
6039     is_transactional= 1;
6040 
6041   Table_map_log_event
6042     the_event(this, table, table->s->table_map_id, is_transactional);
6043 
6044   binlog_cache_mngr *const cache_mngr=
6045     (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
6046   binlog_cache_data *cache_data= (cache_mngr->
6047                                   get_binlog_cache_data(is_transactional));
6048   IO_CACHE *file= &cache_data->cache_log;
6049   Log_event_writer writer(file, cache_data);
6050 
6051   if (with_annotate)
6052     if (binlog_write_annotated_row(&writer))
6053       DBUG_RETURN(1);
6054 
6055   if (unlikely((error= writer.write(&the_event))))
6056     DBUG_RETURN(error);
6057 
6058   DBUG_RETURN(0);
6059 }
6060 
6061 
6062 /**
6063   This function retrieves a pending row event from a cache which is
6064   specified through the parameter @c is_transactional. Respectively, when it
6065   is @c true, the pending event is returned from the transactional cache.
6066   Otherwise from the non-transactional cache.
6067 
6068   @param is_transactional  @c true indicates a transactional cache,
6069                            otherwise @c false a non-transactional.
6070   @return
6071     The row event if any.
6072 */
6073 Rows_log_event*
binlog_get_pending_rows_event(bool is_transactional) const6074 THD::binlog_get_pending_rows_event(bool is_transactional) const
6075 {
6076   Rows_log_event* rows= NULL;
6077   binlog_cache_mngr *const cache_mngr=
6078     (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
6079 
6080   /*
6081     This is less than ideal, but here's the story: If there is no cache_mngr,
6082     prepare_pending_rows_event() has never been called (since the cache_mngr
6083     is set up there). In that case, we just return NULL.
6084    */
6085   if (cache_mngr)
6086   {
6087     binlog_cache_data *cache_data=
6088       cache_mngr->get_binlog_cache_data(use_trans_cache(this, is_transactional));
6089 
6090     rows= cache_data->pending();
6091   }
6092   return (rows);
6093 }
6094 
6095 /**
6096   This function stores a pending row event into a cache which is specified
6097   through the parameter @c is_transactional. Respectively, when it is @c
6098   true, the pending event is stored into the transactional cache. Otherwise
6099   into the non-transactional cache.
6100 
6101   @param evt               a pointer to the row event.
6102   @param is_transactional  @c true indicates a transactional cache,
6103                            otherwise @c false a non-transactional.
6104 */
6105 void
binlog_set_pending_rows_event(Rows_log_event * ev,bool is_transactional)6106 THD::binlog_set_pending_rows_event(Rows_log_event* ev, bool is_transactional)
6107 {
6108   binlog_cache_mngr *const cache_mngr= binlog_setup_trx_data();
6109 
6110   DBUG_ASSERT(cache_mngr);
6111 
6112   binlog_cache_data *cache_data=
6113     cache_mngr->get_binlog_cache_data(use_trans_cache(this, is_transactional));
6114 
6115   cache_data->set_pending(ev);
6116 }
6117 
6118 
6119 /**
6120   This function removes the pending rows event, discarding any outstanding
6121   rows. If there is no pending rows event available, this is effectively a
6122   no-op.
6123 
6124   @param thd               a pointer to the user thread.
6125   @param is_transactional  @c true indicates a transactional cache,
6126                            otherwise @c false a non-transactional.
6127 */
6128 int
remove_pending_rows_event(THD * thd,bool is_transactional)6129 MYSQL_BIN_LOG::remove_pending_rows_event(THD *thd, bool is_transactional)
6130 {
6131   DBUG_ENTER("MYSQL_BIN_LOG::remove_pending_rows_event");
6132 
6133   binlog_cache_mngr *const cache_mngr=
6134     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
6135 
6136   DBUG_ASSERT(cache_mngr);
6137 
6138   binlog_cache_data *cache_data=
6139     cache_mngr->get_binlog_cache_data(use_trans_cache(thd, is_transactional));
6140 
6141   if (Rows_log_event* pending= cache_data->pending())
6142   {
6143     delete pending;
6144     cache_data->set_pending(NULL);
6145   }
6146 
6147   DBUG_RETURN(0);
6148 }
6149 
/**
  Moves the last bunch of rows from the pending Rows event to a cache (either
  the transactional cache if is_transactional is @c true, or the
  non-transactional cache otherwise). Sets a new pending event.

  @param thd               a pointer to the user thread.
  @param event             a pointer to the row event.
  @param is_transactional  @c true indicates a transactional cache,
                           otherwise @c false a non-transactional.

  @retval 0  the previously pending event (if any) was written to the cache
             and @c event is now the pending event.
  @retval 1  writing the previously pending event to the cache failed.
*/
6160 int
flush_and_set_pending_rows_event(THD * thd,Rows_log_event * event,bool is_transactional)6161 MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
6162                                                 Rows_log_event* event,
6163                                                 bool is_transactional)
6164 {
6165   DBUG_ENTER("MYSQL_BIN_LOG::flush_and_set_pending_rows_event(event)");
6166   DBUG_ASSERT(WSREP_EMULATE_BINLOG(thd) || mysql_bin_log.is_open());
6167   DBUG_PRINT("enter", ("event: %p", event));
6168 
6169   binlog_cache_mngr *const cache_mngr=
6170     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
6171 
6172   DBUG_ASSERT(cache_mngr);
6173 
6174   binlog_cache_data *cache_data=
6175     cache_mngr->get_binlog_cache_data(use_trans_cache(thd, is_transactional));
6176 
6177   DBUG_PRINT("info", ("cache_mngr->pending(): %p", cache_data->pending()));
6178 
6179   if (Rows_log_event* pending= cache_data->pending())
6180   {
6181     Log_event_writer writer(&cache_data->cache_log, cache_data);
6182 
6183     /*
6184       Write pending event to the cache.
6185     */
6186     DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
6187                     {DBUG_SET("+d,simulate_file_write_error");});
6188     if (writer.write(pending))
6189     {
6190       set_write_error(thd, is_transactional);
6191       if (check_write_error(thd) && cache_data &&
6192           stmt_has_updated_non_trans_table(thd))
6193         cache_data->set_incident();
6194       delete pending;
6195       cache_data->set_pending(NULL);
6196       DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
6197                       {DBUG_SET("-d,simulate_file_write_error");});
6198       DBUG_RETURN(1);
6199     }
6200 
6201     delete pending;
6202   }
6203 
6204   thd->binlog_set_pending_rows_event(event, is_transactional);
6205 
6206   DBUG_RETURN(0);
6207 }
6208 
6209 
6210 /* Generate a new global transaction ID, and write it to the binlog */
6211 
6212 bool
write_gtid_event(THD * thd,bool standalone,bool is_transactional,uint64 commit_id)6213 MYSQL_BIN_LOG::write_gtid_event(THD *thd, bool standalone,
6214                                 bool is_transactional, uint64 commit_id)
6215 {
6216   rpl_gtid gtid;
6217   uint32 domain_id;
6218   uint32 local_server_id;
6219   uint64 seq_no;
6220   int err;
6221   DBUG_ENTER("write_gtid_event");
6222   DBUG_PRINT("enter", ("standalone: %d", standalone));
6223 
6224   seq_no= thd->variables.gtid_seq_no;
6225   domain_id= thd->variables.gtid_domain_id;
6226   local_server_id= thd->variables.server_id;
6227 
6228   DBUG_ASSERT(local_server_id != 0);
6229 
6230   if (thd->variables.option_bits & OPTION_GTID_BEGIN)
6231   {
6232     DBUG_PRINT("error", ("OPTION_GTID_BEGIN is set. "
6233                          "Master and slave will have different GTID values"));
6234     /* Reset the flag, as we will write out a GTID anyway */
6235     thd->variables.option_bits&= ~OPTION_GTID_BEGIN;
6236   }
6237 
6238   /*
6239     Reset the session variable gtid_seq_no, to reduce the risk of accidentally
6240     producing a duplicate GTID.
6241   */
6242   thd->variables.gtid_seq_no= 0;
6243   if (seq_no != 0)
6244   {
6245     /* Use the specified sequence number. */
6246     gtid.domain_id= domain_id;
6247     gtid.server_id= local_server_id;
6248     gtid.seq_no= seq_no;
6249     err= rpl_global_gtid_binlog_state.update(&gtid, opt_gtid_strict_mode);
6250     if (err && thd->get_stmt_da()->sql_errno()==ER_GTID_STRICT_OUT_OF_ORDER)
6251       errno= ER_GTID_STRICT_OUT_OF_ORDER;
6252   }
6253   else
6254   {
6255     /* Allocate the next sequence number for the GTID. */
6256     err= rpl_global_gtid_binlog_state.update_with_next_gtid(domain_id,
6257                                                             local_server_id, &gtid);
6258     seq_no= gtid.seq_no;
6259   }
6260   if (err)
6261     DBUG_RETURN(true);
6262 
6263   thd->set_last_commit_gtid(gtid);
6264 
6265   Gtid_log_event gtid_event(thd, seq_no, domain_id, standalone,
6266                             LOG_EVENT_SUPPRESS_USE_F, is_transactional,
6267                             commit_id);
6268 
6269   /* Write the event to the binary log. */
6270   DBUG_ASSERT(this == &mysql_bin_log);
6271 
6272 #ifdef WITH_WSREP
6273   if (wsrep_gtid_mode)
6274   {
6275     thd->variables.gtid_domain_id= global_system_variables.gtid_domain_id;
6276     thd->variables.server_id= global_system_variables.server_id;
6277   }
6278 #endif
6279 
6280   if (write_event(&gtid_event))
6281     DBUG_RETURN(true);
6282   status_var_add(thd->status_var.binlog_bytes_written, gtid_event.data_written);
6283 
6284   DBUG_RETURN(false);
6285 }
6286 
6287 
6288 int
write_state_to_file()6289 MYSQL_BIN_LOG::write_state_to_file()
6290 {
6291   File file_no;
6292   IO_CACHE cache;
6293   char buf[FN_REFLEN];
6294   int err;
6295   bool opened= false;
6296   bool log_inited= false;
6297 
6298   fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
6299             MY_UNPACK_FILENAME);
6300   if ((file_no= mysql_file_open(key_file_binlog_state, buf,
6301                                 O_RDWR|O_CREAT|O_TRUNC|O_BINARY,
6302                                 MYF(MY_WME))) < 0)
6303   {
6304     err= 1;
6305     goto err;
6306   }
6307   opened= true;
6308   if ((err= init_io_cache(&cache, file_no, IO_SIZE, WRITE_CACHE, 0, 0,
6309                            MYF(MY_WME|MY_WAIT_IF_FULL))))
6310     goto err;
6311   log_inited= true;
6312   if ((err= rpl_global_gtid_binlog_state.write_to_iocache(&cache)))
6313     goto err;
6314   log_inited= false;
6315   if ((err= end_io_cache(&cache)))
6316     goto err;
6317   if ((err= mysql_file_sync(file_no, MYF(MY_WME|MY_SYNC_FILESIZE))))
6318     goto err;
6319   goto end;
6320 
6321 err:
6322   sql_print_error("Error writing binlog state to file '%s'.", buf);
6323   if (log_inited)
6324     end_io_cache(&cache);
6325 end:
6326   if (opened)
6327     mysql_file_close(file_no, MYF(0));
6328 
6329   return err;
6330 }
6331 
6332 
6333 /*
6334   Initialize the binlog state from the master-bin.state file, at server startup.
6335 
6336   Returns:
6337     0 for success.
6338     2 for when .state file did not exist.
6339     1 for other error.
6340 */
6341 int
read_state_from_file()6342 MYSQL_BIN_LOG::read_state_from_file()
6343 {
6344   File file_no;
6345   IO_CACHE cache;
6346   char buf[FN_REFLEN];
6347   int err;
6348   bool opened= false;
6349   bool log_inited= false;
6350 
6351   fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
6352             MY_UNPACK_FILENAME);
6353   if ((file_no= mysql_file_open(key_file_binlog_state, buf,
6354                                 O_RDONLY|O_BINARY, MYF(0))) < 0)
6355   {
6356     if (my_errno != ENOENT)
6357     {
6358       err= 1;
6359       goto err;
6360     }
6361     else
6362     {
6363       /*
6364         If the state file does not exist, this is the first server startup
6365         with GTID enabled. So initialize to empty state.
6366       */
6367       rpl_global_gtid_binlog_state.reset();
6368       err= 2;
6369       goto end;
6370     }
6371   }
6372   opened= true;
6373   if ((err= init_io_cache(&cache, file_no, IO_SIZE, READ_CACHE, 0, 0,
6374                           MYF(MY_WME|MY_WAIT_IF_FULL))))
6375     goto err;
6376   log_inited= true;
6377   if ((err= rpl_global_gtid_binlog_state.read_from_iocache(&cache)))
6378     goto err;
6379   goto end;
6380 
6381 err:
6382   sql_print_error("Error reading binlog GTID state from file '%s'.", buf);
6383 end:
6384   if (log_inited)
6385     end_io_cache(&cache);
6386   if (opened)
6387     mysql_file_close(file_no, MYF(0));
6388 
6389   return err;
6390 }
6391 
6392 
6393 int
get_most_recent_gtid_list(rpl_gtid ** list,uint32 * size)6394 MYSQL_BIN_LOG::get_most_recent_gtid_list(rpl_gtid **list, uint32 *size)
6395 {
6396   return rpl_global_gtid_binlog_state.get_most_recent_gtid_list(list, size);
6397 }
6398 
6399 
6400 bool
append_state_pos(String * str)6401 MYSQL_BIN_LOG::append_state_pos(String *str)
6402 {
6403   return rpl_global_gtid_binlog_state.append_pos(str);
6404 }
6405 
6406 
6407 bool
append_state(String * str)6408 MYSQL_BIN_LOG::append_state(String *str)
6409 {
6410   return rpl_global_gtid_binlog_state.append_state(str);
6411 }
6412 
6413 
6414 bool
is_empty_state()6415 MYSQL_BIN_LOG::is_empty_state()
6416 {
6417   return (rpl_global_gtid_binlog_state.count() == 0);
6418 }
6419 
6420 
6421 bool
find_in_binlog_state(uint32 domain_id,uint32 server_id_arg,rpl_gtid * out_gtid)6422 MYSQL_BIN_LOG::find_in_binlog_state(uint32 domain_id, uint32 server_id_arg,
6423                                     rpl_gtid *out_gtid)
6424 {
6425   rpl_gtid *gtid;
6426   if ((gtid= rpl_global_gtid_binlog_state.find(domain_id, server_id_arg)))
6427     *out_gtid= *gtid;
6428   return gtid != NULL;
6429 }
6430 
6431 
6432 bool
lookup_domain_in_binlog_state(uint32 domain_id,rpl_gtid * out_gtid)6433 MYSQL_BIN_LOG::lookup_domain_in_binlog_state(uint32 domain_id,
6434                                              rpl_gtid *out_gtid)
6435 {
6436   rpl_gtid *found_gtid;
6437 
6438   if ((found_gtid= rpl_global_gtid_binlog_state.find_most_recent(domain_id)))
6439   {
6440     *out_gtid= *found_gtid;
6441     return true;
6442   }
6443 
6444   return false;
6445 }
6446 
6447 
6448 int
bump_seq_no_counter_if_needed(uint32 domain_id,uint64 seq_no)6449 MYSQL_BIN_LOG::bump_seq_no_counter_if_needed(uint32 domain_id, uint64 seq_no)
6450 {
6451   return rpl_global_gtid_binlog_state.bump_seq_no_if_needed(domain_id, seq_no);
6452 }
6453 
6454 
6455 bool
check_strict_gtid_sequence(uint32 domain_id,uint32 server_id_arg,uint64 seq_no)6456 MYSQL_BIN_LOG::check_strict_gtid_sequence(uint32 domain_id,
6457                                           uint32 server_id_arg,
6458                                           uint64 seq_no)
6459 {
6460   return rpl_global_gtid_binlog_state.check_strict_sequence(domain_id,
6461                                                             server_id_arg,
6462                                                             seq_no);
6463 }
6464 
6465 
6466 /**
6467   Write an event to the binary log. If with_annotate != NULL and
6468   *with_annotate = TRUE write also Annotate_rows before the event
6469   (this should happen only if the event is a Table_map).
6470 */
6471 
bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
{
  THD *thd= event_info->thd;
  /*
    error starts out as 1 and is only cleared right before the err: label,
    once the event itself has been written.
  */
  bool error= 1;
  /* Stays NULL on the direct-logging path; set only when a cache is used. */
  binlog_cache_data *cache_data= 0;
  bool is_trans_cache= FALSE;
  bool using_trans= event_info->use_trans_cache();
  bool direct= event_info->use_direct_logging();
  ulong UNINIT_VAR(prev_binlog_id);
  DBUG_ENTER("MYSQL_BIN_LOG::write(Log_event *)");

  /*
    When binary logging is not enabled (--log-bin=0), wsrep-patch partially
    enables it without opening the binlog file (MYSQL_BIN_LOG::open().
    So, avoid writing to binlog file.
  */
  if (direct &&
      (wsrep_emulate_bin_log ||
       (WSREP(thd) && !(thd->variables.option_bits & OPTION_BIN_LOG))))
    DBUG_RETURN(0);

  if (thd->variables.option_bits &
      (OPTION_GTID_BEGIN | OPTION_BIN_COMMIT_OFF))
  {
    DBUG_PRINT("info", ("OPTION_GTID_BEGIN was set"));
    /* Wait for commit from binary log before we commit */
    direct= 0;
    using_trans= 1;
    /* Set cache_type to ensure we don't get checksums for this event */
    event_info->cache_type= Log_event::EVENT_TRANSACTIONAL_CACHE;
  }

  if (thd->binlog_evt_union.do_union)
  {
    /*
      In Stored function; Remember that function call caused an update.
      We will log the function call to the binary log on function exit
    */
    thd->binlog_evt_union.unioned_events= TRUE;
    thd->binlog_evt_union.unioned_events_trans |= using_trans;
    DBUG_RETURN(0);
  }

  /*
    We only end the statement if we are in a top-level statement.  If
    we are inside a stored function, we do not end the statement since
    this will close all tables on the slave. But there can be a special case
    where we are inside a stored function/trigger and a SAVEPOINT is being
    set in side the stored function/trigger. This SAVEPOINT execution will
    force the pending event to be flushed without an STMT_END_F flag. This
    will result in a case where following DMLs will be considered as part of
    same statement and result in data loss on slave. Hence in this case we
    force the end_stmt to be true.
  */
  bool const end_stmt= (thd->in_sub_stmt && thd->lex->sql_command ==
                        SQLCOM_SAVEPOINT) ? true :
    (thd->locked_tables_mode && thd->lex->requires_prelocking());
  /* A failed flush of the pending rows event returns error, which is still 1. */
  if (thd->binlog_flush_pending_rows_event(end_stmt, using_trans))
    DBUG_RETURN(error);

  /*
     In most cases this is only called if 'is_open()' is true; in fact this is
     mostly called if is_open() *was* true a few instructions before, but it
     could have changed since.
  */
  /* applier and replayer can skip writing binlog events */
  if ((WSREP_EMULATE_BINLOG(thd) &&
       IF_WSREP(thd->wsrep_cs().mode() == wsrep::client_state::m_local, 0)) || is_open())
  {
    my_off_t UNINIT_VAR(my_org_b_tell);
#ifdef HAVE_REPLICATION
    /*
      In the future we need to add to the following if tests like
      "do the involved tables match (to be implemented)
      binlog_[wild_]{do|ignore}_table?" (WL#1049)"
    */
    const char *local_db= event_info->get_db();

    bool option_bin_log_flag= (thd->variables.option_bits & OPTION_BIN_LOG);

    /*
      Log all updates to binlog cache so that they can get replicated to other
      nodes. A check has been added to stop them from getting logged into
      binary log files.
    */
    if (WSREP(thd))
      option_bin_log_flag= true;

    /*
      Skip the event (returning 0) when binlogging is off for this session,
      or when the database is rejected by binlog_filter. SAVEPOINT and
      ROLLBACK TO SAVEPOINT are exempt from the database filter.
    */
    if ((!(option_bin_log_flag)) ||
	(thd->lex->sql_command != SQLCOM_ROLLBACK_TO_SAVEPOINT &&
         thd->lex->sql_command != SQLCOM_SAVEPOINT &&
         !binlog_filter->db_ok(local_db)))
      DBUG_RETURN(0);
#endif /* HAVE_REPLICATION */

    IO_CACHE *file= NULL;

    if (direct)
    {
      /* We come here only for incident events */
      int res;
      uint64 commit_id= 0;
      MDL_request mdl_request;
      DBUG_PRINT("info", ("direct is set"));
      DBUG_ASSERT(!thd->backup_commit_lock);

      /*
        Take an explicit MDL_BACKUP_COMMIT lock. It is released either on
        the wait_for_prior_commit() error path or right after
        write_gtid_event() below.
      */
      MDL_REQUEST_INIT(&mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
                     MDL_EXPLICIT);
      if (thd->mdl_context.acquire_lock(&mdl_request,
                                        thd->variables.lock_wait_timeout))
        DBUG_RETURN(1);
      thd->backup_commit_lock= &mdl_request;

      if ((res= thd->wait_for_prior_commit()))
      {
        if (mdl_request.ticket)
          thd->mdl_context.release_lock(mdl_request.ticket);
        thd->backup_commit_lock= 0;
        DBUG_RETURN(res);
      }
      file= &log_file;
      /*
        NOTE(review): the start offset is sampled before LOCK_log is taken.
        It is only used for the binlog_bytes_written status counter further
        down - confirm that sampling it outside the lock is intentional.
      */
      my_org_b_tell= my_b_tell(file);
      /* LOCK_log is held from here until it is released after the err: label. */
      mysql_mutex_lock(&LOCK_log);
      prev_binlog_id= current_binlog_id;
      DBUG_EXECUTE_IF("binlog_force_commit_id",
        {
          const LEX_CSTRING commit_name= { STRING_WITH_LEN("commit_id") };
          bool null_value;
          user_var_entry *entry=
            (user_var_entry*) my_hash_search(&thd->user_vars,
                                             (uchar*) commit_name.str,
                                             commit_name.length);
          commit_id= entry->val_int(&null_value);
        });
      /* Second argument true: the GTID event is written as standalone. */
      res= write_gtid_event(thd, true, using_trans, commit_id);
      if (mdl_request.ticket)
        thd->mdl_context.release_lock(mdl_request.ticket);
      thd->backup_commit_lock= 0;
      if (res)
        goto err;
    }
    else
    {
      /*
        Not direct: the event goes into the cache selected by
        use_trans_cache(). A NULL cache manager is treated as a write error.
      */
      binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();
      if (!cache_mngr)
        goto err;

      is_trans_cache= use_trans_cache(thd, using_trans);
      cache_data= cache_mngr->get_binlog_cache_data(is_trans_cache);
      file= &cache_data->cache_log;

      if (thd->lex->stmt_accessed_non_trans_temp_table() && is_trans_cache)
        thd->transaction->stmt.mark_modified_non_trans_temp_table();
      thd->binlog_start_trans_and_stmt();
    }
    DBUG_PRINT("info",("event type: %d",event_info->get_type_code()));

    /*
       No check for auto events flag here - this write method should
       never be called if auto-events are enabled.

       Write first log events which describe the 'run environment'
       of the SQL command. If row-based binlogging, Insert_id, Rand
       and other kind of "setting context" events are not needed.
    */

    if (with_annotate && *with_annotate)
    {
      DBUG_ASSERT(event_info->get_type_code() == TABLE_MAP_EVENT);
      Annotate_rows_log_event anno(thd, using_trans, direct);
      /* Annotate event should be written not more than once */
      *with_annotate= 0;
      if (write_event(&anno, cache_data, file))
        goto err;
    }

    {
      /* The context events below are written only when not in row format. */
      if (!thd->is_current_stmt_binlog_format_row())
      {
        if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
        {
          Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT,
                             thd->first_successful_insert_id_in_prev_stmt_for_binlog,
                             using_trans, direct);
          if (write_event(&e, cache_data, file))
            goto err;
        }
        if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
        {
          DBUG_PRINT("info",("number of auto_inc intervals: %u",
                             thd->auto_inc_intervals_in_cur_stmt_for_binlog.
                             nb_elements()));
          Intvar_log_event e(thd, (uchar) INSERT_ID_EVENT,
                             thd->auto_inc_intervals_in_cur_stmt_for_binlog.
                             minimum(), using_trans, direct);
          if (write_event(&e, cache_data, file))
            goto err;
        }
        if (thd->rand_used)
        {
          Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2,
                           using_trans, direct);
          if (write_event(&e, cache_data, file))
            goto err;
        }
        if (thd->user_var_events.elements)
        {
          /* One User_var_log_event per entry recorded in thd->user_var_events. */
          for (uint i= 0; i < thd->user_var_events.elements; i++)
          {
            BINLOG_USER_VAR_EVENT *user_var_event;
            get_dynamic(&thd->user_var_events,(uchar*) &user_var_event, i);

            /* setting flags for user var log event */
            uchar flags= User_var_log_event::UNDEF_F;
            if (user_var_event->unsigned_flag)
              flags|= User_var_log_event::UNSIGNED_F;

            User_var_log_event e(thd, user_var_event->user_var_event->name.str,
                                 user_var_event->user_var_event->name.length,
                                 user_var_event->value,
                                 user_var_event->length,
                                 user_var_event->type,
                                 user_var_event->charset_number,
                                 flags,
                                 using_trans,
                                 direct);
            if (write_event(&e, cache_data, file))
              goto err;
          }
        }
      }
    }

    /*
      Write the event.
    */
    if (write_event(event_info, cache_data, file) ||
        DBUG_EVALUATE_IF("injecting_fault_writing", 1, 0))
      goto err;

    error= 0;
    /*
      Both the success path (error == 0) and every failed write above
      (error still 1) continue here.
    */
err:
    if (direct)
    {
      /* Position of the binlog file after the writes above. */
      my_off_t offset= my_b_tell(file);
      bool check_purge= false;
      DBUG_ASSERT(!is_relay_log);

      if (likely(!error))
      {
        bool synced;

        if ((error= flush_and_sync(&synced)))
        {
          /* flush_and_sync() failed; error is handled at the end. */
        }
        else
        {
          mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
          mysql_mutex_assert_owner(&LOCK_log);
          mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
          mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
#ifdef HAVE_REPLICATION
          if (repl_semisync_master.report_binlog_update(thd, log_file_name,
                                                        file->pos_in_file))
          {
            sql_print_error("Failed to run 'after_flush' hooks");
            error= 1;
          }
          else
#endif
          {
            /*
              update binlog_end_pos so it can be read by dump thread
              note: must be _after_ the RUN_HOOK(after_flush) or else
              semi-sync might not have put the transaction into
              it's list before dump-thread tries to send it
            */
            update_binlog_end_pos(offset);
            /* A failed rotate() must not trigger checkpoint_and_purge(). */
            if (unlikely((error= rotate(false, &check_purge))))
              check_purge= false;
          }
        }
      }

      /* Account the bytes added to the binlog file since my_org_b_tell. */
      status_var_add(thd->status_var.binlog_bytes_written,
                     offset - my_org_b_tell);

      /* Lock hand-over: LOCK_after_binlog_sync is taken before LOCK_log is released. */
      mysql_mutex_lock(&LOCK_after_binlog_sync);
      mysql_mutex_unlock(&LOCK_log);

      mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
      mysql_mutex_assert_not_owner(&LOCK_log);
      mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
      mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
#ifdef HAVE_REPLICATION
      if (repl_semisync_master.wait_after_sync(log_file_name,
                                               file->pos_in_file))
      {
        error=1;
        /* error is already printed inside hook */
      }
#endif

      /*
        Take mutex to protect against a reader seeing partial writes of 64-bit
        offset on 32-bit CPUs.
      */
      mysql_mutex_lock(&LOCK_commit_ordered);
      mysql_mutex_unlock(&LOCK_after_binlog_sync);
      last_commit_pos_offset= offset;
      mysql_mutex_unlock(&LOCK_commit_ordered);

      /* Runs after LOCK_log was released above; only set when rotate() succeeded. */
      if (check_purge)
        checkpoint_and_purge(prev_binlog_id);
    }

    if (unlikely(error))
    {
      /*
        Record the write error on the THD. cache_data is NULL on the direct
        path, hence the explicit check before set_incident().
      */
      set_write_error(thd, is_trans_cache);
      if (check_write_error(thd) && cache_data &&
          stmt_has_updated_non_trans_table(thd))
        cache_data->set_incident();
    }
  }

  DBUG_RETURN(error);
}
6799 
6800 
error_log_print(enum loglevel level,const char * format,va_list args)6801 int error_log_print(enum loglevel level, const char *format,
6802                     va_list args)
6803 {
6804   return logger.error_log_print(level, format, args);
6805 }
6806 
6807 
slow_log_print(THD * thd,const char * query,uint query_length,ulonglong current_utime)6808 bool slow_log_print(THD *thd, const char *query, uint query_length,
6809                     ulonglong current_utime)
6810 {
6811   return logger.slow_log_print(thd, query, query_length, current_utime);
6812 }
6813 
6814 
6815 /**
6816   Decide if we should log the command to general log
6817 
6818   @retval
6819      FALSE  No logging
6820      TRUE   Ok to log
6821 */
6822 
log_command(THD * thd,enum enum_server_command command)6823 bool LOGGER::log_command(THD *thd, enum enum_server_command command)
6824 {
6825   /*
6826     Log command if we have at least one log event handler enabled and want
6827     to log this king of commands
6828   */
6829   if (!(*general_log_handler_list && (what_to_log & (1L << (uint) command))))
6830     return FALSE;
6831 
6832   /*
6833     If LOG_SLOW_DISABLE_SLAVE is set when slave thread starts, then
6834     OPTION_LOG_OFF is set.
6835     Only the super user can set this bit.
6836   */
6837   return !(thd->variables.option_bits & OPTION_LOG_OFF);
6838 }
6839 
6840 
general_log_print(THD * thd,enum enum_server_command command,const char * format,...)6841 bool general_log_print(THD *thd, enum enum_server_command command,
6842                        const char *format, ...)
6843 {
6844   va_list args;
6845   uint error= 0;
6846 
6847   /* Print the message to the buffer if we want to log this kind of commands */
6848   if (! logger.log_command(thd, command))
6849     return FALSE;
6850 
6851   va_start(args, format);
6852   error= logger.general_log_print(thd, command, format, args);
6853   va_end(args);
6854 
6855   return error;
6856 }
6857 
general_log_write(THD * thd,enum enum_server_command command,const char * query,size_t query_length)6858 bool general_log_write(THD *thd, enum enum_server_command command,
6859                        const char *query, size_t query_length)
6860 {
6861   /* Write the message to the log if we want to log this king of commands */
6862   if (logger.log_command(thd, command) || mysql_audit_general_enabled())
6863     return logger.general_log_write(thd, command, query, query_length);
6864 
6865   return FALSE;
6866 }
6867 
6868 
6869 static void
binlog_checkpoint_callback(void * cookie)6870 binlog_checkpoint_callback(void *cookie)
6871 {
6872   MYSQL_BIN_LOG::xid_count_per_binlog *entry=
6873     (MYSQL_BIN_LOG::xid_count_per_binlog *)cookie;
6874   /*
6875     For every supporting engine, we increment the xid_count and issue a
6876     commit_checkpoint_request(). Then we can count when all
6877     commit_checkpoint_notify() callbacks have occurred, and then log a new
6878     binlog checkpoint event.
6879   */
6880   mysql_bin_log.mark_xids_active(entry->binlog_id, 1);
6881 }
6882 
6883 
6884 /*
6885   Request a commit checkpoint from each supporting engine.
6886   This must be called after each binlog rotate, and after LOCK_log has been
6887   released. The xid_count value in the xid_count_per_binlog entry was
6888   incremented by 1 and will be decremented in this function; this ensures
6889   that the entry will not go away early despite LOCK_log not being held.
6890 */
6891 void
do_checkpoint_request(ulong binlog_id)6892 MYSQL_BIN_LOG::do_checkpoint_request(ulong binlog_id)
6893 {
6894   xid_count_per_binlog *entry;
6895 
6896   /*
6897     Find the binlog entry, and invoke commit_checkpoint_request() on it in
6898     each supporting storage engine.
6899   */
6900   mysql_mutex_lock(&LOCK_xid_list);
6901   I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
6902   do {
6903     entry= it++;
6904     DBUG_ASSERT(entry /* binlog_id is always somewhere in the list. */);
6905   } while (entry->binlog_id != binlog_id);
6906   mysql_mutex_unlock(&LOCK_xid_list);
6907 
6908   ha_commit_checkpoint_request(entry, binlog_checkpoint_callback);
6909   /*
6910     When we rotated the binlog, we incremented xid_count to make sure the
6911     entry would not go away until this point, where we have done all necessary
6912     commit_checkpoint_request() calls.
6913     So now we can (and must) decrease the count - when it reaches zero, we
6914     will know that both all pending unlog() and all pending
6915     commit_checkpoint_notify() calls are done, and we can log a new binlog
6916     checkpoint.
6917   */
6918   mark_xid_done(binlog_id, true);
6919 }
6920 
6921 
6922 /**
6923   The method executes rotation when LOCK_log is already acquired
6924   by the caller.
6925 
6926   @param force_rotate  caller can request the log rotation
6927   @param check_purge   is set to true if rotation took place
6928 
6929   @note
6930     Caller _must_ check the check_purge variable. If this is set, it means
6931     that the binlog was rotated, and caller _must_ ensure that
6932     do_checkpoint_request() is called later with the binlog_id of the rotated
6933     binlog file. The call to do_checkpoint_request() must happen after
6934     LOCK_log is released (which is why we cannot simply do it here).
6935     Usually, checkpoint_and_purge() is appropriate, as it will both handle
6936     the checkpointing and any needed purging of old logs.
6937 
6938   @note
6939     If rotation fails, for instance the server was unable
6940     to create a new log file, we still try to write an
6941     incident event to the current log.
6942 
6943   @retval
6944     nonzero - error in rotating routine.
6945 */
rotate(bool force_rotate,bool * check_purge)6946 int MYSQL_BIN_LOG::rotate(bool force_rotate, bool* check_purge)
6947 {
6948   int error= 0;
6949   DBUG_ENTER("MYSQL_BIN_LOG::rotate");
6950 
6951 #ifdef WITH_WSREP
6952   if (WSREP_ON && wsrep_to_isolation)
6953   {
6954     *check_purge= false;
6955     WSREP_DEBUG("avoiding binlog rotate due to TO isolation: %d",
6956                 wsrep_to_isolation);
6957     DBUG_RETURN(0);
6958   }
6959 #endif /* WITH_WSREP */
6960 
6961   //todo: fix the macro def and restore safe_mutex_assert_owner(&LOCK_log);
6962   *check_purge= false;
6963 
6964   if (force_rotate || (my_b_tell(&log_file) >= (my_off_t) max_size))
6965   {
6966     ulong binlog_id= current_binlog_id;
6967     /*
6968       We rotate the binlog, so we need to start a commit checkpoint in all
6969       supporting engines - when it finishes, we can log a new binlog checkpoint
6970       event.
6971 
6972       But we cannot start the checkpoint here - there could be a group commit
6973       still in progress which needs to be included in the checkpoint, and
6974       besides we do not want to do the (possibly expensive) checkpoint while
6975       LOCK_log is held.
6976 
6977       On the other hand, we must be sure that the xid_count entry for the
6978       previous log does not go away until we start the checkpoint - which it
6979       could do as it is no longer the most recent. So we increment xid_count
6980       (to count the pending checkpoint request) - this will fix the entry in
6981       place until we decrement again in do_checkpoint_request().
6982     */
6983     mark_xids_active(binlog_id, 1);
6984 
6985     if (unlikely((error= new_file_without_locking())))
6986     {
6987       /**
6988          Be conservative... There are possible lost events (eg,
6989          failing to log the Execute_load_query_log_event
6990          on a LOAD DATA while using a non-transactional
6991          table)!
6992 
6993          We give it a shot and try to write an incident event anyway
6994          to the current log.
6995       */
6996       if (!write_incident_already_locked(current_thd))
6997         flush_and_sync(0);
6998 
6999       /*
7000         We failed to rotate - so we have to decrement the xid_count back that
7001         we incremented before attempting the rotate.
7002       */
7003       mark_xid_done(binlog_id, false);
7004     }
7005     else
7006       *check_purge= true;
7007   }
7008   DBUG_RETURN(error);
7009 }
7010 
7011 /**
7012   The method executes logs purging routine.
7013 
7014   @retval
7015     nonzero - error in rotating routine.
7016 */
purge()7017 void MYSQL_BIN_LOG::purge()
7018 {
7019   mysql_mutex_assert_not_owner(&LOCK_log);
7020 #ifdef HAVE_REPLICATION
7021   if (expire_logs_days)
7022   {
7023     DEBUG_SYNC(current_thd, "at_purge_logs_before_date");
7024     time_t purge_time= my_time(0) - expire_logs_days*24*60*60;
7025     if (purge_time >= 0)
7026     {
7027       purge_logs_before_date(purge_time);
7028     }
7029     DEBUG_SYNC(current_thd, "after_purge_logs_before_date");
7030   }
7031 #endif
7032 }
7033 
7034 
/**
  Issue the checkpoint request for a binlog and then run purge().

  @param binlog_id  id handed unchanged to do_checkpoint_request()

  @note
    purge() asserts that LOCK_log is not held, so this must be called
    without LOCK_log.
*/
void MYSQL_BIN_LOG::checkpoint_and_purge(ulong binlog_id)
{
  do_checkpoint_request(binlog_id);
  purge();
}
7040 
7041 
7042 /**
7043   Searches for the first (oldest) binlog file name in in the binlog index.
7044 
7045   @param[in,out]  buf_arg  pointer to a buffer to hold found
7046                            the first binary log file name
7047   @return         NULL     on success, otherwise error message
7048 */
get_first_binlog(char * buf_arg)7049 static const char* get_first_binlog(char* buf_arg)
7050 {
7051   IO_CACHE *index_file;
7052   size_t length;
7053   char fname[FN_REFLEN];
7054   const char* errmsg= NULL;
7055 
7056   DBUG_ENTER("get_first_binlog");
7057 
7058   DBUG_ASSERT(mysql_bin_log.is_open());
7059 
7060   mysql_bin_log.lock_index();
7061 
7062   index_file=mysql_bin_log.get_index_file();
7063   if (reinit_io_cache(index_file, READ_CACHE, (my_off_t) 0, 0, 0))
7064   {
7065     errmsg= "failed to create a cache on binlog index";
7066     goto end;
7067   }
7068   /* The file ends with EOF or empty line */
7069   if ((length=my_b_gets(index_file, fname, sizeof(fname))) <= 1)
7070   {
7071     errmsg= "empty binlog index";
7072     goto end;
7073   }
7074   else
7075   {
7076     fname[length-1]= 0;                         // Remove end \n
7077   }
7078   if (normalize_binlog_name(buf_arg, fname, false))
7079   {
7080     errmsg= "could not normalize the first file name in the binlog index";
7081     goto end;
7082   }
7083 end:
7084   mysql_bin_log.unlock_index();
7085 
7086   DBUG_RETURN(errmsg);
7087 }
7088 
7089 /**
7090   Check weather the gtid binlog state can safely remove gtid
7091   domains passed as the argument. A safety condition is satisfied when
7092   there are no events from the being deleted domains in the currently existing
7093   binlog files. Upon successful check the supplied domains are removed
7094   from @@gtid_binlog_state. The caller is supposed to rotate binlog so that
7095   the active latest file won't have the deleted domains in its Gtid_list header.
7096 
7097   @param  domain_drop_lex  gtid domain id sequence from lex.
7098                            Passed as a pointer to dynamic array must be not empty
7099                            unless pointer value NULL.
7100   @retval zero             on success
7101   @retval > 0              ineffective call none from the *non* empty
7102                            gtid domain sequence is deleted
7103   @retval < 0              on error
7104 */
do_delete_gtid_domain(DYNAMIC_ARRAY * domain_drop_lex)7105 static int do_delete_gtid_domain(DYNAMIC_ARRAY *domain_drop_lex)
7106 {
7107   int rc= 0;
7108   Gtid_list_log_event *glev= NULL;
7109   char buf[FN_REFLEN];
7110   File file;
7111   IO_CACHE cache;
7112   const char* errmsg= NULL;
7113   char errbuf[MYSQL_ERRMSG_SIZE]= {0};
7114 
7115   if (!domain_drop_lex)
7116     return 0; // still "effective" having empty domain sequence to delete
7117 
7118   DBUG_ASSERT(domain_drop_lex->elements > 0);
7119   mysql_mutex_assert_owner(mysql_bin_log.get_log_lock());
7120 
7121   if ((errmsg= get_first_binlog(buf)) != NULL)
7122     goto end;
7123   bzero((char*) &cache, sizeof(cache));
7124   if ((file= open_binlog(&cache, buf, &errmsg)) == (File) -1)
7125     goto end;
7126   errmsg= get_gtid_list_event(&cache, &glev);
7127   end_io_cache(&cache);
7128   mysql_file_close(file, MYF(MY_WME));
7129 
7130   DBUG_EXECUTE_IF("inject_binlog_delete_domain_init_error",
7131                   errmsg= "injected error";);
7132   if (errmsg)
7133     goto end;
7134   errmsg= rpl_global_gtid_binlog_state.drop_domain(domain_drop_lex,
7135                                                    glev, errbuf);
7136 
7137 end:
7138   if (errmsg)
7139   {
7140     if (strlen(errmsg) > 0)
7141     {
7142       my_error(ER_BINLOG_CANT_DELETE_GTID_DOMAIN, MYF(0), errmsg);
7143       rc= -1;
7144     }
7145     else
7146     {
7147       rc= 1;
7148     }
7149   }
7150   delete glev;
7151 
7152   return rc;
7153 }
7154 
/**
  The method is a shortcut of @c rotate() and @c purge().
  LOCK_log is acquired prior to rotate and is released after it.

  Before rotating, do_delete_gtid_domain() is run on @c domain_drop_lex;
  rotation is skipped whenever that call returns nonzero.

  @param force_rotate     caller can request the log rotation
  @param domain_drop_lex  gtid domain id sequence handed to
                          do_delete_gtid_domain(); may be NULL

  @retval
    nonzero - error in rotating routine, or error while deleting
              gtid domains.
*/
int MYSQL_BIN_LOG::rotate_and_purge(bool force_rotate,
                                    DYNAMIC_ARRAY *domain_drop_lex)
{
  int err_gtid=0, error= 0;
  ulong prev_binlog_id;
  DBUG_ENTER("MYSQL_BIN_LOG::rotate_and_purge");
  bool check_purge= false;

  mysql_mutex_lock(&LOCK_log);

  DEBUG_SYNC(current_thd, "rotate_after_acquire_LOCK_log");

  /* Remember the id of the binlog that is current before any rotation. */
  prev_binlog_id= current_binlog_id;

  if ((err_gtid= do_delete_gtid_domain(domain_drop_lex)))
  {
    // An ineffective attempt to delete merely skips rotate and purge
    if (err_gtid < 0)
      error= 1; // otherwise the error is propagated to the user
  }
  else if (unlikely((error= rotate(force_rotate, &check_purge))))
    check_purge= false;

  DEBUG_SYNC(current_thd, "rotate_after_rotate");

  /*
    NOTE: Run purge_logs without holding LOCK_log because it does not need
          the mutex. Otherwise it causes various deadlocks.
          Explicit binlog rotation must be synchronized with a concurrent
          binlog ordered commit; in particular, do not let the binlog
          checkpoint notification request proceed until earlier binlogged
          concurrent commits have been completed.

          The mutexes are handed over one at a time: each next one is taken
          before the previous one is released.
  */
  mysql_mutex_lock(&LOCK_after_binlog_sync);
  mysql_mutex_unlock(&LOCK_log);
  mysql_mutex_lock(&LOCK_commit_ordered);
  mysql_mutex_unlock(&LOCK_after_binlog_sync);
  mysql_mutex_unlock(&LOCK_commit_ordered);

  /* check_purge is only true when rotate() succeeded in rotating. */
  if (check_purge)
    checkpoint_and_purge(prev_binlog_id);

  DBUG_RETURN(error);
}
7208 
next_file_id()7209 uint MYSQL_BIN_LOG::next_file_id()
7210 {
7211   uint res;
7212   mysql_mutex_lock(&LOCK_log);
7213   res = file_id++;
7214   mysql_mutex_unlock(&LOCK_log);
7215   return res;
7216 }
7217 
7218 class CacheWriter: public Log_event_writer
7219 {
7220 public:
7221   size_t remains;
7222 
CacheWriter(THD * thd_arg,IO_CACHE * file_arg,bool do_checksum,Binlog_crypt_data * cr)7223   CacheWriter(THD *thd_arg, IO_CACHE *file_arg, bool do_checksum,
7224               Binlog_crypt_data *cr)
7225     : Log_event_writer(file_arg, 0, cr), remains(0), thd(thd_arg),
7226       first(true)
7227   { checksum_len= do_checksum ? BINLOG_CHECKSUM_LEN : 0; }
7228 
~CacheWriter()7229   ~CacheWriter()
7230   { status_var_add(thd->status_var.binlog_bytes_written, bytes_written); }
7231 
write(uchar * pos,size_t len)7232   int write(uchar* pos, size_t len)
7233   {
7234     DBUG_ENTER("CacheWriter::write");
7235     if (first)
7236       write_header(pos, len);
7237     else
7238       write_data(pos, len);
7239 
7240     remains -= len;
7241     if ((first= !remains))
7242       write_footer();
7243     DBUG_RETURN(0);
7244   }
7245 private:
7246   THD *thd;
7247   bool first;
7248 };
7249 
/*
  Write the contents of a cache to the binary log.

  SYNOPSIS
    write_cache()
    thd      Current_thread
    cache    Cache to write to the binary log

  DESCRIPTION
    Write the contents of the cache to the binary log. The cache will
    be reset as a READ_CACHE to be able to read the contents from it.

    The events are read from the cache segment by segment. For every event
    header found, the event length is increased by the checksum length
    (per @c binlog_checksum_options) and end_log_pos is rebased to the
    position in the binary log, before the data is handed to the writer.

    The caller must hold LOCK_log.

  RETURN
    0                  All OK
    ER_ERROR_ON_WRITE  The cache could not be reinitialized for reading,
                       or the writer reported a failure
*/

int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
{
  DBUG_ENTER("MYSQL_BIN_LOG::write_cache");

  mysql_mutex_assert_owner(&LOCK_log);
  if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
    DBUG_RETURN(ER_ERROR_ON_WRITE);
  /*
    length   - bytes available in the current cache segment
    group    - position in log_file where this cache starts
    carry    - number of bytes of a split event header saved in header[]
    hdr_offs - offset of the next event header relative to the segment start
  */
  size_t length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
  size_t val;
  /*
    Accumulated checksum bytes: each event header processed adds
    writer.checksum_len, and the total is added to every end_log_pos.
  */
  size_t end_log_pos_inc= 0;
  uchar header[LOG_EVENT_HEADER_LEN];
  CacheWriter writer(thd, &log_file, binlog_checksum_options, &crypto);

  if (crypto.scheme)
  {
    /* The encryption context is allocated with alloca() for this call. */
    writer.ctx= alloca(crypto.ctx_size);
    writer.set_encrypted_writer();
  }
  // while there is just one alg the following must hold:
  DBUG_ASSERT(binlog_checksum_options == BINLOG_CHECKSUM_ALG_OFF ||
              binlog_checksum_options == BINLOG_CHECKSUM_ALG_CRC32);

  /*
    The events in the buffer have incorrect end_log_pos data
    (relative to beginning of group rather than absolute),
    so we'll recalculate them in situ so the binlog is always
    correct, even in the middle of a group. This is possible
    because we now know the start position of the group (the
    offset of this cache in the log, if you will); all we need
    to do is to find all event-headers, and add the position of
    the group to the end_log_pos of each event.  This is pretty
    straight forward, except that we read the cache in segments,
    so an event-header might end up on the cache-border and get
    split.
  */

  group= (size_t)my_b_tell(&log_file);
  hdr_offs= carry= 0;

  do
  {
    /*
      if we only got a partial header in the last iteration,
      get the other half now and process a full header.
    */
    if (unlikely(carry > 0))
    {
      DBUG_ASSERT(carry < LOG_EVENT_HEADER_LEN);
      size_t tail= LOG_EVENT_HEADER_LEN - carry;

      /* assemble both halves */
      memcpy(&header[carry], (char *)cache->read_pos, tail);

      uint32 len= uint4korr(header + EVENT_LEN_OFFSET);
      /* the writer counts down the original (netto) event length */
      writer.remains= len;

      /* fix end_log_pos */
      end_log_pos_inc += writer.checksum_len;
      val= uint4korr(header + LOG_POS_OFFSET) + group + end_log_pos_inc;
      int4store(header + LOG_POS_OFFSET, val);

      /* fix len */
      len+= writer.checksum_len;
      int4store(header + EVENT_LEN_OFFSET, len);

      if (writer.write(header, LOG_EVENT_HEADER_LEN))
        DBUG_RETURN(ER_ERROR_ON_WRITE);

      /* skip the second half of the header, it has been written already */
      cache->read_pos+= tail;
      length-= tail;
      carry= 0;

      /* next event header at ... */
      hdr_offs= len - LOG_EVENT_HEADER_LEN - writer.checksum_len;
    }

    /* if there is anything to write, process it. */

    if (likely(length > 0))
    {
      DBUG_EXECUTE_IF("fail_binlog_write_1",
                      errno= 28; DBUG_RETURN(ER_ERROR_ON_WRITE););
      /*
        process all event-headers in this (partial) cache.
        if next header is beyond current read-buffer,
        we'll get it later (though not necessarily in the
        very next iteration, just "eventually").
      */

      /* no header in this segment: all of it belongs to the current event */
      if (hdr_offs >= length)
      {
        if (writer.write(cache->read_pos, length))
          DBUG_RETURN(ER_ERROR_ON_WRITE);
      }

      while (hdr_offs < length)
      {
        /*
          finish off with remains of the last event that crawls
          from previous into the current buffer
        */
        if (writer.remains != 0)
        {
          if (writer.write(cache->read_pos, hdr_offs))
            DBUG_RETURN(ER_ERROR_ON_WRITE);
        }

        /*
          partial header only? save what we can get, process once
          we get the rest.
        */
        if (hdr_offs + LOG_EVENT_HEADER_LEN > length)
        {
          carry= length - hdr_offs;
          memcpy(header, (char *)cache->read_pos + hdr_offs, carry);
          /* shrinking length to hdr_offs ends the while loop */
          length= hdr_offs;
        }
        else
        {
          /* we've got a full event-header, and it came in one piece */
          uchar *ev= (uchar *)cache->read_pos + hdr_offs;
          uint ev_len= uint4korr(ev + EVENT_LEN_OFFSET); // netto len
          uchar *log_pos= ev + LOG_POS_OFFSET;

          end_log_pos_inc += writer.checksum_len;
          /* fix end_log_pos */
          val= uint4korr(log_pos) + group + end_log_pos_inc;
          int4store(log_pos, val);

          /* fix length */
          int4store(ev + EVENT_LEN_OFFSET, ev_len + writer.checksum_len);

          writer.remains= ev_len;
          /* write the part of the event that is inside this segment */
          if (writer.write(ev, MY_MIN(ev_len, length - hdr_offs)))
            DBUG_RETURN(ER_ERROR_ON_WRITE);

          /* next event header at ... */
          hdr_offs += ev_len; // incr by the netto len

          DBUG_ASSERT(!writer.checksum_len || writer.remains == 0 || hdr_offs >= length);
        }
      }

      /*
        Adjust hdr_offs. Note that it may still point beyond the segment
        read in the next iteration; if the current event is very long,
        it may take a couple of read-iterations (and subsequent adjustments
        of hdr_offs) for it to point into the then-current segment.
        If we have a split header (carry != 0), hdr_offs will be set at the
        beginning of the next iteration, overwriting the value we set here:
      */
      hdr_offs -= length;
    }
  } while ((length= my_b_fill(cache)));

  DBUG_ASSERT(carry == 0);
  DBUG_ASSERT(!writer.checksum_len || writer.remains == 0);

  DBUG_RETURN(0);                               // All OK
}
7427 
7428 /*
7429   Helper function to get the error code of the query to be binlogged.
7430  */
query_error_code(THD * thd,bool not_killed)7431 int query_error_code(THD *thd, bool not_killed)
7432 {
7433   int error;
7434 
7435   if (not_killed || (killed_mask_hard(thd->killed) == KILL_BAD_DATA))
7436   {
7437     error= thd->is_error() ? thd->get_stmt_da()->sql_errno() : 0;
7438     if (!error)
7439       return error;
7440 
7441     /* thd->get_get_stmt_da()->sql_errno() might be ER_SERVER_SHUTDOWN or
7442        ER_QUERY_INTERRUPTED, So here we need to make sure that error
7443        is not set to these errors when specified not_killed by the
7444        caller.
7445     */
7446     if (error == ER_SERVER_SHUTDOWN || error == ER_QUERY_INTERRUPTED ||
7447         error == ER_NEW_ABORTING_CONNECTION || error == ER_CONNECTION_KILLED)
7448       error= 0;
7449   }
7450   else
7451   {
7452     /* killed status for DELAYED INSERT thread should never be used */
7453     DBUG_ASSERT(!(thd->system_thread & SYSTEM_THREAD_DELAYED_INSERT));
7454     error= thd->killed_errno();
7455   }
7456 
7457   return error;
7458 }
7459 
7460 
write_incident_already_locked(THD * thd)7461 bool MYSQL_BIN_LOG::write_incident_already_locked(THD *thd)
7462 {
7463   uint error= 0;
7464   DBUG_ENTER("MYSQL_BIN_LOG::write_incident_already_locked");
7465   Incident incident= INCIDENT_LOST_EVENTS;
7466   Incident_log_event ev(thd, incident, &write_error_msg);
7467 
7468   if (likely(is_open()))
7469   {
7470     error= write_event(&ev);
7471     status_var_add(thd->status_var.binlog_bytes_written, ev.data_written);
7472   }
7473 
7474   DBUG_RETURN(error);
7475 }
7476 
7477 
write_incident(THD * thd)7478 bool MYSQL_BIN_LOG::write_incident(THD *thd)
7479 {
7480   uint error= 0;
7481   my_off_t offset;
7482   bool check_purge= false;
7483   ulong prev_binlog_id;
7484   DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
7485 
7486   mysql_mutex_lock(&LOCK_log);
7487   if (likely(is_open()))
7488   {
7489     prev_binlog_id= current_binlog_id;
7490     if (likely(!(error= write_incident_already_locked(thd))) &&
7491         likely(!(error= flush_and_sync(0))))
7492     {
7493       update_binlog_end_pos();
7494       if (unlikely((error= rotate(false, &check_purge))))
7495         check_purge= false;
7496     }
7497 
7498     offset= my_b_tell(&log_file);
7499 
7500     update_binlog_end_pos(offset);
7501 
7502     /*
7503       Take mutex to protect against a reader seeing partial writes of 64-bit
7504       offset on 32-bit CPUs.
7505     */
7506     mysql_mutex_lock(&LOCK_commit_ordered);
7507     last_commit_pos_offset= offset;
7508     mysql_mutex_unlock(&LOCK_commit_ordered);
7509     mysql_mutex_unlock(&LOCK_log);
7510 
7511     if (check_purge)
7512       checkpoint_and_purge(prev_binlog_id);
7513   }
7514   else
7515   {
7516     mysql_mutex_unlock(&LOCK_log);
7517   }
7518 
7519   DBUG_RETURN(error);
7520 }
7521 
7522 void
write_binlog_checkpoint_event_already_locked(const char * name_arg,uint len)7523 MYSQL_BIN_LOG::write_binlog_checkpoint_event_already_locked(const char *name_arg, uint len)
7524 {
7525   my_off_t offset;
7526   Binlog_checkpoint_log_event ev(name_arg, len);
7527   /*
7528     Note that we must sync the binlog checkpoint to disk.
7529     Otherwise a subsequent log purge could delete binlogs that XA recovery
7530     thinks are needed (even though they are not really).
7531   */
7532   if (!write_event(&ev) && !flush_and_sync(0))
7533   {
7534     update_binlog_end_pos();
7535   }
7536   else
7537   {
7538     /*
7539       If we fail to write the checkpoint event, something is probably really
7540       bad with the binlog. We complain in the error log.
7541 
7542       Note that failure to write binlog checkpoint does not compromise the
7543       ability to do crash recovery - crash recovery will just have to scan a
7544       bit more of the binlog than strictly necessary.
7545     */
7546     sql_print_error("Failed to write binlog checkpoint event to binary log");
7547   }
7548 
7549   offset= my_b_tell(&log_file);
7550 
7551   update_binlog_end_pos(offset);
7552 
7553   /*
7554     Take mutex to protect against a reader seeing partial writes of 64-bit
7555     offset on 32-bit CPUs.
7556   */
7557   mysql_mutex_lock(&LOCK_commit_ordered);
7558   last_commit_pos_offset= offset;
7559   mysql_mutex_unlock(&LOCK_commit_ordered);
7560 }
7561 
7562 
7563 /**
7564   Write a cached log entry to the binary log.
7565   - To support transaction over replication, we wrap the transaction
7566   with BEGIN/COMMIT or BEGIN/ROLLBACK in the binary log.
7567   We want to write a BEGIN/ROLLBACK block when a non-transactional table
7568   was updated in a transaction which was rolled back. This is to ensure
7569   that the same updates are run on the slave.
7570 
7571   @param thd
7572   @param cache		The cache to copy to the binlog
7573   @param commit_event   The commit event to print after writing the
7574                         contents of the cache.
7575   @param incident       Defines if an incident event should be created to
7576                         notify that some non-transactional changes did
7577                         not get into the binlog.
7578 
7579   @note
7580     We only come here if there is something in the cache.
7581   @note
7582     The thing in the cache is always a complete transaction.
7583   @note
7584     'cache' needs to be reinitialized after this functions returns.
7585 */
7586 
7587 bool
write_transaction_to_binlog(THD * thd,binlog_cache_mngr * cache_mngr,Log_event * end_ev,bool all,bool using_stmt_cache,bool using_trx_cache)7588 MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
7589                                            binlog_cache_mngr *cache_mngr,
7590                                            Log_event *end_ev, bool all,
7591                                            bool using_stmt_cache,
7592                                            bool using_trx_cache)
7593 {
7594   group_commit_entry entry;
7595   Ha_trx_info *ha_info;
7596   DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
7597 
7598   /*
7599     Control should not be allowed beyond this point in wsrep_emulate_bin_log
7600     mode. Also, do not write the cached updates to binlog if binary logging is
7601     disabled (log-bin/sql_log_bin).
7602   */
7603   if (wsrep_emulate_bin_log)
7604   {
7605     DBUG_RETURN(0);
7606   }
7607   else if (!(thd->variables.option_bits & OPTION_BIN_LOG))
7608   {
7609     cache_mngr->need_unlog= false;
7610     DBUG_RETURN(0);
7611   }
7612 
7613   entry.thd= thd;
7614   entry.cache_mngr= cache_mngr;
7615   entry.error= 0;
7616   entry.all= all;
7617   entry.using_stmt_cache= using_stmt_cache;
7618   entry.using_trx_cache= using_trx_cache;
7619   entry.need_unlog= is_preparing_xa(thd);
7620   ha_info= all ? thd->transaction->all.ha_list : thd->transaction->stmt.ha_list;
7621 
7622   for (; !entry.need_unlog && ha_info; ha_info= ha_info->next())
7623   {
7624     if (ha_info->is_started() && ha_info->ht() != binlog_hton &&
7625         !ha_info->ht()->commit_checkpoint_request)
7626       entry.need_unlog= true;
7627     break;
7628   }
7629 
7630   entry.end_event= end_ev;
7631   if (cache_mngr->stmt_cache.has_incident() ||
7632       cache_mngr->trx_cache.has_incident())
7633   {
7634     Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, &write_error_msg);
7635     entry.incident_event= &inc_ev;
7636     DBUG_RETURN(write_transaction_to_binlog_events(&entry));
7637   }
7638   else
7639   {
7640     entry.incident_event= NULL;
7641     DBUG_RETURN(write_transaction_to_binlog_events(&entry));
7642   }
7643 }
7644 
7645 
7646 /*
7647   Put a transaction that is ready to commit in the group commit queue.
7648   The transaction is identified by the ENTRY object passed into this function.
7649 
7650   To facilitate group commit for the binlog, we first queue up ourselves in
7651   this function. Then later the first thread to enter the queue waits for
7652   the LOCK_log mutex, and commits for everyone in the queue once it gets the
7653   lock. Any other threads in the queue just wait for the first one to finish
7654   the commit and wake them up. This way, all transactions in the queue get
7655   committed in a single disk operation.
7656 
7657   The main work in this function is when the commit in one transaction has
7658   been marked to wait for the commit of another transaction to happen
7659   first. This is used to support in-order parallel replication, where
7660   transactions can execute out-of-order but need to be committed in-order with
7661   how they happened on the master. The waiting of one commit on another needs
7662   to be integrated with the group commit queue, to ensure that the waiting
7663   transaction can participate in the same group commit as the waited-for
7664   transaction.
7665 
7666   So when we put a transaction in the queue, we check if there were other
7667   transactions already prepared to commit but just waiting for the first one
7668   to commit. If so, we add those to the queue as well, transitively for all
7669   waiters.
7670 
7671   And if a transaction is marked to wait for a prior transaction, but that
7672   prior transaction is already queued for group commit, then we can queue the
7673   new transaction directly to participate in the group commit.
7674 
7675   @retval < 0   Error
7676   @retval  -2   WSREP error with commit ordering
7677   @retval  -3   WSREP return code to mark the leader
7678   @retval > 0   If queued as the first entry in the queue (meaning this
7679                 is the leader)
7680   @retval   0   Otherwise (queued as participant, leader handles the commit)
7681 */
7682 
7683 int
queue_for_group_commit(group_commit_entry * orig_entry)7684 MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
7685 {
7686   group_commit_entry *entry, *orig_queue, *last;
7687   wait_for_commit *cur;
7688   wait_for_commit *wfc;
7689   bool backup_lock_released= 0;
7690   int result= 0;
7691   THD *thd= orig_entry->thd;
7692   DBUG_ENTER("MYSQL_BIN_LOG::queue_for_group_commit");
7693   DBUG_ASSERT(thd == current_thd);
7694 
7695   /*
7696     Check if we need to wait for another transaction to commit before us.
7697 
7698     It is safe to do a quick check without lock first in the case where we do
7699     not have to wait. But if the quick check shows we need to wait, we must do
7700     another safe check under lock, to avoid the race where the other
7701     transaction wakes us up between the check and the wait.
7702   */
7703   wfc= orig_entry->thd->wait_for_commit_ptr;
7704   orig_entry->queued_by_other= false;
7705   if (wfc && wfc->waitee.load(std::memory_order_acquire))
7706   {
7707     wait_for_commit *loc_waitee;
7708 
7709     mysql_mutex_lock(&wfc->LOCK_wait_commit);
7710     /*
7711       Do an extra check here, this time safely under lock.
7712 
7713       If waitee->commit_started is set, it means that the transaction we need
7714       to wait for has already queued up for group commit. In this case it is
7715       safe for us to queue up immediately as well, increasing the opprtunities
7716       for group commit. Because waitee has taken the LOCK_prepare_ordered
7717       before setting the flag, so there is no risk that we can queue ahead of
7718       it.
7719     */
7720     if ((loc_waitee= wfc->waitee.load(std::memory_order_relaxed)) &&
7721         !loc_waitee->commit_started)
7722     {
7723       PSI_stage_info old_stage;
7724 
7725         /*
7726           Release MDL_BACKUP_COMMIT LOCK while waiting for other threads to
7727           commit.
7728           This is needed to avoid deadlock between the other threads (which not
7729           yet have the MDL_BACKUP_COMMIT_LOCK) and any threads using
7730           BACKUP LOCK BLOCK_COMMIT.
7731         */
7732       if (thd->backup_commit_lock && thd->backup_commit_lock->ticket &&
7733           !backup_lock_released)
7734       {
7735         backup_lock_released= 1;
7736         thd->mdl_context.release_lock(thd->backup_commit_lock->ticket);
7737         thd->backup_commit_lock->ticket= 0;
7738       }
7739 
7740       /*
7741         By setting wfc->opaque_pointer to our own entry, we mark that we are
7742         ready to commit, but waiting for another transaction to commit before
7743         us.
7744 
7745         This other transaction may then take over the commit process for us to
7746         get us included in its own group commit. If this happens, the
7747         queued_by_other flag is set.
7748 
7749         Setting this flag may or may not be seen by the other thread, but we
7750         are safe in any case: The other thread will set queued_by_other under
7751         its LOCK_wait_commit, and we will not check queued_by_other only after
7752         we have been woken up.
7753       */
7754       wfc->opaque_pointer= orig_entry;
7755       DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior");
7756       orig_entry->thd->ENTER_COND(&wfc->COND_wait_commit,
7757                                   &wfc->LOCK_wait_commit,
7758                                   &stage_waiting_for_prior_transaction_to_commit,
7759                                   &old_stage);
7760       while ((loc_waitee= wfc->waitee.load(std::memory_order_relaxed)) &&
7761               !orig_entry->thd->check_killed(1))
7762         mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit);
7763       wfc->opaque_pointer= NULL;
7764       DBUG_PRINT("info", ("After waiting for prior commit, queued_by_other=%d",
7765                  orig_entry->queued_by_other));
7766 
7767       if (loc_waitee)
7768       {
7769         /* Wait terminated due to kill. */
7770         mysql_mutex_lock(&loc_waitee->LOCK_wait_commit);
7771         if (loc_waitee->wakeup_subsequent_commits_running ||
7772             orig_entry->queued_by_other)
7773         {
7774           /* Our waitee is already waking us up, so ignore the kill. */
7775           mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit);
7776           do
7777           {
7778             mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit);
7779           } while (wfc->waitee.load(std::memory_order_relaxed));
7780         }
7781         else
7782         {
7783           /* We were killed, so remove us from the list of waitee. */
7784           wfc->remove_from_list(&loc_waitee->subsequent_commits_list);
7785           mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit);
7786           /*
7787             This is the thread clearing its own status, it is no longer on
7788             the list of waiters. So no memory barriers are needed here.
7789           */
7790           wfc->waitee.store(NULL, std::memory_order_relaxed);
7791 
7792           orig_entry->thd->EXIT_COND(&old_stage);
7793           /* Interrupted by kill. */
7794           DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior_killed");
7795           wfc->wakeup_error= orig_entry->thd->killed_errno();
7796           if (!wfc->wakeup_error)
7797             wfc->wakeup_error= ER_QUERY_INTERRUPTED;
7798           my_message(wfc->wakeup_error,
7799                      ER_THD(orig_entry->thd, wfc->wakeup_error), MYF(0));
7800           result= -1;
7801           goto end;
7802         }
7803       }
7804       orig_entry->thd->EXIT_COND(&old_stage);
7805     }
7806     else
7807       mysql_mutex_unlock(&wfc->LOCK_wait_commit);
7808   }
7809   /*
7810     If the transaction we were waiting for has already put us into the group
7811     commit queue (and possibly already done the entire binlog commit for us),
7812     then there is nothing else to do.
7813   */
7814   if (orig_entry->queued_by_other)
7815     goto end;
7816 
7817   if (wfc && wfc->wakeup_error)
7818   {
7819     my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
7820     result= -1;
7821     goto end;
7822   }
7823 
7824   /* Now enqueue ourselves in the group commit queue. */
7825   DEBUG_SYNC(orig_entry->thd, "commit_before_enqueue");
7826   orig_entry->thd->clear_wakeup_ready();
7827   mysql_mutex_lock(&LOCK_prepare_ordered);
7828   orig_queue= group_commit_queue;
7829 
7830   /*
7831     Iteratively process everything added to the queue, looking for waiters,
7832     and their waiters, and so on. If a waiter is ready to commit, we
7833     immediately add it to the queue, and mark it as queued_by_other.
7834 
7835     This would be natural to do with recursion, but we want to avoid
7836     potentially unbounded recursion blowing the C stack, so we use the list
7837     approach instead.
7838 
7839     We keep a list of the group_commit_entry of all the waiters that need to
7840     be processed. Initially this list contains only the entry passed into this
7841     function.
7842 
7843     We process entries in the list one by one. The element currently being
7844     processed is pointed to by `entry`, and the element at the end of the list
7845     is pointed to by `last` (we do not use NULL to terminate the list).
7846 
7847     As we process an entry, any waiters for that entry are added at the end of
    the list, to be processed in subsequent iterations. Then the entry is added
7849     to the group_commit_queue.  This continues until the list is exhausted,
7850     with all entries ever added eventually processed.
7851 
    The end result is a breadth-first traversal of the tree of waiters,
7853     re-using the `next' pointers of the group_commit_entry objects in place of
7854     extra stack space in a recursive traversal.
7855 
7856     The temporary list linked through these `next' pointers is not used by the
7857     caller or any other function; it only exists while doing the iterative
7858     tree traversal. After, all the processed entries are linked into the
7859     group_commit_queue.
7860   */
7861 
7862   cur= wfc;
7863   last= orig_entry;
7864   entry= orig_entry;
7865   for (;;)
7866   {
7867     group_commit_entry *next_entry;
7868 
7869     if (entry->cache_mngr->using_xa)
7870     {
7871       DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered");
7872       run_prepare_ordered(entry->thd, entry->all);
7873       DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
7874     }
7875 
7876     if (cur)
7877     {
7878       /*
7879         Now that we have taken LOCK_prepare_ordered and will queue up in the
7880         group commit queue, it is safe for following transactions to queue
7881         themselves. We will grab here any transaction that is now ready to
7882         queue up, but after that, more transactions may become ready while the
7883         leader is waiting to start the group commit. So set the flag
7884         `commit_started', so that later transactions can still participate in
        the group commit.
7886       */
7887       cur->commit_started= true;
7888 
7889       /*
7890         Check if this transaction has other transaction waiting for it to
7891         commit.
7892 
7893         If so, process the waiting transactions, and their waiters and so on,
7894         transitively.
7895       */
7896       if (cur->subsequent_commits_list)
7897       {
7898         wait_for_commit *waiter, **waiter_ptr;
7899 
7900         mysql_mutex_lock(&cur->LOCK_wait_commit);
7901         /*
7902           Grab the list, now safely under lock, and process it if still
7903           non-empty.
7904         */
7905         waiter= cur->subsequent_commits_list;
7906         waiter_ptr= &cur->subsequent_commits_list;
7907         while (waiter)
7908         {
7909           wait_for_commit *next_waiter= waiter->next_subsequent_commit;
7910           group_commit_entry *entry2=
7911             (group_commit_entry *)waiter->opaque_pointer;
7912           if (entry2)
7913           {
7914             /*
7915               This is another transaction ready to be written to the binary
7916               log. We can put it into the queue directly, without needing a
7917               separate context switch to the other thread. We just set a flag
7918               so that the other thread will know when it wakes up that it was
7919               already processed.
7920 
7921               So remove it from the list of our waiters, and instead put it at
7922               the end of the list to be processed in a subsequent iteration of
7923               the outer loop.
7924             */
7925             *waiter_ptr= next_waiter;
7926             entry2->queued_by_other= true;
7927             last->next= entry2;
7928             last= entry2;
7929             /*
7930               As a small optimisation, we do not actually need to set
7931               entry2->next to NULL, as we can use the pointer `last' to check
7932               for end-of-list.
7933             */
7934           }
7935           else
7936           {
7937             /*
7938               This transaction is not ready to participate in the group commit
7939               yet, so leave it in the waiter list. It might join the group
7940               commit later, if it completes soon enough to do so (it will see
7941               our wfc->commit_started flag set), or it might commit later in a
7942               later group commit.
7943             */
7944             waiter_ptr= &waiter->next_subsequent_commit;
7945           }
7946           waiter= next_waiter;
7947         }
7948         mysql_mutex_unlock(&cur->LOCK_wait_commit);
7949       }
7950     }
7951 
7952     /*
7953       Handle the heuristics that if another transaction is waiting for this
7954       transaction (or if it does so later), then we want to trigger group
7955       commit immediately, without waiting for the binlog_commit_wait_usec
7956       timeout to expire.
7957     */
7958     entry->thd->waiting_on_group_commit= true;
7959 
7960     /* Add the entry to the group commit queue. */
7961     next_entry= entry->next;
7962     entry->next= group_commit_queue;
7963     group_commit_queue= entry;
7964     if (entry == last)
7965       break;
7966     /*
7967       Move to the next entry in the flattened list of waiting transactions
7968       that still need to be processed transitively.
7969     */
7970     entry= next_entry;
7971     DBUG_ASSERT(entry != NULL);
7972     cur= entry->thd->wait_for_commit_ptr;
7973   }
7974 
7975   result= orig_queue == NULL;
7976 
7977 #ifdef WITH_WSREP
7978   if (wsrep_is_active(entry->thd) &&
7979       wsrep_run_commit_hook(entry->thd, entry->all))
7980   {
7981     /*  Release commit order here */
7982     if (wsrep_ordered_commit(entry->thd, entry->all))
7983       result= -2;
7984 
7985     /* return -3, if this is leader */
7986     if (orig_queue == NULL)
7987       result= -3;
7988   }
7989   else
7990     DBUG_ASSERT(result != -2 && result != -3);
7991 #endif /* WITH_WSREP */
7992 
7993   if (opt_binlog_commit_wait_count > 0 && orig_queue != NULL)
7994     mysql_cond_signal(&COND_prepare_ordered);
7995   mysql_mutex_unlock(&LOCK_prepare_ordered);
7996   DEBUG_SYNC(orig_entry->thd, "commit_after_release_LOCK_prepare_ordered");
7997 
7998   DBUG_PRINT("info", ("Queued for group commit as %s",
7999                       (orig_queue == NULL) ? "leader" : "participant"));
8000 
8001 end:
8002   if (backup_lock_released)
8003     thd->mdl_context.acquire_lock(thd->backup_commit_lock,
8004                                   thd->variables.lock_wait_timeout);
8005   DBUG_RETURN(result);
8006 }
8007 
/*
  Queue a transaction for binlog group commit and wait until it has been
  processed, either by this thread acting as group commit leader or by
  another thread.

  The value returned by queue_for_group_commit() decides the role of the
  calling thread: a negative value is an error, a non-zero value means this
  thread runs trx_group_commit_leader(), and zero means it only participates.
  (With WSREP, the special values -2 and -3 are translated first: -2 becomes
  an error return and -3 becomes "leader" after waiting for the prior commit.)

  When opt_optimize_thread_scheduling is off, every thread of the group runs
  its own commit_ordered() step in this function, one thread after the other,
  each handing over to the entry found in entry->next.

  @param entry  Group commit entry of the transaction to write. The fields
                entry->error, entry->commit_errno and entry->error_cache are
                read here to report an error in the context of this thread.

  @return The result of entry->thd->wait_for_prior_commit() when entry->error
          is zero. True if queueing failed or if entry->error is set.
*/
bool
MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
{
  int is_leader= queue_for_group_commit(entry);
#ifdef WITH_WSREP
  /* commit order was released in queue_for_group_commit() call,
     here we check if wsrep_commit_ordered() failed or if we are leader */
  switch (is_leader)
  {
  case -2: /* wsrep_ordered_commit() has failed */
    DBUG_ASSERT(wsrep_is_active(entry->thd));
    DBUG_ASSERT(wsrep_run_commit_hook(entry->thd, entry->all));
    entry->thd->wakeup_subsequent_commits(1);
    return true;
  case -3: /* this is leader, wait for prior commit to
              complete. This establishes total order for group leaders
           */
    DBUG_ASSERT(wsrep_is_active(entry->thd));
    DBUG_ASSERT(wsrep_run_commit_hook(entry->thd, entry->all));
    if (entry->thd->wait_for_prior_commit())
      return true;

    /* retain the correct is_leader value */
    is_leader= 1;
    break;

  default: /* native MariaDB cases */
    break;
  }
#endif /* WITH_WSREP */

  /*
    The first in the queue handles group commit for all; the others just wait
    to be signalled when group commit is done.
  */
  if (is_leader < 0)
    return true;                                /* Error */
  else if (is_leader)
    trx_group_commit_leader(entry);
  else if (!entry->queued_by_other)
  {
    DEBUG_SYNC(entry->thd, "after_semisync_queue");

    entry->thd->wait_for_wakeup_ready();
  }
  else
  {
    /*
      If we were queued by another prior commit, then we are woken up
      only when the leader has already completed the commit for us.
      So nothing to do here then.
    */
  }

  if (!opt_optimize_thread_scheduling)
  {
    /* For the leader, trx_group_commit_leader() already took the lock. */
    if (!is_leader)
      mysql_mutex_lock(&LOCK_commit_ordered);

    DEBUG_SYNC(entry->thd, "commit_loop_entry_commit_ordered");
    ++num_commits;
    if (entry->cache_mngr->using_xa && !entry->error)
      run_commit_ordered(entry->thd, entry->all);

    /* Fetch the successor before LOCK_commit_ordered is released below. */
    group_commit_entry *next= entry->next;
    if (!next)
    {
      /*
        This was the last entry of the group: clear the busy flag that
        trx_group_commit_leader() set, and signal COND_queue_busy on which
        the leader of a following group waits.
      */
      group_commit_queue_busy= FALSE;
      mysql_cond_signal(&COND_queue_busy);
      DEBUG_SYNC(entry->thd, "commit_after_group_run_commit_ordered");
    }
    mysql_mutex_unlock(&LOCK_commit_ordered);
    entry->thd->wakeup_subsequent_commits(entry->error);

    if (next)
    {
      /*
        Wake up the next thread in the group commit.

        The next thread can be waiting in two different ways, depending on
        whether it put itself in the queue, or if it was put in queue by us
        because it had to wait for us to commit first.

        So execute the appropriate wakeup, identified by the queued_by_other
        field.
      */
      if (next->queued_by_other)
        next->thd->wait_for_commit_ptr->wakeup(entry->error);
      else
        next->thd->signal_wakeup_ready();
    }
    else
    {
      /*
        If we rotated the binlog, and if we are using the unoptimized thread
        scheduling where every thread runs its own commit_ordered(), then we
        must do the commit checkpoint and log purge here, after all
        commit_ordered() calls have finished, and locks have been released.
      */
      if (entry->check_purge)
        checkpoint_and_purge(entry->binlog_id);
    }

  }

  /* Common case: no error was recorded for this entry. */
  if (likely(!entry->error))
    return entry->thd->wait_for_prior_commit();

  /*
    The error was recorded in the entry, possibly by another thread; report
    it here, in the thread that owns the transaction.
  */
  switch (entry->error)
  {
  case ER_ERROR_ON_WRITE:
    my_error(ER_ERROR_ON_WRITE, MYF(ME_ERROR_LOG), name, entry->commit_errno);
    break;
  case ER_ERROR_ON_READ:
    my_error(ER_ERROR_ON_READ, MYF(ME_ERROR_LOG),
             entry->error_cache->file_name, entry->commit_errno);
    break;
  default:
    /*
      There are not (and should not be) any errors thrown not covered above.
      But just in case one is added later without updating the above switch
      statement, include a catch-all.
    */
    my_printf_error(entry->error,
                    "Error writing transaction to binary log: %d",
                    MYF(ME_ERROR_LOG), entry->error);
  }

  /*
    Since we return error, this transaction XID will not be committed, so
    we need to mark it as not needed for recovery (unlog() is not called
    for a transaction if log_xid() fails).
  */
  if (entry->cache_mngr->using_xa && entry->cache_mngr->xa_xid &&
      entry->cache_mngr->need_unlog)
    mark_xid_done(entry->cache_mngr->binlog_id, true);

  return 1;
}
8148 
/*
  Do binlog group commit as the lead thread.

  This must be called when this statement/transaction is queued at the start of
  the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
  commit all the transactions in the queue (more may have entered while waiting
  for LOCK_log). After commit is done, all other threads in the queue will be
  signalled.

  The mutexes are handed over as a chain: LOCK_log is released only after
  LOCK_after_binlog_sync has been acquired, and LOCK_after_binlog_sync is
  released only after LOCK_commit_ordered has been acquired.

  When opt_optimize_thread_scheduling is off, this function returns early
  with LOCK_commit_ordered still locked and group_commit_queue_busy set, and
  it does not wake up the other entries of the queue itself.

  @param leader  Group commit entry of the calling thread. It is asserted to
                 be the first entry of the queue once the queue has been
                 reversed into commit order.
 */
void
MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
{
  uint xid_count= 0;
  my_off_t UNINIT_VAR(commit_offset);
  group_commit_entry *current, *last_in_queue;
  group_commit_entry *queue= NULL;
  bool check_purge= false;
  ulong UNINIT_VAR(binlog_id);
  uint64 commit_id;
  DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");

  {
    DBUG_EXECUTE_IF("inject_binlog_commit_before_get_LOCK_log",
      DBUG_ASSERT(!debug_sync_set_action(leader->thd, STRING_WITH_LEN
        ("commit_before_get_LOCK_log SIGNAL waiting WAIT_FOR cont TIMEOUT 1")));
    );
    /*
      Lock the LOCK_log(), and once we get it, collect any additional writes
      that queued up while we were waiting.
    */
    DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_log");
    mysql_mutex_lock(&LOCK_log);
    DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");

    mysql_mutex_lock(&LOCK_prepare_ordered);
    if (opt_binlog_commit_wait_count)
      wait_for_sufficient_commits();
    /*
      Note that wait_for_sufficient_commits() may have released and
      re-acquired the LOCK_log and LOCK_prepare_ordered if it needed to wait.
    */
    /* Detach the whole queue; later arrivals start a new queue. */
    current= group_commit_queue;
    group_commit_queue= NULL;
    mysql_mutex_unlock(&LOCK_prepare_ordered);
    binlog_id= current_binlog_id;

    /* As the queue is in reverse order of entering, reverse it. */
    last_in_queue= current;
    while (current)
    {
      group_commit_entry *next= current->next;
      /*
        Now that group commit is started, we can clear the flag; there is no
        longer any use in waiters on this commit trying to trigger it early.
      */
      current->thd->waiting_on_group_commit= false;
      current->next= queue;
      queue= current;
      current= next;
    }
    DBUG_ASSERT(leader == queue /* the leader should be first in queue */);

    /* Now we have in queue the list of transactions to be committed in order. */
  }

  DBUG_ASSERT(is_open());
  if (likely(is_open()))                       // Should always be true
  {
    /*
      A group that consists of the leader alone gets commit_id 0; otherwise
      the query_id of the leader is used as commit_id for the whole group.
    */
    commit_id= (last_in_queue == leader ? 0 : (uint64)leader->thd->query_id);
    DBUG_EXECUTE_IF("binlog_force_commit_id",
      {
        const LEX_CSTRING commit_name= { STRING_WITH_LEN("commit_id") };
        bool null_value;
        user_var_entry *entry=
          (user_var_entry*) my_hash_search(&leader->thd->user_vars,
                                           (uchar*) commit_name.str,
                                           commit_name.length);
        commit_id= entry->val_int(&null_value);
      });
    /*
      Commit every transaction in the queue.

      Note that we are doing this in a different thread than the one running
      the transaction! So we are limited in the operations we can do. In
      particular, we cannot call my_error() on behalf of a transaction, as
      that obtains the THD from thread local storage. Instead, we must set
      current->error and let the thread do the error reporting itself once
      we wake it up.
    */
    for (current= queue; current != NULL; current= current->next)
    {
      set_current_thd(current->thd);
      binlog_cache_mngr *cache_mngr= current->cache_mngr;

      /*
        We already checked before that at least one cache is non-empty; if both
        are empty we would have skipped calling into here.
      */
      DBUG_ASSERT(!cache_mngr->stmt_cache.empty() ||
                  !cache_mngr->trx_cache.empty()  ||
                  current->thd->transaction->xid_state.is_explicit_XA());

      /* Save errno right away, for the owning thread to report later. */
      if (unlikely((current->error= write_transaction_or_stmt(current,
                                                              commit_id))))
        current->commit_errno= errno;

      /* Record the binlog file name and write position after this entry. */
      strmake_buf(cache_mngr->last_commit_pos_file, log_file_name);
      commit_offset= my_b_write_tell(&log_file);
      cache_mngr->last_commit_pos_offset= commit_offset;
      if ((cache_mngr->using_xa && cache_mngr->xa_xid) || current->need_unlog)
      {
        /*
          If all storage engines support commit_checkpoint_request(), then we
          do not need to keep track of when this XID is durably committed.
          Instead we will just ask the storage engine to durably commit all its
          XIDs when we rotate a binlog file.
        */
        if (current->need_unlog)
        {
          xid_count++;
          cache_mngr->need_unlog= true;
          cache_mngr->binlog_id= binlog_id;
        }
        else
          cache_mngr->need_unlog= false;

        cache_mngr->delayed_error= false;
      }
    }
    /* Switch the current THD back to the leader after the loop above. */
    set_current_thd(leader->thd);

    bool synced= 0;
    if (unlikely(flush_and_sync(&synced)))
    {
      /* Flush failed: mark every entry that has no error recorded yet. */
      for (current= queue; current != NULL; current= current->next)
      {
        if (!current->error)
        {
          current->error= ER_ERROR_ON_WRITE;
          current->commit_errno= errno;
          current->error_cache= NULL;
        }
      }
    }
    else
    {
      bool any_error= false;

      mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
      mysql_mutex_assert_owner(&LOCK_log);
      mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
      mysql_mutex_assert_not_owner(&LOCK_commit_ordered);

      for (current= queue; current != NULL; current= current->next)
      {
#ifdef HAVE_REPLICATION
        if (likely(!current->error) &&
            unlikely(repl_semisync_master.
                     report_binlog_update(current->thd,
                                          current->cache_mngr->
                                          last_commit_pos_file,
                                          current->cache_mngr->
                                          last_commit_pos_offset)))
        {
          current->error= ER_ERROR_ON_WRITE;
          current->commit_errno= -1;
          current->error_cache= NULL;
          any_error= true;
        }
#endif
      }

      /*
        update binlog_end_pos so it can be read by dump thread
        Note: must be _after_ the RUN_HOOK(after_flush) or else
        semi-sync might not have put the transaction into
        its list before dump-thread tries to send it
      */
      update_binlog_end_pos(commit_offset);

      if (unlikely(any_error))
        sql_print_error("Failed to run 'after_flush' hooks");
    }

    /*
      If any commit_events are Xid_log_event, increase the number of pending
      XIDs in current binlog (it's decreased in ::unlog()). When the count in
      a (not active) binlog file reaches zero, we know that it is no longer
      needed in XA recovery, and we can log a new binlog checkpoint event.
    */
    if (xid_count > 0)
    {
      mark_xids_active(binlog_id, xid_count);
    }

    if (rotate(false, &check_purge))
    {
      /*
        If we fail to rotate, which thread should get the error?
        We give the error to the leader, as any my_error() thrown inside
        rotate() will have been registered for the leader THD.

        However we must not return error from here - that would cause
        ha_commit_trans() to abort and rollback the transaction, which would
        leave an inconsistent state with the transaction committed in the
        binlog but rolled back in the engine.

        Instead set a flag so that we can return error later, from unlog(),
        when the transaction has been safely committed in the engine.
      */
      leader->cache_mngr->delayed_error= true;
      my_error(ER_ERROR_ON_WRITE, MYF(ME_ERROR_LOG), name, errno);
      check_purge= false;
    }
    /* In case of binlog rotate, update the correct current binlog offset. */
    commit_offset= my_b_write_tell(&log_file);
  }

  DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_after_binlog_sync");
  mysql_mutex_lock(&LOCK_after_binlog_sync);
  /*
    We cannot unlock LOCK_log until we have locked LOCK_after_binlog_sync;
    otherwise scheduling could allow the next group commit to run ahead of us,
    messing up the order of commit_ordered() calls. But as soon as
    LOCK_after_binlog_sync is obtained, we can let the next group commit start.
  */
  mysql_mutex_unlock(&LOCK_log);

  DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");

  /*
    Loop through threads and run the binlog_sync hook
  */
  {
    mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
    mysql_mutex_assert_not_owner(&LOCK_log);
    mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
    mysql_mutex_assert_not_owner(&LOCK_commit_ordered);

    /* `first' and `last' are assigned in the loop but not read in it. */
    bool first __attribute__((unused))= true;
    bool last __attribute__((unused));
    for (current= queue; current != NULL; current= current->next)
    {
      last= current->next == NULL;
#ifdef HAVE_REPLICATION
      if (likely(!current->error))
        current->error=
          repl_semisync_master.wait_after_sync(current->cache_mngr->
                                               last_commit_pos_file,
                                               current->cache_mngr->
                                               last_commit_pos_offset);
#endif
      first= false;
    }
  }

  DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");

  mysql_mutex_lock(&LOCK_commit_ordered);
  DBUG_EXECUTE_IF("crash_before_engine_commit",
      {
        DBUG_SUICIDE();
      });
  last_commit_pos_offset= commit_offset;

  /*
    Unlock LOCK_after_binlog_sync only *after* LOCK_commit_ordered has been
    acquired so that groups can not reorder for the different stages of
    the group commit procedure.
  */
  mysql_mutex_unlock(&LOCK_after_binlog_sync);
  DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_after_binlog_sync");
  ++num_group_commits;

  if (!opt_optimize_thread_scheduling)
  {
    /*
      If we want to run commit_ordered() each in the transaction's own thread
      context, then we need to mark the queue reserved; we need to finish all
      threads in one group commit before the next group commit can be allowed
      to proceed, and we cannot unlock a simple pthreads mutex in a different
      thread from the one that locked it.
    */

    while (group_commit_queue_busy)
      mysql_cond_wait(&COND_queue_busy, &LOCK_commit_ordered);
    group_commit_queue_busy= TRUE;

    /*
      Set these so parent can run checkpoint_and_purge() in last thread.
      (When using optimized thread scheduling, we run checkpoint_and_purge()
      in this function, so parent does not need to and we need not set these
      values).
    */
    last_in_queue->check_purge= check_purge;
    last_in_queue->binlog_id= binlog_id;

    /* Note that we return with LOCK_commit_ordered locked! */
    DBUG_VOID_RETURN;
  }

  /*
    Wakeup each participant waiting for our group commit, first calling the
    commit_ordered() methods for any transactions doing 2-phase commit.
  */
  current= queue;
  while (current != NULL)
  {
    group_commit_entry *next;

    DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
    ++num_commits;
    if (current->cache_mngr->using_xa && likely(!current->error) &&
        DBUG_EVALUATE_IF("skip_commit_ordered", 0, 1))
      run_commit_ordered(current->thd, current->all);
    current->thd->wakeup_subsequent_commits(current->error);

    /*
      Careful not to access current->next after waking up the other thread! As
      it may change immediately after wakeup.
    */
    next= current->next;
    if (current != leader)                      // Don't wake up ourself
    {
      /* Same choice of wakeup method as for the non-optimized scheduling. */
      if (current->queued_by_other)
        current->thd->wait_for_commit_ptr->wakeup(current->error);
      else
        current->thd->signal_wakeup_ready();
    }
    current= next;
  }
  DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
  mysql_mutex_unlock(&LOCK_commit_ordered);
  DEBUG_SYNC(leader->thd, "commit_after_group_release_commit_ordered");

  if (check_purge)
    checkpoint_and_purge(binlog_id);

  DBUG_VOID_RETURN;
}
8490 
8491 
8492 int
write_transaction_or_stmt(group_commit_entry * entry,uint64 commit_id)8493 MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
8494                                          uint64 commit_id)
8495 {
8496   binlog_cache_mngr *mngr= entry->cache_mngr;
8497   DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_or_stmt");
8498 
8499   if (write_gtid_event(entry->thd, is_prepared_xa(entry->thd),
8500                        entry->using_trx_cache, commit_id))
8501     DBUG_RETURN(ER_ERROR_ON_WRITE);
8502 
8503   if (entry->using_stmt_cache && !mngr->stmt_cache.empty() &&
8504       write_cache(entry->thd, mngr->get_binlog_cache_log(FALSE)))
8505   {
8506     entry->error_cache= &mngr->stmt_cache.cache_log;
8507     DBUG_RETURN(ER_ERROR_ON_WRITE);
8508   }
8509 
8510   if (entry->using_trx_cache && !mngr->trx_cache.empty())
8511   {
8512     DBUG_EXECUTE_IF("crash_before_writing_xid",
8513                     {
8514                       if ((write_cache(entry->thd,
8515                                        mngr->get_binlog_cache_log(TRUE))))
8516                         DBUG_PRINT("info", ("error writing binlog cache"));
8517                       else
8518                         flush_and_sync(0);
8519 
8520                       DBUG_PRINT("info", ("crashing before writing xid"));
8521                       DBUG_SUICIDE();
8522                     });
8523 
8524     if (write_cache(entry->thd, mngr->get_binlog_cache_log(TRUE)))
8525     {
8526       entry->error_cache= &mngr->trx_cache.cache_log;
8527       DBUG_RETURN(ER_ERROR_ON_WRITE);
8528     }
8529   }
8530 
8531   DBUG_EXECUTE_IF("inject_error_writing_xid",
8532                   {
8533                     entry->error_cache= NULL;
8534                     errno= 28;
8535                     DBUG_RETURN(ER_ERROR_ON_WRITE);
8536                   });
8537 
8538   if (write_event(entry->end_event))
8539   {
8540     entry->error_cache= NULL;
8541     DBUG_RETURN(ER_ERROR_ON_WRITE);
8542   }
8543   status_var_add(entry->thd->status_var.binlog_bytes_written,
8544                  entry->end_event->data_written);
8545 
8546   if (entry->incident_event)
8547   {
8548     if (write_event(entry->incident_event))
8549     {
8550       entry->error_cache= NULL;
8551       DBUG_RETURN(ER_ERROR_ON_WRITE);
8552     }
8553   }
8554 
8555   if (unlikely(mngr->get_binlog_cache_log(FALSE)->error))
8556   {
8557     entry->error_cache= &mngr->stmt_cache.cache_log;
8558     DBUG_RETURN(ER_ERROR_ON_WRITE);
8559   }
8560   if (unlikely(mngr->get_binlog_cache_log(TRUE)->error))  // Error on read
8561   {
8562     entry->error_cache= &mngr->trx_cache.cache_log;
8563     DBUG_RETURN(ER_ERROR_ON_WRITE);
8564   }
8565 
8566   DBUG_RETURN(0);
8567 }
8568 
8569 
8570 /*
8571   Wait for sufficient commits to queue up for group commit, according to the
8572   values of binlog_commit_wait_count and binlog_commit_wait_usec.
8573 
8574   Note that this function may release and re-acquire LOCK_log and
8575   LOCK_prepare_ordered if it needs to wait.
8576 */
8577 
8578 void
wait_for_sufficient_commits()8579 MYSQL_BIN_LOG::wait_for_sufficient_commits()
8580 {
8581   size_t count;
8582   group_commit_entry *e;
8583   group_commit_entry *last_head;
8584   struct timespec wait_until;
8585 
8586   mysql_mutex_assert_owner(&LOCK_log);
8587   mysql_mutex_assert_owner(&LOCK_prepare_ordered);
8588 
8589   for (e= last_head= group_commit_queue, count= 0; e; e= e->next)
8590   {
8591     if (++count >= opt_binlog_commit_wait_count)
8592     {
8593       group_commit_trigger_count++;
8594       return;
8595     }
8596     if (unlikely(e->thd->has_waiter))
8597     {
8598       group_commit_trigger_lock_wait++;
8599       return;
8600     }
8601   }
8602 
8603   mysql_mutex_unlock(&LOCK_log);
8604   set_timespec_nsec(wait_until, (ulonglong)1000*opt_binlog_commit_wait_usec);
8605 
8606   for (;;)
8607   {
8608     int err;
8609     group_commit_entry *head;
8610 
8611     err= mysql_cond_timedwait(&COND_prepare_ordered, &LOCK_prepare_ordered,
8612                               &wait_until);
8613     if (err == ETIMEDOUT)
8614     {
8615       group_commit_trigger_timeout++;
8616       break;
8617     }
8618     if (unlikely(last_head->thd->has_waiter))
8619     {
8620       group_commit_trigger_lock_wait++;
8621       break;
8622     }
8623     head= group_commit_queue;
8624     for (e= head; e && e != last_head; e= e->next)
8625     {
8626       ++count;
8627       if (unlikely(e->thd->has_waiter))
8628       {
8629         group_commit_trigger_lock_wait++;
8630         goto after_loop;
8631       }
8632     }
8633     if (count >= opt_binlog_commit_wait_count)
8634     {
8635       group_commit_trigger_count++;
8636       break;
8637     }
8638     last_head= head;
8639   }
8640 after_loop:
8641 
8642   /*
8643     We must not wait for LOCK_log while holding LOCK_prepare_ordered.
8644     LOCK_log can be held for long periods (eg. we do I/O under it), while
8645     LOCK_prepare_ordered must only be held for short periods.
8646 
8647     In addition, waiting for LOCK_log while holding LOCK_prepare_ordered would
8648     violate locking order of LOCK_log-before-LOCK_prepare_ordered. This could
8649     cause SAFEMUTEX warnings (even if it cannot actually deadlock with current
8650     code, as there can be at most one group commit leader thread at a time).
8651 
8652     So release and re-acquire LOCK_prepare_ordered if we need to wait for the
8653     LOCK_log.
8654   */
8655   if (mysql_mutex_trylock(&LOCK_log))
8656   {
8657     mysql_mutex_unlock(&LOCK_prepare_ordered);
8658     mysql_mutex_lock(&LOCK_log);
8659     mysql_mutex_lock(&LOCK_prepare_ordered);
8660   }
8661 }
8662 
8663 
8664 void
binlog_trigger_immediate_group_commit()8665 MYSQL_BIN_LOG::binlog_trigger_immediate_group_commit()
8666 {
8667   group_commit_entry *head;
8668   mysql_mutex_assert_owner(&LOCK_prepare_ordered);
8669   head= group_commit_queue;
8670   if (head)
8671   {
8672     head->thd->has_waiter= true;
8673     mysql_cond_signal(&COND_prepare_ordered);
8674   }
8675 }
8676 
8677 
8678 /*
8679   This function is called when a transaction T1 goes to wait for another
8680   transaction T2. It is used to cut short any binlog group commit delay from
8681   --binlog-commit-wait-count in the case where another transaction is stalled
8682   on the wait due to conflicting row locks.
8683 
8684   If T2 is already ready to group commit, any waiting group commit will be
8685   signalled to proceed immediately. Otherwise, a flag will be set in T2, and
8686   when T2 later becomes ready, immediate group commit will be triggered.
8687 */
8688 void
binlog_report_wait_for(THD * thd1,THD * thd2)8689 binlog_report_wait_for(THD *thd1, THD *thd2)
8690 {
8691   if (opt_binlog_commit_wait_count == 0)
8692     return;
8693   mysql_mutex_lock(&LOCK_prepare_ordered);
8694   thd2->has_waiter= true;
8695   if (thd2->waiting_on_group_commit)
8696     mysql_bin_log.binlog_trigger_immediate_group_commit();
8697   mysql_mutex_unlock(&LOCK_prepare_ordered);
8698 }
8699 
8700 
8701 /**
8702   Wait until we get a signal that the relay log has been updated.
8703 
8704   @param thd		Thread variable
8705 
8706   @note
8707     One must have a lock on LOCK_log before calling this function.
8708     This lock will be released before return! That's required by
8709     THD::enter_cond() (see NOTES in sql_class.h).
8710 */
8711 
wait_for_update_relay_log(THD * thd)8712 void MYSQL_BIN_LOG::wait_for_update_relay_log(THD* thd)
8713 {
8714   PSI_stage_info old_stage;
8715   DBUG_ENTER("wait_for_update_relay_log");
8716 
8717   mysql_mutex_assert_owner(&LOCK_log);
8718   thd->ENTER_COND(&COND_relay_log_updated, &LOCK_log,
8719                   &stage_slave_has_read_all_relay_log,
8720                   &old_stage);
8721   mysql_cond_wait(&COND_relay_log_updated, &LOCK_log);
8722   thd->EXIT_COND(&old_stage);
8723   DBUG_VOID_RETURN;
8724 }
8725 
8726 /**
8727   Wait until we get a signal that the binary log has been updated.
8728   Applies to master only.
8729 
8730   NOTES
8731   @param[in] thd        a THD struct
8732   @param[in] timeout    a pointer to a timespec;
8733                         NULL means to wait w/o timeout.
8734   @retval    0          if got signalled on update
8735   @retval    non-0      if wait timeout elapsed
8736   @note
8737     LOCK_log must be taken before calling this function.
8738     LOCK_log is being released while the thread is waiting.
8739     LOCK_log is released by the caller.
8740 */
8741 
wait_for_update_binlog_end_pos(THD * thd,struct timespec * timeout)8742 int MYSQL_BIN_LOG::wait_for_update_binlog_end_pos(THD* thd,
8743                                                   struct timespec *timeout)
8744 {
8745   int ret= 0;
8746   DBUG_ENTER("wait_for_update_binlog_end_pos");
8747 
8748   thd_wait_begin(thd, THD_WAIT_BINLOG);
8749   mysql_mutex_assert_owner(get_binlog_end_pos_lock());
8750   if (!timeout)
8751     mysql_cond_wait(&COND_bin_log_updated, get_binlog_end_pos_lock());
8752   else
8753     ret= mysql_cond_timedwait(&COND_bin_log_updated, get_binlog_end_pos_lock(),
8754                               timeout);
8755   thd_wait_end(thd);
8756   DBUG_RETURN(ret);
8757 }
8758 
8759 
/**
  Close the log file.

  When the log is open this optionally writes a Stop event, saves the binlog
  state to its state file (binlog only, not relay log), clears the "in use"
  flag in the file header and then closes the file through
  MYSQL_LOG::close(). Independently of that, the index file is closed when
  requested. Finally log_state is updated and the stored log name is freed.

  @param exiting     Bitmask for one or more of the following bits:
          - LOG_CLOSE_INDEX : if we should close the index file
          - LOG_CLOSE_TO_BE_OPENED : if we intend to call open
                                     at once after close.
          - LOG_CLOSE_STOP_EVENT : write a 'stop' event to the log
          - LOG_CLOSE_DELAYED_CLOSE : do not yet close the file and clear the
                                      LOG_EVENT_BINLOG_IN_USE_F flag

  @note
    The caller must hold LOCK_log (this is asserted).
  @note
    One can do an open on the object at once after doing a close.
    The internal structures are not freed until cleanup() is called
*/

void MYSQL_BIN_LOG::close(uint exiting)
{					// One can't set log_type here!
  /* Set when write_state_to_file() fails; keeps the file marked as in use */
  bool failed_to_save_state= false;
  DBUG_ENTER("MYSQL_BIN_LOG::close");
  DBUG_PRINT("enter",("exiting: %d", (int) exiting));

  mysql_mutex_assert_owner(&LOCK_log);

  if (log_state == LOG_OPENED)
  {
    DBUG_ASSERT(log_type == LOG_BIN);
#ifdef HAVE_REPLICATION
    /* The Stop event is only written in builds with replication support */
    if (exiting & LOG_CLOSE_STOP_EVENT)
    {
      Stop_log_event s;
      // the checksumming rule for relay-log case is similar to Rotate
        s.checksum_alg= is_relay_log ? relay_log_checksum_alg
                                     : (enum_binlog_checksum_alg)binlog_checksum_options;
      DBUG_ASSERT(!is_relay_log ||
                  relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
      /* Write the event, account for its size and push it out of the cache */
      write_event(&s);
      bytes_written+= s.data_written;
      flush_io_cache(&log_file);
      update_binlog_end_pos();

      /*
        When we shut down server, write out the binlog state to a separate
        file so we do not have to scan an entire binlog file to recover it
        at next server start.

        Note that this must be written and synced to disk before marking the
        last binlog file as "not crashed".
      */
      if (!is_relay_log && write_state_to_file())
      {
        sql_print_error("Failed to save binlog GTID state during shutdown. "
                        "Binlog will be marked as crashed, so that crash "
                        "recovery can recover the state at next server "
                        "startup.");
        /*
          Leave binlog file marked as crashed, so we can recover state by
          scanning it now that we failed to write out the state properly.
        */
        failed_to_save_state= true;
      }
    }
#endif /* HAVE_REPLICATION */

    /* don't pwrite in a file opened with O_APPEND - it doesn't work */
    if (log_file.type == WRITE_CACHE && !(exiting & LOG_CLOSE_DELAYED_CLOSE))
    {
      /* Remember where we are; the flag write below may move the position */
      my_off_t org_position= mysql_file_tell(log_file.file, MYF(0));
      if (!failed_to_save_state)
        clear_inuse_flag_when_closing(log_file.file);
      /*
        Restore position so that anything we have in the IO_cache is written
        to the correct position.
        We need the seek here, as mysql_file_pwrite() is not guaranteed to keep the
        original position on system that doesn't support pwrite().
      */
      mysql_file_seek(log_file.file, org_position, MY_SEEK_SET, MYF(0));
    }

    /* this will cleanup IO_CACHE, sync and close the file */
    MYSQL_LOG::close(exiting);
  }

  /*
    The following test is needed even if is_open() is not set, as we may have
    called a not complete close earlier and the index file is still open.
  */

  if ((exiting & LOG_CLOSE_INDEX) && my_b_inited(&index_file))
  {
    end_io_cache(&index_file);
    /* Report a failed close only if no write error was recorded before */
    if (unlikely(mysql_file_close(index_file.file, MYF(0)) < 0) &&
        ! write_error)
    {
      write_error= 1;
      sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), index_file_name, errno);
    }
  }
  /* Record whether the caller announced an immediate re-open */
  log_state= (exiting & LOG_CLOSE_TO_BE_OPENED) ? LOG_TO_BE_OPENED : LOG_CLOSED;
  my_free(name);
  name= NULL;
  DBUG_VOID_RETURN;
}
8863 
8864 
8865 /*
8866   Clear the LOG_EVENT_BINLOG_IN_USE_F; this marks the binlog file as cleanly
8867   closed and not needing crash recovery.
8868 */
clear_inuse_flag_when_closing(File file)8869 void MYSQL_BIN_LOG::clear_inuse_flag_when_closing(File file)
8870 {
8871   my_off_t offset= BIN_LOG_HEADER_SIZE + FLAGS_OFFSET;
8872   uchar flags= 0;            // clearing LOG_EVENT_BINLOG_IN_USE_F
8873   mysql_file_pwrite(file, &flags, 1, offset, MYF(0));
8874 }
8875 
8876 
set_max_size(ulong max_size_arg)8877 void MYSQL_BIN_LOG::set_max_size(ulong max_size_arg)
8878 {
8879   /*
8880     We need to take locks, otherwise this may happen:
8881     new_file() is called, calls open(old_max_size), then before open() starts,
8882     set_max_size() sets max_size to max_size_arg, then open() starts and
8883     uses the old_max_size argument, so max_size_arg has been overwritten and
8884     it's like if the SET command was never run.
8885   */
8886   DBUG_ENTER("MYSQL_BIN_LOG::set_max_size");
8887   mysql_mutex_lock(&LOCK_log);
8888   if (is_open())
8889     max_size= max_size_arg;
8890   mysql_mutex_unlock(&LOCK_log);
8891   DBUG_VOID_RETURN;
8892 }
8893 
8894 
8895 /**
8896   Check if a string is a valid number.
8897 
8898   @param str			String to test
8899   @param res			Store value here
8900   @param allow_wildcards	Set to 1 if we should ignore '%' and '_'
8901 
8902   @note
8903     For the moment the allow_wildcards argument is not used
8904     Should be move to some other file.
8905 
8906   @retval
8907     1	String is a number
8908   @retval
8909     0	String is not a number
8910 */
8911 
test_if_number(const char * str,ulong * res,bool allow_wildcards)8912 static bool test_if_number(const char *str, ulong *res, bool allow_wildcards)
8913 {
8914   int flag;
8915   const char *start;
8916   DBUG_ENTER("test_if_number");
8917 
8918   flag=0; start=str;
8919   while (*str++ == ' ') ;
8920   if (*--str == '-' || *str == '+')
8921     str++;
8922   while (my_isdigit(files_charset_info,*str) ||
8923 	 (allow_wildcards && (*str == wild_many || *str == wild_one)))
8924   {
8925     flag=1;
8926     str++;
8927   }
8928   if (*str == '.')
8929   {
8930     for (str++ ;
8931 	 my_isdigit(files_charset_info,*str) ||
8932 	   (allow_wildcards && (*str == wild_many || *str == wild_one)) ;
8933 	 str++, flag=1) ;
8934   }
8935   if (*str != 0 || flag == 0)
8936     DBUG_RETURN(0);
8937   if (res)
8938     *res=atol(start);
8939   DBUG_RETURN(1);			/* Number ok */
8940 } /* test_if_number */
8941 
8942 
/*
  Print 'message' together with a description of the last system error to
  the error log.

  On Windows the text for GetLastError() is obtained with FormatMessage();
  if that fails only 'message' is printed. Elsewhere strerror(errno) is
  used when HAVE_STRERROR is defined, and perror() otherwise.
*/
void sql_perror(const char *message)
{
#if defined(_WIN32)
  char *sys_msg;
  DWORD last_error= GetLastError();
  DWORD msg_len=
    FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |  FORMAT_MESSAGE_FROM_SYSTEM |
                  FORMAT_MESSAGE_IGNORE_INSERTS,  NULL, last_error,
                  MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                  (LPSTR) &sys_msg, 0, NULL);
  if (msg_len == 0)
  {
    /* No system text available */
    sql_print_error("%s", message);
  }
  else
  {
    sql_print_error("%s: %s",message, sys_msg);
    /* The buffer was allocated by FormatMessage() */
    LocalFree((HLOCAL) sys_msg);
  }
#elif defined(HAVE_STRERROR)
  sql_print_error("%s: %s",message, strerror(errno));
#else
  perror(message);
#endif
}
8965 
8966 
8967 /*
8968   Change the file associated with two output streams. Used to
8969   redirect stdout and stderr to a file. The streams are reopened
8970   only for appending (writing at end of file).
8971 */
reopen_fstreams(const char * filename,FILE * outstream,FILE * errstream)8972 bool reopen_fstreams(const char *filename, FILE *outstream, FILE *errstream)
8973 {
8974   if ((outstream && !my_freopen(filename, "a", outstream)) ||
8975       (errstream && !my_freopen(filename, "a", errstream)))
8976   {
8977     my_error(ER_CANT_CREATE_FILE, MYF(0), filename, errno);
8978     return TRUE;
8979   }
8980 
8981   /* The error stream must be unbuffered. */
8982   if (errstream)
8983     setbuf(errstream, NULL);
8984 
8985   return FALSE;
8986 }
8987 
8988 
8989 /*
8990   Unfortunately, there seems to be no good way
8991   to restore the original streams upon failure.
8992 */
redirect_std_streams(const char * file)8993 static bool redirect_std_streams(const char *file)
8994 {
8995   if (reopen_fstreams(file, stdout, stderr))
8996     return TRUE;
8997 
8998   setbuf(stderr, NULL);
8999   return FALSE;
9000 }
9001 
9002 
flush_error_log()9003 bool flush_error_log()
9004 {
9005   bool result= 0;
9006   if (opt_error_log)
9007   {
9008     mysql_mutex_lock(&LOCK_error_log);
9009     if (redirect_std_streams(log_error_file))
9010       result= 1;
9011     mysql_mutex_unlock(&LOCK_error_log);
9012   }
9013   return result;
9014 }
9015 
9016 #ifdef _WIN32
9017 struct eventlog_source
9018 {
9019   HANDLE handle;
eventlog_sourceeventlog_source9020   eventlog_source()
9021   {
9022     setup_windows_event_source();
9023     handle = RegisterEventSource(NULL, "MariaDB");
9024   }
9025 
~eventlog_sourceeventlog_source9026   ~eventlog_source()
9027   {
9028     if (handle)
9029       DeregisterEventSource(handle);
9030   }
9031 };
9032 
9033 static eventlog_source eventlog;
9034 
print_buffer_to_nt_eventlog(enum loglevel level,char * buff,size_t length,size_t buffLen)9035 static void print_buffer_to_nt_eventlog(enum loglevel level, char *buff,
9036                                         size_t length, size_t buffLen)
9037 {
9038   HANDLE event= eventlog.handle;
9039   char   *buffptr= buff;
9040   DBUG_ENTER("print_buffer_to_nt_eventlog");
9041 
9042   /* Add ending CR/LF's to string, overwrite last chars if necessary */
9043   strmov(buffptr+MY_MIN(length, buffLen-5), "\r\n\r\n");
9044 
9045   if (event)
9046   {
9047     switch (level) {
9048       case ERROR_LEVEL:
9049         ReportEvent(event, EVENTLOG_ERROR_TYPE, 0, MSG_DEFAULT, NULL, 1, 0,
9050                     (LPCSTR*)&buffptr, NULL);
9051         break;
9052       case WARNING_LEVEL:
9053         ReportEvent(event, EVENTLOG_WARNING_TYPE, 0, MSG_DEFAULT, NULL, 1, 0,
9054                     (LPCSTR*) &buffptr, NULL);
9055         break;
9056       case INFORMATION_LEVEL:
9057         ReportEvent(event, EVENTLOG_INFORMATION_TYPE, 0, MSG_DEFAULT, NULL, 1,
9058                     0, (LPCSTR*) &buffptr, NULL);
9059         break;
9060     }
9061   }
9062 
9063   DBUG_VOID_RETURN;
9064 }
9065 #endif /* _WIN32 */
9066 
9067 
9068 #ifndef EMBEDDED_LIBRARY
print_buffer_to_file(enum loglevel level,const char * buffer,size_t length)9069 static void print_buffer_to_file(enum loglevel level, const char *buffer,
9070                                  size_t length)
9071 {
9072   time_t skr;
9073   struct tm tm_tmp;
9074   struct tm *start;
9075   THD *thd= 0;
9076   size_t tag_length= 0;
9077   char tag[NAME_LEN];
9078   DBUG_ENTER("print_buffer_to_file");
9079   DBUG_PRINT("enter",("buffer: %s", buffer));
9080 
9081   if (mysqld_server_initialized && (thd= current_thd))
9082   {
9083     if (thd->connection_name.length)
9084     {
9085       /*
9086         Add tag for slaves so that the user can see from which connection
9087         the error originates.
9088       */
9089       tag_length= my_snprintf(tag, sizeof(tag),
9090                               ER_THD(thd, ER_MASTER_LOG_PREFIX),
9091                               (int) thd->connection_name.length,
9092                               thd->connection_name.str);
9093     }
9094   }
9095 
9096   mysql_mutex_lock(&LOCK_error_log);
9097 
9098   skr= my_time(0);
9099   localtime_r(&skr, &tm_tmp);
9100   start=&tm_tmp;
9101 
9102   fprintf(stderr, "%d-%02d-%02d %2d:%02d:%02d %lu [%s] %.*s%.*s\n",
9103           start->tm_year + 1900,
9104           start->tm_mon+1,
9105           start->tm_mday,
9106           start->tm_hour,
9107           start->tm_min,
9108           start->tm_sec,
9109           (unsigned long) (thd ? thd->thread_id : 0),
9110           (level == ERROR_LEVEL ? "ERROR" : level == WARNING_LEVEL ?
9111            "Warning" : "Note"),
9112           (int) tag_length, tag,
9113           (int) length, buffer);
9114 
9115   fflush(stderr);
9116 
9117   mysql_mutex_unlock(&LOCK_error_log);
9118   DBUG_VOID_RETURN;
9119 }
9120 
9121 /**
9122   Prints a printf style message to the error log and, under NT, to the
9123   Windows event log.
9124 
9125   This function prints the message into a buffer and then sends that buffer
9126   to other functions to write that message to other logging sources.
9127 
9128   @param level          The level of the msg significance
9129   @param format         Printf style format of message
9130   @param args           va_list list of arguments for the message
9131 
9132   @returns
9133     The function always returns 0. The return value is present in the
9134     signature to be compatible with other logging routines, which could
9135     return an error (e.g. logging to the log tables)
9136 */
vprint_msg_to_log(enum loglevel level,const char * format,va_list args)9137 int vprint_msg_to_log(enum loglevel level, const char *format, va_list args)
9138 {
9139   char   buff[1024];
9140   size_t length;
9141   DBUG_ENTER("vprint_msg_to_log");
9142 
9143   length= my_vsnprintf(buff, sizeof(buff), format, args);
9144   print_buffer_to_file(level, buff, length);
9145 
9146 #ifdef _WIN32
9147   print_buffer_to_nt_eventlog(level, buff, length, sizeof(buff));
9148 #endif
9149 
9150   DBUG_RETURN(0);
9151 }
9152 #endif /* EMBEDDED_LIBRARY */
9153 
9154 
sql_print_error(const char * format,...)9155 void sql_print_error(const char *format, ...)
9156 {
9157   va_list args;
9158   DBUG_ENTER("sql_print_error");
9159 
9160   va_start(args, format);
9161   error_log_print(ERROR_LEVEL, format, args);
9162   va_end(args);
9163 
9164   DBUG_VOID_RETURN;
9165 }
9166 
9167 
sql_print_warning(const char * format,...)9168 void sql_print_warning(const char *format, ...)
9169 {
9170   va_list args;
9171   DBUG_ENTER("sql_print_warning");
9172 
9173   va_start(args, format);
9174   error_log_print(WARNING_LEVEL, format, args);
9175   va_end(args);
9176 
9177   DBUG_VOID_RETURN;
9178 }
9179 
9180 
sql_print_information(const char * format,...)9181 void sql_print_information(const char *format, ...)
9182 {
9183   va_list args;
9184   DBUG_ENTER("sql_print_information");
9185 
9186   va_start(args, format);
9187   sql_print_information_v(format, args);
9188   va_end(args);
9189 
9190   DBUG_VOID_RETURN;
9191 }
9192 
sql_print_information_v(const char * format,va_list ap)9193 void sql_print_information_v(const char *format, va_list ap)
9194 {
9195   if (disable_log_notes)
9196     return;                 // Skip notes during start/shutdown
9197 
9198   error_log_print(INFORMATION_LEVEL, format, ap);
9199 }
9200 
9201 void
run_prepare_ordered(THD * thd,bool all)9202 TC_LOG::run_prepare_ordered(THD *thd, bool all)
9203 {
9204   Ha_trx_info *ha_info=
9205     all ? thd->transaction->all.ha_list : thd->transaction->stmt.ha_list;
9206 
9207   mysql_mutex_assert_owner(&LOCK_prepare_ordered);
9208   for (; ha_info; ha_info= ha_info->next())
9209   {
9210     handlerton *ht= ha_info->ht();
9211     if (!ht->prepare_ordered)
9212       continue;
9213     ht->prepare_ordered(ht, thd, all);
9214   }
9215 }
9216 
9217 
9218 void
run_commit_ordered(THD * thd,bool all)9219 TC_LOG::run_commit_ordered(THD *thd, bool all)
9220 {
9221   Ha_trx_info *ha_info=
9222     all ? thd->transaction->all.ha_list : thd->transaction->stmt.ha_list;
9223 
9224   mysql_mutex_assert_owner(&LOCK_commit_ordered);
9225   for (; ha_info; ha_info= ha_info->next())
9226   {
9227     handlerton *ht= ha_info->ht();
9228     if (!ht->commit_ordered)
9229       continue;
9230     ht->commit_ordered(ht, thd, all);
9231     DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
9232   }
9233 }
9234 
9235 
/*
  Log the XID of a transaction and run the ordered engine hooks around it.

  Steps, in order:
    - if need_prepare_ordered: run_prepare_ordered() under
      LOCK_prepare_ordered and, if need_commit_ordered as well, put this
      thread into commit_ordered_queue
    - wait_for_prior_commit()
    - log_one_transaction(xid), if xid is non-zero
    - if need_commit_ordered: run_commit_ordered() under
      LOCK_commit_ordered, provided logging returned a non-zero cookie.
      When need_prepare_ordered is set too, the threads of one queue do this
      one after the other in the order they ran run_prepare_ordered().

  Returns the cookie from log_one_transaction(), or 0 when xid is 0 or when
  wait_for_prior_commit() returned non-zero.
*/
int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
                               bool need_prepare_ordered,
                               bool need_commit_ordered)
{
  int cookie;
  /* Queue node for this thread; lives on this function's stack */
  struct commit_entry entry;
  /*
    Only assigned, and only read, when both need_prepare_ordered and
    need_commit_ordered are set.
  */
  bool UNINIT_VAR(is_group_commit_leader);

  if (need_prepare_ordered)
  {
    mysql_mutex_lock(&LOCK_prepare_ordered);
    run_prepare_ordered(thd, all);
    if (need_commit_ordered)
    {
      /*
        Must put us in queue so we can run_commit_ordered() in same sequence
        as we did run_prepare_ordered().
      */
      thd->clear_wakeup_ready();
      entry.thd= thd;
      /* Push at the head; the queue is therefore in reverse arrival order */
      commit_entry *previous_queue= commit_ordered_queue;
      entry.next= previous_queue;
      commit_ordered_queue= &entry;
      /* The thread that found the queue empty becomes the leader */
      is_group_commit_leader= (previous_queue == NULL);
    }
    mysql_mutex_unlock(&LOCK_prepare_ordered);
  }

  /*
    NOTE(review): when 'entry' was queued above, returning here leaves
    commit_ordered_queue referring to this stack frame - confirm that this
    path is safe.
  */
  if (thd->wait_for_prior_commit())
    return 0;

  cookie= 0;
  if (xid)
    cookie= log_one_transaction(xid);

  if (need_commit_ordered)
  {
    if (need_prepare_ordered)
    {
      /*
        We did the run_prepare_ordered() serialised, then ran the log_xid() in
        parallel. Now we have to do run_commit_ordered() serialised in the
        same sequence as run_prepare_ordered().

        We do this starting from the head of the queue, each thread doing
        run_commit_ordered() and signalling the next in queue.
      */
      if (is_group_commit_leader)
      {
        /* The first in queue starts the ball rolling. */
        mysql_mutex_lock(&LOCK_prepare_ordered);
        /* Wait until the previous queue has been fully processed */
        while (commit_ordered_queue_busy)
          mysql_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered);
        /* Take over the whole queue; later arrivals start a new one */
        commit_entry *queue= commit_ordered_queue;
        commit_ordered_queue= NULL;
        /*
          Mark the queue busy while we bounce it from one thread to the
          next.
        */
        commit_ordered_queue_busy= true;
        mysql_mutex_unlock(&LOCK_prepare_ordered);

        /* Reverse the queue list so we get correct order. */
        commit_entry *prev= NULL;
        while (queue)
        {
          commit_entry *next= queue->next;
          queue->next= prev;
          prev= queue;
          queue= next;
        }
        /* After reversal the leader's own entry is the first element */
        DBUG_ASSERT(prev == &entry);
        DBUG_ASSERT(prev->thd == thd);
      }
      else
      {
        /* Not first in queue; just wait until previous thread wakes us up. */
        thd->wait_for_wakeup_ready();
      }
    }

    /* Only run commit_ordered() if log_xid was successful. */
    if (cookie)
    {
      mysql_mutex_lock(&LOCK_commit_ordered);
      run_commit_ordered(thd, all);
      mysql_mutex_unlock(&LOCK_commit_ordered);
    }

    if (need_prepare_ordered)
    {
      /* Hand over to the next thread, or release the queue if we are last */
      commit_entry *next= entry.next;
      if (next)
      {
        next->thd->signal_wakeup_ready();
      }
      else
      {
        mysql_mutex_lock(&LOCK_prepare_ordered);
        commit_ordered_queue_busy= false;
        mysql_cond_signal(&COND_queue_busy);
        mysql_mutex_unlock(&LOCK_prepare_ordered);
      }
    }
  }

  return cookie;
}
9344 
9345 
9346 /********* transaction coordinator log for 2pc - mmap() based solution *******/
9347 
9348 /*
9349   the log consists of a file, mapped to memory.
9350   file is divided into pages of tc_log_page_size size.
9351   (usable size of the first page is smaller because of the log header)
9352   there is a PAGE control structure for each page
9353   each page (or rather its PAGE control structure) can be in one of
9354   the three states - active, syncing, pool.
9355   there could be only one page in the active or syncing state,
9356   but many in pool - pool is a fifo queue.
9357   the usual lifecycle of a page is pool->active->syncing->pool.
9358   the "active" page is a page where new xid's are logged.
9359   the page stays active as long as the syncing slot is taken.
9360   the "syncing" page is being synced to disk. no new xid can be added to it.
9361   when the syncing is done the page is moved to a pool and an active page
9362   becomes "syncing".
9363 
9364   the result of such an architecture is a natural "commit grouping" -
9365   If commits are coming faster than the system can sync, they do not
9366   stall. Instead, all commits that came since the last sync are
9367   logged to the same "active" page, and they all are synced with the next -
  one - sync. Thus, though individual commits are delayed, throughput
9369   is not decreasing.
9370 
9371   when an xid is added to an active page, the thread of this xid waits
9372   for a page's condition until the page is synced. when syncing slot
9373   becomes vacant one of these waiters is awaken to take care of syncing.
9374   it syncs the page and signals all waiters that the page is synced.
9375   PAGE::waiters is used to count these waiters, and a page may never
9376   become active again until waiters==0 (that is all waiters from the
9377   previous sync have noticed that the sync was completed)
9378 
9379   note, that the page becomes "dirty" and has to be synced only when a
9380   new xid is added into it. Removing a xid from a page does not make it
9381   dirty - we don't sync xid removals to disk.
9382 */
9383 
/* Incremented each time TC_LOG_MMAP::overflow() waits for a pool page */
ulong tc_log_page_waits= 0;

#ifdef HAVE_MMAP

/*
  Size of the log header: the magic signature plus one more byte, in which
  TC_LOG_MMAP::open() stores total_ha_2pc.
*/
#define TC_LOG_HEADER_SIZE (sizeof(tc_log_magic)+1)

/* Signature that TC_LOG_MMAP::open() copies to the start of the log */
static const uchar tc_log_magic[]={(uchar) 254, 0x23, 0x05, 0x74};

/* File length given to a log file newly created by TC_LOG_MMAP::open() */
ulong opt_tc_log_size;
/*
  tc_log_page_size is set from my_getpagesize() in TC_LOG_MMAP::open().
  tc_log_cur_pages_used is incremented in get_active_from_pool() whenever an
  empty page is made active; tc_log_max_pages_used tracks its maximum.
*/
ulong tc_log_max_pages_used=0, tc_log_page_size=0, tc_log_cur_pages_used=0;
9394 
/*
  Open, or create, the memory mapped transaction coordinator log.

  The file name is built from opt_name relative to mysql_data_home. If the
  file does not exist it is created with a length of opt_tc_log_size.
  If it does exist this is treated as a crash: its length must be a multiple
  of the page size and recover() is run on it.

  The file is then mapped into memory, one PAGE control structure is set up
  per page, the header is written to the first page and synced, and the
  mutexes and condition variables of the object are initialised.

  'inited' is advanced after each completed step. Presumably close() uses it
  to undo only what was set up so far - confirm against close().

  Returns 0 on success, 1 on failure. On the 'err' path close() is called.
*/
int TC_LOG_MMAP::open(const char *opt_name)
{
  uint i;
  bool crashed=FALSE;
  PAGE *pg;

  DBUG_ASSERT(total_ha_2pc > 1);
  DBUG_ASSERT(opt_name);
  DBUG_ASSERT(opt_name[0]);

  tc_log_page_size= my_getpagesize();

  fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
  if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR | O_CLOEXEC, MYF(0))) < 0)
  {
    /* Only a missing file is acceptable; it is created below */
    if (my_errno != ENOENT)
      goto err;
    /* Note: this returns without going through the 'err' path */
    if (using_heuristic_recover())
      return 1;
    if ((fd= mysql_file_create(key_file_tclog, logname, CREATE_MODE,
                               O_RDWR | O_CLOEXEC, MYF(MY_WME))) < 0)
      goto err;
    inited=1;
    file_length= opt_tc_log_size;
    if (mysql_file_chsize(fd, file_length, 0, MYF(MY_WME)))
      goto err;
  }
  else
  {
    /* The file already exists: handle it as a leftover from a crash */
    inited= 1;
    crashed= TRUE;
    sql_print_information("Recovering after a crash using %s", opt_name);
    if (tc_heuristic_recover)
    {
      sql_print_error("Cannot perform automatic crash recovery when "
                      "--tc-heuristic-recover is used");
      goto err;
    }
    /* The existing file must consist of whole pages */
    file_length= mysql_file_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME+MY_FAE));
    if (file_length == MY_FILEPOS_ERROR || file_length % tc_log_page_size)
      goto err;
  }

  data= (uchar *)my_mmap(0, (size_t)file_length, PROT_READ|PROT_WRITE,
                        MAP_NOSYNC|MAP_SHARED, fd, 0);
  if (data == MAP_FAILED)
  {
    my_errno=errno;
    goto err;
  }
  inited=2;

  /*
    NOTE(review): the cast applies to file_length before the division, so a
    length above the range of uint would be truncated - confirm that this is
    acceptable.
  */
  npages=(uint)file_length/tc_log_page_size;
  if (npages < 3)             // to guarantee non-empty pool
    goto err;
  if (!(pages=(PAGE *)my_malloc(key_memory_TC_LOG_MMAP_pages,
                                npages*sizeof(PAGE), MYF(MY_WME|MY_ZEROFILL))))
    goto err;
  inited=3;
  /* Set up one PAGE per page of the file, chained through 'next' */
  for (pg=pages, i=0; i < npages; i++, pg++)
  {
    pg->next=pg+1;
    pg->waiters=0;
    pg->state=PS_POOL;
    mysql_mutex_init(key_PAGE_lock, &pg->lock, MY_MUTEX_INIT_FAST);
    mysql_cond_init(key_PAGE_cond, &pg->cond, 0);
    pg->ptr= pg->start=(my_xid *)(data + i*tc_log_page_size);
    pg->size=pg->free=tc_log_page_size/sizeof(my_xid);
    pg->end=pg->start + pg->size;
  }
  /*
    The first page also holds the log header, so it has fewer slots; its
    start is moved forward so that the slots end at the end of the page.
  */
  pages[0].size=pages[0].free=
                (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid);
  pages[0].start=pages[0].end-pages[0].size;
  /* Terminate the chain */
  pages[npages-1].next=0;
  inited=4;

  if (crashed && recover())
      goto err;

  /* Write the header (magic + total_ha_2pc) and sync the first page */
  memcpy(data, tc_log_magic, sizeof(tc_log_magic));
  data[sizeof(tc_log_magic)]= (uchar)total_ha_2pc;
  my_msync(fd, data, tc_log_page_size, MS_SYNC);
  inited=5;

  mysql_mutex_init(key_LOCK_sync, &LOCK_sync, MY_MUTEX_INIT_FAST);
  mysql_mutex_init(key_LOCK_active, &LOCK_active, MY_MUTEX_INIT_FAST);
  mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
  mysql_mutex_init(key_LOCK_pending_checkpoint, &LOCK_pending_checkpoint,
                   MY_MUTEX_INIT_FAST);
  mysql_cond_init(key_COND_active, &COND_active, 0);
  mysql_cond_init(key_COND_pool, &COND_pool, 0);
  mysql_cond_init(key_TC_LOG_MMAP_COND_queue_busy, &COND_queue_busy, 0);

  inited=6;

  /* The first page starts as the active one; all the others form the pool */
  syncing= 0;
  active=pages;
  DBUG_ASSERT(npages >= 2);
  pool=pages+1;
  pool_last_ptr= &((pages+npages-1)->next);
  commit_ordered_queue= NULL;
  commit_ordered_queue_busy= false;

  return 0;

err:
  close();
  return 1;
}
9504 
/**
  there is no active page, let's get one from the pool.

  Two strategies here:
    -# take the first from the pool
    -# if there're waiters - take the one with the most free space.

  If neither finds a usable page, overflow() is called to wait and the
  search is repeated.

  The caller must hold LOCK_active (this is asserted). LOCK_pool is taken
  and released inside. On return 'active' points to the chosen page, which
  has been unlinked from the pool, and active->lock is held.

  @todo
    page merging. try to allocate adjacent page first,
    so that they can be flushed both in one sync
*/

void TC_LOG_MMAP::get_active_from_pool()
{
  PAGE **p, **best_p=0;
  int best_free;

  mysql_mutex_lock(&LOCK_pool);

  do
  {
    /*
      The head of the pool is dereferenced here without a check; open()
      requires at least 3 pages "to guarantee non-empty pool".
    */
    best_p= p= &pool;
    if ((*p)->waiters == 0 && (*p)->free > 0) // can the first page be used ?
      break;                                  // yes - take it.

    best_free=0;            // no - trying second strategy
    for (p=&(*p)->next; *p; p=&(*p)->next)
    {
      if ((*p)->waiters == 0 && (*p)->free > best_free)
      {
        best_free=(*p)->free;
        best_p=p;
      }
    }
  }
  /* overflow() waits on COND_pool and always returns 1, so we retry */
  while ((*best_p == 0 || best_free == 0) && overflow());

  mysql_mutex_assert_owner(&LOCK_active);
  active=*best_p;

  /* Unlink the page from the pool. */
  if (!(*best_p)->next)
    pool_last_ptr= best_p;
  *best_p=(*best_p)->next;
  mysql_mutex_unlock(&LOCK_pool);

  /* The page lock is still held when this function returns */
  mysql_mutex_lock(&active->lock);
  if (active->free == active->size) // we've chosen an empty page
  {
    tc_log_cur_pages_used++;
    set_if_bigger(tc_log_max_pages_used, tc_log_cur_pages_used);
  }
}
9558 
9559 /**
9560   @todo
9561   perhaps, increase log size ?
9562 */
overflow()9563 int TC_LOG_MMAP::overflow()
9564 {
9565   /*
9566     simple overflow handling - just wait
9567     TODO perhaps, increase log size ?
9568     let's check the behaviour of tc_log_page_waits first
9569   */
9570   tc_log_page_waits++;
9571   mysql_cond_wait(&COND_pool, &LOCK_pool);
9572   return 1; // always return 1
9573 }
9574 
/**
  Record that transaction XID is committed on the persistent storage.

    This function is called in the middle of two-phase commit:
    First all resources prepare the transaction, then tc_log->log() is called,
    then all resources commit the transaction, then tc_log->unlog() is called.

    All access to the active page is serialized, but that is not a problem,
    as we assume that fsync() will be the main bottleneck.
    That is, by parallelizing writes to log pages we would decrease the
    number of threads waiting for a page, but then all these threads would
    be waiting for an fsync() anyway.

   If tc_log == MYSQL_LOG then tc_log writes transaction to binlog and
   records XID in a special Xid_log_event.
   If tc_log = TC_LOG_MMAP then xid is written in a special memory-mapped
   log.

  @param xid  transaction id to store in a free slot of the active page

  @retval
    0  - error
  @retval
    \# - otherwise, "cookie", a number that will be passed as an argument
    to unlog() call. tc_log can define it any way it wants,
    and use for whatever purposes. TC_LOG_MMAP sets it
    to the offset (from 'data') of the slot where the xid was stored.
*/

int TC_LOG_MMAP::log_one_transaction(my_xid xid)
{
  int err;
  PAGE *p;
  ulong cookie;

  mysql_mutex_lock(&LOCK_active);

  /*
    if the active page is full - just wait...
    frankly speaking, active->free here accessed outside of mutex
    protection, but it's safe, because it only means we may miss an
    unlog() for the active page, and we're not waiting for it here -
    unlog() does not signal COND_active.
  */
  while (unlikely(active && active->free == 0))
    mysql_cond_wait(&COND_active, &LOCK_active);

  /* no active page ? take one from the pool */
  if (active == 0)
    get_active_from_pool();             // returns with active->lock held
  else
    mysql_mutex_lock(&active->lock);

  p=active;

  /*
    p->free is always > 0 here because to decrease it one needs
    to take p->lock and before it one needs to take LOCK_active.
    But we checked that active->free > 0 under LOCK_active and
    have not released it ever since.
  */

  /* searching for an empty slot */
  while (*p->ptr)
  {
    p->ptr++;
    DBUG_ASSERT(p->ptr < p->end);               // because p->free > 0
  }

  /* found! store xid there and mark the page dirty */
  cookie= (ulong)((uchar *)p->ptr - data);      // can never be zero
  *p->ptr++= xid;
  p->free--;
  p->state= PS_DIRTY;
  mysql_mutex_unlock(&p->lock);

  mysql_mutex_lock(&LOCK_sync);
  if (syncing)
  {                                          // somebody's syncing. let's wait
    mysql_mutex_unlock(&LOCK_active);
    mysql_mutex_lock(&p->lock);
    p->waiters++;
    /*
      Wait until either our page has been synced by somebody else (its state
      is no longer PS_DIRTY) or the 'syncing' slot becomes vacant.
      p->lock is dropped around the wait; LOCK_sync is the mutex associated
      with p->cond here.
    */
    while (p->state == PS_DIRTY && syncing)
    {
      mysql_mutex_unlock(&p->lock);
      mysql_cond_wait(&p->cond, &LOCK_sync);
      mysql_mutex_lock(&p->lock);
    }
    p->waiters--;
    err= p->state == PS_ERROR;
    if (p->state != PS_DIRTY)                   // page was synced
    {
      mysql_mutex_unlock(&LOCK_sync);
      if (p->waiters == 0)
        mysql_cond_signal(&COND_pool);     // in case somebody's waiting
      mysql_mutex_unlock(&p->lock);
      goto done;                             // we're done
    }
    /* Page is still dirty and nobody is syncing: this thread syncs it. */
    DBUG_ASSERT(!syncing);
    mysql_mutex_unlock(&p->lock);
    syncing = p;
    mysql_mutex_unlock(&LOCK_sync);

    mysql_mutex_lock(&LOCK_active);
    active=0;                                  // page is not active anymore
    mysql_cond_broadcast(&COND_active);
    mysql_mutex_unlock(&LOCK_active);
  }
  else
  {
    syncing = p;                               // place is vacant - take it
    mysql_mutex_unlock(&LOCK_sync);
    active = 0;                                // page is not active anymore
    mysql_cond_broadcast(&COND_active);
    mysql_mutex_unlock(&LOCK_active);
  }
  err= sync();

done:
  /* A zero return means error; otherwise hand the slot offset back. */
  return err ? 0 : cookie;
}
9694 
/*
  Flush the page referenced by 'syncing' with my_msync(), append it to the
  tail of the pool list, then vacate the 'syncing' slot and wake up waiters.

  Returns the result of my_msync(); a non-zero result also marks the page
  as PS_ERROR instead of PS_POOL.
*/
int TC_LOG_MMAP::sync()
{
  int err;

  DBUG_ASSERT(syncing != active);

  /*
    sit down and relax - this can take a while...
    note - no locks are held at this point
  */
  err= my_msync(fd, syncing->start, syncing->size * sizeof(my_xid), MS_SYNC);

  /* page is synced. let's move it to the pool */
  mysql_mutex_lock(&LOCK_pool);
  (*pool_last_ptr)=syncing;                // link after the current tail
  pool_last_ptr=&(syncing->next);
  syncing->next=0;
  syncing->state= err ? PS_ERROR : PS_POOL;
  mysql_cond_signal(&COND_pool);           // in case somebody's waiting
  mysql_mutex_unlock(&LOCK_pool);

  /* marking 'syncing' slot free */
  mysql_mutex_lock(&LOCK_sync);
  mysql_cond_broadcast(&syncing->cond);    // signal "sync done"
  syncing=0;
  /*
    we check the "active" pointer without LOCK_active. Still, it's safe -
    "active" can change from NULL to not NULL any time, but it
    will take LOCK_sync before waiting on active->cond. That is, it can never
    miss a signal.
    And "active" can change to NULL only by the syncing thread
    (the thread that will send a signal below)
  */
  if (active)
    mysql_cond_signal(&active->cond);      // wake up a new syncer
  mysql_mutex_unlock(&LOCK_sync);
  return err;
}
9733 
9734 static void
mmap_do_checkpoint_callback(void * data)9735 mmap_do_checkpoint_callback(void *data)
9736 {
9737   TC_LOG_MMAP::pending_cookies *pending=
9738     static_cast<TC_LOG_MMAP::pending_cookies *>(data);
9739   ++pending->pending_count;
9740 }
9741 
unlog(ulong cookie,my_xid xid)9742 int TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
9743 {
9744   pending_cookies *full_buffer= NULL;
9745   uint32 ncookies= tc_log_page_size / sizeof(my_xid);
9746   DBUG_ASSERT(*(my_xid *)(data+cookie) == xid);
9747 
9748   /*
9749     Do not delete the entry immediately, as there may be participating storage
9750     engines which implement commit_checkpoint_request(), and thus have not yet
9751     flushed the commit durably to disk.
9752 
9753     Instead put it in a queue - and periodically, we will request a checkpoint
9754     from all engines and delete a whole batch at once.
9755   */
9756   mysql_mutex_lock(&LOCK_pending_checkpoint);
9757   if (pending_checkpoint == NULL)
9758   {
9759     uint32 size= sizeof(*pending_checkpoint) + sizeof(ulong) * (ncookies - 1);
9760     if (!(pending_checkpoint=
9761           (pending_cookies *)my_malloc(PSI_INSTRUMENT_ME, size,
9762                                        MYF(MY_ZEROFILL))))
9763     {
9764       my_error(ER_OUTOFMEMORY, MYF(0), size);
9765       mysql_mutex_unlock(&LOCK_pending_checkpoint);
9766       return 1;
9767     }
9768   }
9769 
9770   pending_checkpoint->cookies[pending_checkpoint->count++]= cookie;
9771   if (pending_checkpoint->count == ncookies)
9772   {
9773     full_buffer= pending_checkpoint;
9774     pending_checkpoint= NULL;
9775   }
9776   mysql_mutex_unlock(&LOCK_pending_checkpoint);
9777 
9778   if (full_buffer)
9779   {
9780     /*
9781       We do an extra increment and notify here - this ensures that
9782       things work also if there are no engines at all that support
9783       commit_checkpoint_request.
9784     */
9785     ++full_buffer->pending_count;
9786     ha_commit_checkpoint_request(full_buffer, mmap_do_checkpoint_callback);
9787     commit_checkpoint_notify(full_buffer);
9788   }
9789   return 0;
9790 }
9791 
9792 
9793 void
commit_checkpoint_notify(void * cookie)9794 TC_LOG_MMAP::commit_checkpoint_notify(void *cookie)
9795 {
9796   uint count;
9797   pending_cookies *pending= static_cast<pending_cookies *>(cookie);
9798   mysql_mutex_lock(&LOCK_pending_checkpoint);
9799   DBUG_ASSERT(pending->pending_count > 0);
9800   count= --pending->pending_count;
9801   mysql_mutex_unlock(&LOCK_pending_checkpoint);
9802   if (count == 0)
9803   {
9804     uint i;
9805     for (i= 0; i < tc_log_page_size / sizeof(my_xid); ++i)
9806       delete_entry(pending->cookies[i]);
9807     my_free(pending);
9808   }
9809 }
9810 
9811 
9812 /**
9813   erase xid from the page, update page free space counters/pointers.
9814   cookie points directly to the memory where xid was logged.
9815 */
9816 
delete_entry(ulong cookie)9817 int TC_LOG_MMAP::delete_entry(ulong cookie)
9818 {
9819   PAGE *p=pages+(cookie/tc_log_page_size);
9820   my_xid *x=(my_xid *)(data+cookie);
9821 
9822   DBUG_ASSERT(x >= p->start);
9823   DBUG_ASSERT(x < p->end);
9824 
9825   mysql_mutex_lock(&p->lock);
9826   *x=0;
9827   p->free++;
9828   DBUG_ASSERT(p->free <= p->size);
9829   set_if_smaller(p->ptr, x);
9830   if (p->free == p->size)              // the page is completely empty
9831     statistic_decrement(tc_log_cur_pages_used, &LOCK_status);
9832   if (p->waiters == 0)                 // the page is in pool and ready to rock
9833     mysql_cond_signal(&COND_pool);     // ping ... for overflow()
9834   mysql_mutex_unlock(&p->lock);
9835   return 0;
9836 }
9837 
close()9838 void TC_LOG_MMAP::close()
9839 {
9840   uint i;
9841   switch (inited) {
9842   case 6:
9843     mysql_mutex_destroy(&LOCK_sync);
9844     mysql_mutex_destroy(&LOCK_active);
9845     mysql_mutex_destroy(&LOCK_pool);
9846     mysql_mutex_destroy(&LOCK_pending_checkpoint);
9847     mysql_cond_destroy(&COND_pool);
9848     mysql_cond_destroy(&COND_active);
9849     mysql_cond_destroy(&COND_queue_busy);
9850     /* fall through */
9851   case 5:
9852     data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
9853     /* fall through */
9854   case 4:
9855     for (i=0; i < npages; i++)
9856     {
9857       if (pages[i].ptr == 0)
9858         break;
9859       mysql_mutex_destroy(&pages[i].lock);
9860       mysql_cond_destroy(&pages[i].cond);
9861     }
9862     /* fall through */
9863   case 3:
9864     my_free(pages);
9865     /* fall through */
9866   case 2:
9867     my_munmap((char*)data, (size_t)file_length);
9868     /* fall through */
9869   case 1:
9870     mysql_file_close(fd, MYF(0));
9871   }
9872   if (inited>=5) // cannot do in the switch because of Windows
9873     mysql_file_delete(key_file_tclog, logname, MYF(MY_WME));
9874   if (pending_checkpoint)
9875     my_free(pending_checkpoint);
9876   inited=0;
9877 }
9878 
9879 
recover()9880 int TC_LOG_MMAP::recover()
9881 {
9882   HASH xids;
9883   PAGE *p=pages, *end_p=pages+npages;
9884 
9885   if (bcmp(data, tc_log_magic, sizeof(tc_log_magic)))
9886   {
9887     sql_print_error("Bad magic header in tc log");
9888     goto err1;
9889   }
9890 
9891   /*
9892     the first byte after magic signature is set to current
9893     number of storage engines on startup
9894   */
9895   if (data[sizeof(tc_log_magic)] > total_ha_2pc)
9896   {
9897     sql_print_error("Recovery failed! You must enable "
9898                     "all engines that were enabled at the moment of the crash");
9899     goto err1;
9900   }
9901 
9902   if (my_hash_init(PSI_INSTRUMENT_ME, &xids, &my_charset_bin,
9903                    tc_log_page_size/3, 0, sizeof(my_xid), 0, 0, MYF(0)))
9904     goto err1;
9905 
9906   for ( ; p < end_p ; p++)
9907   {
9908     for (my_xid *x=p->start; x < p->end; x++)
9909       if (*x && my_hash_insert(&xids, (uchar *)x))
9910         goto err2; // OOM
9911   }
9912 
9913   if (ha_recover(&xids))
9914     goto err2;
9915 
9916   my_hash_free(&xids);
9917   bzero(data, (size_t)file_length);
9918   return 0;
9919 
9920 err2:
9921   my_hash_free(&xids);
9922 err1:
9923   sql_print_error("Crash recovery failed. Either correct the problem "
9924                   "(if it's, for example, out of memory error) and restart, "
9925                   "or delete tc log and start mysqld with "
9926                   "--tc-heuristic-recover={commit|rollback}");
9927   return 1;
9928 }
9929 #endif
9930 
/*
  Global transaction coordinator log objects: a pointer to a TC_LOG
  (presumably set to the implementation chosen at startup - confirm against
  the code that assigns it) and the instances of the dummy and
  memory-mapped implementations.
*/
TC_LOG *tc_log;
TC_LOG_DUMMY tc_log_dummy;
TC_LOG_MMAP  tc_log_mmap;
9934 
9935 /**
9936   Perform heuristic recovery, if --tc-heuristic-recover was used.
9937 
9938   @note
9939     no matter whether heuristic recovery was successful or not
9940     mysqld must exit. So, return value is the same in both cases.
9941 
9942   @retval
9943     0	no heuristic recovery was requested
9944   @retval
9945     1   heuristic recovery was performed
9946 */
9947 
using_heuristic_recover()9948 int TC_LOG::using_heuristic_recover()
9949 {
9950   if (!tc_heuristic_recover)
9951     return 0;
9952 
9953   sql_print_information("Heuristic crash recovery mode");
9954   if (ha_recover(0))
9955     sql_print_error("Heuristic crash recovery failed");
9956   sql_print_information("Please restart mysqld without --tc-heuristic-recover");
9957   return 1;
9958 }
9959 
/****** transaction coordinator log for 2pc - binlog() based solution ******/
/*
  TC_LOG_BINLOG is just another spelling of MYSQL_BIN_LOG: the
  TC_LOG_BINLOG:: methods defined below are members of MYSQL_BIN_LOG.
*/
#define TC_LOG_BINLOG MYSQL_BIN_LOG
9962 
open(const char * opt_name)9963 int TC_LOG_BINLOG::open(const char *opt_name)
9964 {
9965   int      error= 1;
9966 
9967   DBUG_ASSERT(total_ha_2pc > 1);
9968   DBUG_ASSERT(opt_name);
9969   DBUG_ASSERT(opt_name[0]);
9970 
9971   if (!my_b_inited(&index_file))
9972   {
9973     /* There was a failure to open the index file, can't open the binlog */
9974     cleanup();
9975     return 1;
9976   }
9977 
9978   if (using_heuristic_recover())
9979   {
9980     mysql_mutex_lock(&LOCK_log);
9981     /* generate a new binlog to mask a corrupted one */
9982     open(opt_name, 0, 0, WRITE_CACHE, max_binlog_size, 0, TRUE);
9983     mysql_mutex_unlock(&LOCK_log);
9984     cleanup();
9985     return 1;
9986   }
9987 
9988   error= do_binlog_recovery(opt_name, true);
9989   binlog_state_recover_done= true;
9990   return error;
9991 }
9992 
/**
  This is called on shutdown, after ha_panic.

  Intentionally empty: this method does nothing for the binlog-based
  transaction coordinator.
*/
void TC_LOG_BINLOG::close()
{
}
9997 
9998 /*
9999   Do a binlog log_xid() for a group of transactions, linked through
10000   thd->next_commit_ordered.
10001 */
10002 int
log_and_order(THD * thd,my_xid xid,bool all,bool need_prepare_ordered,bool need_commit_ordered)10003 TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
10004                              bool need_prepare_ordered __attribute__((unused)),
10005                              bool need_commit_ordered __attribute__((unused)))
10006 {
10007   int err;
10008   DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
10009 
10010   binlog_cache_mngr *cache_mngr= thd->binlog_setup_trx_data();
10011   if (!cache_mngr)
10012   {
10013     WSREP_DEBUG("Skipping empty log_xid: %s", thd->query());
10014     DBUG_RETURN(0);
10015   }
10016 
10017   cache_mngr->using_xa= TRUE;
10018   cache_mngr->xa_xid= xid;
10019   err= binlog_commit_flush_xid_caches(thd, cache_mngr, all, xid);
10020 
10021   DEBUG_SYNC(thd, "binlog_after_log_and_order");
10022 
10023   if (err)
10024     DBUG_RETURN(0);
10025 
10026   bool need_unlog= cache_mngr->need_unlog;
10027   /*
10028     The transaction won't need the flag anymore.
10029     Todo/fixme: consider to move the statement into cache_mngr->reset()
10030                 relocated to the current or later point.
10031   */
10032   cache_mngr->need_unlog= false;
10033   /*
10034     If using explicit user XA, we will not have XID. We must still return a
10035     non-zero cookie (as zero cookie signals error).
10036   */
10037   if (!xid || !need_unlog)
10038     DBUG_RETURN(BINLOG_COOKIE_DUMMY(cache_mngr->delayed_error));
10039 
10040   DBUG_RETURN(BINLOG_COOKIE_MAKE(cache_mngr->binlog_id,
10041                                  cache_mngr->delayed_error));
10042 }
10043 
10044 /*
10045   After an XID is logged, we need to hold on to the current binlog file until
10046   it is fully committed in the storage engine. The reason is that crash
10047   recovery only looks at the latest binlog, so we must make sure there are no
10048   outstanding prepared (but not committed) transactions before rotating the
10049   binlog.
10050 
10051   To handle this, we keep a count of outstanding XIDs. This function is used
10052   to increase this count when committing one or more transactions to the
10053   binary log.
10054 */
10055 void
mark_xids_active(ulong binlog_id,uint xid_count)10056 TC_LOG_BINLOG::mark_xids_active(ulong binlog_id, uint xid_count)
10057 {
10058   xid_count_per_binlog *b;
10059 
10060   DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
10061   DBUG_PRINT("info", ("binlog_id=%lu xid_count=%u", binlog_id, xid_count));
10062 
10063   mysql_mutex_lock(&LOCK_xid_list);
10064   I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
10065   while ((b= it++))
10066   {
10067     if (b->binlog_id == binlog_id)
10068     {
10069       b->xid_count += xid_count;
10070       break;
10071     }
10072   }
10073   /*
10074     As we do not delete elements until count reach zero, elements should always
10075     be found.
10076   */
10077   DBUG_ASSERT(b);
10078   mysql_mutex_unlock(&LOCK_xid_list);
10079   DBUG_VOID_RETURN;
10080 }
10081 
/*
  Once an XID is committed, it can no longer be needed during crash recovery,
  as it has been durably recorded on disk as "committed".

  This function is called to mark an XID this way. It needs to decrease the
  count of pending XIDs in the corresponding binlog. When the count reaches
  zero (for an "old" binlog that is not the active one), that binlog file no
  longer needs to be scanned during crash recovery, so we can log a new binlog
  checkpoint.

  binlog_id         identifies the binlog whose pending-XID count is
                    decremented
  write_checkpoint  when false, no binlog checkpoint event is written even
                    if a new checkpoint would otherwise be reached
*/
void
TC_LOG_BINLOG::mark_xid_done(ulong binlog_id, bool write_checkpoint)
{
  xid_count_per_binlog *b;
  bool first;
  ulong current;

  DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");

  mysql_mutex_lock(&LOCK_xid_list);
  current= current_binlog_id;
  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
  /* 'first' stays true only if the matching entry is the head of the list. */
  first= true;
  while ((b= it++))
  {
    if (b->binlog_id == binlog_id)
    {
      --b->xid_count;

      DBUG_ASSERT(b->xid_count >= 0); // catch unmatched (++) decrement

      break;
    }
    first= false;
  }
  /* Binlog is always found, as we do not remove until count reaches 0 */
  DBUG_ASSERT(b);
  /*
    If a RESET MASTER is pending, we are about to remove all log files, and
    the RESET MASTER thread is waiting for all pending unlog() calls to
    complete while holding LOCK_log. In this case we should not log a binlog
    checkpoint event (it would be deleted immediately anyway and we would
    deadlock on LOCK_log) but just signal the thread.
  */
  if (unlikely(reset_master_pending))
  {
    mysql_cond_broadcast(&COND_xid_list);
    mysql_mutex_unlock(&LOCK_xid_list);
    DBUG_VOID_RETURN;
  }

  /*
    A checkpoint is only written when the entry belongs to an old binlog,
    its count just dropped to zero, it is the first entry in the list, and
    the caller asked for it.
  */
  if (likely(binlog_id == current) || b->xid_count != 0 || !first ||
      !write_checkpoint)
  {
    /* No new binlog checkpoint reached yet. */
    mysql_mutex_unlock(&LOCK_xid_list);
    DBUG_VOID_RETURN;
  }

  /*
    Now log a binlog checkpoint for the first binlog file with a non-zero count.

    Note that it is possible (though perhaps unlikely) that when count of
    binlog (N-2) drops to zero, binlog (N-1) is already at zero. So we may
    need to skip several entries before we find the one to log in the binlog
    checkpoint event.

    We chain the locking of LOCK_xid_list and LOCK_log, so that we ensure that
    Binlog_checkpoint_events are logged in order. This simplifies recovery a
    bit, as it can just take the last binlog checkpoint in the log, rather
    than compare all found against each other to find the one pointing to the
    most recent binlog.

    Note also that we need to first release LOCK_xid_list, then acquire
    LOCK_log, then re-acquire LOCK_xid_list. If we were to take LOCK_log while
    holding LOCK_xid_list, we might deadlock with other threads that take the
    locks in the opposite order.
  */

  ++mark_xid_done_waiting;
  mysql_mutex_unlock(&LOCK_xid_list);
  mysql_mutex_lock(&LOCK_log);
  mysql_mutex_lock(&LOCK_xid_list);
  --mark_xid_done_waiting;
  mysql_cond_broadcast(&COND_xid_list);
  /* We need to reload current_binlog_id due to release/re-take of lock. */
  current= current_binlog_id;

  for (;;)
  {
    /* Remove initial element(s) with zero count. */
    b= binlog_xid_count_list.head();
    /*
      We must not remove all elements in the list - the entry for the current
      binlog must be present always.
    */
    DBUG_ASSERT(b);
    if (b->binlog_id == current || b->xid_count > 0)
      break;
    WSREP_XID_LIST_ENTRY("TC_LOG_BINLOG::mark_xid_done(): Removing "
                         "xid_list_entry for %s (%lu)", b);
    delete binlog_xid_count_list.get();
  }

  /*
    'b' is now the first remaining entry. LOCK_log is still held while the
    checkpoint event is written; LOCK_xid_list is released first.
  */
  mysql_mutex_unlock(&LOCK_xid_list);
  write_binlog_checkpoint_event_already_locked(b->binlog_name,
                                               b->binlog_name_len);
  mysql_mutex_unlock(&LOCK_log);
  DBUG_VOID_RETURN;
}
10192 
unlog(ulong cookie,my_xid xid)10193 int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
10194 {
10195   DBUG_ENTER("TC_LOG_BINLOG::unlog");
10196   if (!xid)
10197     DBUG_RETURN(0);
10198 
10199   if (!BINLOG_COOKIE_IS_DUMMY(cookie))
10200     mark_xid_done(BINLOG_COOKIE_GET_ID(cookie), true);
10201   /*
10202     See comment in trx_group_commit_leader() - if rotate() gave a failure,
10203     we delay the return of error code to here.
10204   */
10205   DBUG_RETURN(BINLOG_COOKIE_GET_ERROR_FLAG(cookie));
10206 }
10207 
/*
  Thin wrapper: forwards to binlog_commit_flush_xa_prepare() with its second
  argument fixed to true and returns that call's result converted to bool.
  Used by TC_LOG_BINLOG::unlog_xa_prepare() to log an empty XA-prepare
  event group.
*/
static bool write_empty_xa_prepare(THD *thd, binlog_cache_mngr *cache_mngr)
{
  return binlog_commit_flush_xa_prepare(thd, true, cache_mngr);
}
10212 
unlog_xa_prepare(THD * thd,bool all)10213 int TC_LOG_BINLOG::unlog_xa_prepare(THD *thd, bool all)
10214 {
10215   DBUG_ASSERT(is_preparing_xa(thd));
10216 
10217   binlog_cache_mngr *cache_mngr= thd->binlog_setup_trx_data();
10218   int cookie= 0;
10219 
10220   if (!cache_mngr->need_unlog)
10221   {
10222     Ha_trx_info *ha_info;
10223     uint rw_count= ha_count_rw_all(thd, &ha_info);
10224     bool rc= false;
10225 
10226     /*
10227       This transaction has not been binlogged as indicated by need_unlog.
10228       Such exceptional cases include transactions with no effect to engines,
10229       e.g REPLACE that does not change the dat but still the Engine
10230       transaction branch claims to be rw, and few more.
10231       In all such cases an empty XA-prepare group of events is bin-logged.
10232     */
10233     if (rw_count > 0)
10234     {
10235       /* an empty XA-prepare event group is logged */
10236       rc= write_empty_xa_prepare(thd, cache_mngr); // normally gains need_unlog
10237       trans_register_ha(thd, true, binlog_hton, 0); // do it for future commmit
10238     }
10239     if (rw_count == 0 || !cache_mngr->need_unlog)
10240       return rc;
10241   }
10242 
10243   cookie= BINLOG_COOKIE_MAKE(cache_mngr->binlog_id, cache_mngr->delayed_error);
10244   cache_mngr->need_unlog= false;
10245 
10246   return unlog(cookie, 1);
10247 }
10248 
10249 
10250 void
commit_checkpoint_notify(void * cookie)10251 TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
10252 {
10253   xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
10254   bool found_entry= false;
10255   mysql_mutex_lock(&LOCK_binlog_background_thread);
10256   /* count the same notification kind from different engines */
10257   for (xid_count_per_binlog *link= binlog_background_thread_queue;
10258        link && !found_entry; link= link->next_in_queue)
10259   {
10260     if ((found_entry= (entry == link)))
10261       entry->notify_count++;
10262   }
10263   if (!found_entry)
10264   {
10265     entry->next_in_queue= binlog_background_thread_queue;
10266     binlog_background_thread_queue= entry;
10267   }
10268   mysql_cond_signal(&COND_binlog_background_thread);
10269   mysql_mutex_unlock(&LOCK_binlog_background_thread);
10270 }
10271 
/*
  Binlog background thread.

  This thread is used to log binlog checkpoints in the background, rather than
  in the context of random storage engine threads that happen to call
  commit_checkpoint_notify_ha() and may not like the delays while syncing
  binlog to disk or may not be set up with all my_thread_init() and other
  necessary stuff.

  In the future, this thread could also be used to do log rotation in the
  background, which could eliminate all stalls around binlog rotations.

  The thread creates its own THD, loads the slave GTID state (when
  replication is compiled in), announces that it has started, and then
  loops: wait for queued entries or a stop request, detach the queue, and
  call mark_xid_done() for each queued entry.
*/
pthread_handler_t
binlog_background_thread(void *arg __attribute__((unused)))
{
  bool stop;
  MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
  THD *thd;
  my_thread_init();
  DBUG_ENTER("binlog_background_thread");

  thd= new THD(next_thread_id());
  thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND;
  thd->thread_stack= (char*) &thd;           /* Set approximate stack start */
  thd->store_globals();
  thd->security_ctx->skip_grants();
  thd->set_command(COM_DAEMON);

  /*
    Load the slave replication GTID state from the mysql.gtid_slave_pos
    table.

    This is mostly so that we can start our seq_no counter from the highest
    seq_no seen by a slave. This way, we have a way to tell if a transaction
    logged by ourselves as master is newer or older than a replicated
    transaction.
  */
#ifdef HAVE_REPLICATION
  if (rpl_load_gtid_slave_state(thd))
    sql_print_warning("Failed to load slave replication state from table "
                      "%s.%s: %u: %s", "mysql",
                      rpl_gtid_slave_state_table_name.str,
                      thd->get_stmt_da()->sql_errno(),
                      thd->get_stmt_da()->message());
#endif

  /* Tell start_binlog_background_thread() that startup is complete. */
  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
  binlog_background_thread_started= true;
  mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);

  for (;;)
  {
    /*
      Wait until there is something in the queue to process, or we are asked
      to shut down.
    */
    THD_STAGE_INFO(thd, stage_binlog_waiting_background_tasks);
    mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
    for (;;)
    {
      stop= binlog_background_thread_stop;
      queue= binlog_background_thread_queue;
      if (stop && !mysql_bin_log.is_xidlist_idle())
      {
        /*
          Delay stop until all pending binlog checkpoints have been processed.
        */
        stop= false;
      }
      if (stop || queue)
        break;
      mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread,
                      &mysql_bin_log.LOCK_binlog_background_thread);
    }
    /* Grab the queue, if any. */
    binlog_background_thread_queue= NULL;
    mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);

    /* Process any incoming commit_checkpoint_notify() calls. */
    DBUG_EXECUTE_IF("inject_binlog_background_thread_before_mark_xid_done",
      DBUG_ASSERT(!debug_sync_set_action(
        thd,
        STRING_WITH_LEN("binlog_background_thread_before_mark_xid_done "
                        "SIGNAL injected_binlog_background_thread "
                        "WAIT_FOR something_that_will_never_happen "
                        "TIMEOUT 2")));
      );
    while (queue)
    {
      long count= queue->notify_count;
      THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify);
      DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done");
      /* Set the thread start time */
      thd->set_time();
      /* Grab next pointer first, as mark_xid_done() may free the element. */
      next= queue->next_in_queue;
      queue->notify_count= 0;
      /*
        Note the "<=": one call for the notification that queued the entry,
        plus one per additional notification counted in notify_count.
      */
      for (long i= 0; i <= count; i++)
        mysql_bin_log.mark_xid_done(queue->binlog_id, true);
      queue= next;

      DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
        DBUG_ASSERT(!debug_sync_set_action(
          thd,
          STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
        );
    }

    if (stop)
      break;
  }

  THD_STAGE_INFO(thd, stage_binlog_stopping_background_thread);

  /* No need to use mutex as thd is not linked into other threads */
  delete thd;

  my_thread_end();

  /* Signal that we are (almost) stopped. */
  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
  binlog_background_thread_stop= false;
  mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);

  DBUG_RETURN(0);
}
10400 
#ifdef HAVE_PSI_INTERFACE
/* Instrumentation key passed to mysql_thread_create() for the binlog
   background thread. */
static PSI_thread_key key_thread_binlog;

/* Thread instrumentation table registered under the "sql" category by
   start_binlog_background_thread(). */
static PSI_thread_info all_binlog_threads[]=
{
  { &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL},
};
#endif /* HAVE_PSI_INTERFACE */
10409 
10410 static bool
start_binlog_background_thread()10411 start_binlog_background_thread()
10412 {
10413   pthread_t th;
10414 
10415 #ifdef HAVE_PSI_INTERFACE
10416   if (PSI_server)
10417     PSI_server->register_thread("sql", all_binlog_threads,
10418                                 array_elements(all_binlog_threads));
10419 #endif
10420 
10421   if (mysql_thread_create(key_thread_binlog, &th, &connection_attrib,
10422                           binlog_background_thread, NULL))
10423     return 1;
10424 
10425   /*
10426     Wait for the thread to have started (so we know that the slave replication
10427     state is loaded and we have correct global_gtid_counter).
10428   */
10429   mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
10430   while (!binlog_background_thread_started)
10431     mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread_end,
10432                     &mysql_bin_log.LOCK_binlog_background_thread);
10433   mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
10434 
10435   return 0;
10436 }
10437 
10438 
/**
  Scan binlog files to rebuild the GTID binlog state and, when requested,
  to collect the XIDs needed to recover prepared transactions.

  The first round reads from first_log (the latest binlog file). When
  do_xa is set and a binlog checkpoint event was found, the scan continues
  with the binlog files from the checkpoint file up to (but not including)
  last_log_name, and ha_recover() is finally called with the collected XIDs.

  @param linfo          Binlog index cursor, positioned with find_log_pos()
                        and advanced with find_next_log() while scanning
                        older files.
  @param last_log_name  Name of the latest binlog file (the one first_log
                        reads from).
  @param first_log      Open IO_CACHE used for the first scan round.
  @param fdle           Format description event used to decode the events
                        of every scanned file; its BINLOG_IN_USE flag is
                        cleared here.
  @param do_xa          true to collect XIDs and run ha_recover(); false to
                        only scan the first file.

  @retval 0  Success.
  @retval 1  Failure; a "Crash recovery failed" error is logged.
*/
int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
                           IO_CACHE *first_log,
                           Format_description_log_event *fdle, bool do_xa)
{
  Log_event *ev= NULL;
  HASH xids;
  MEM_ROOT mem_root;
  char binlog_checkpoint_name[FN_REFLEN];
  bool binlog_checkpoint_found;
  bool first_round;
  IO_CACHE log;
  File file= -1;
  const char *errmsg;
#ifdef HAVE_REPLICATION
  rpl_gtid last_gtid;
  bool last_gtid_standalone= false;
  bool last_gtid_valid= false;
#endif

  /*
    The XID hash and the memory root are only set up (and only torn down)
    when do_xa is set. Jumping to err1 skips the cleanup in err2, which is
    correct here because nothing has been allocated yet.
  */
  if (! fdle->is_valid() ||
      (do_xa && my_hash_init(key_memory_binlog_recover_exec, &xids, &my_charset_bin, TC_LOG_PAGE_SIZE/3, 0,
                             sizeof(my_xid), 0, 0, MYF(0))))
    goto err1;

  if (do_xa)
    init_alloc_root(key_memory_binlog_recover_exec, &mem_root,
                    TC_LOG_PAGE_SIZE, TC_LOG_PAGE_SIZE, MYF(0));

  fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error

  /*
    Scan the binlog for XIDs that need to be committed if still in the
    prepared stage.

    Start with the latest binlog file, then continue with any other binlog
    files if the last found binlog checkpoint indicates it is needed.
  */

  binlog_checkpoint_found= false;
  first_round= true;
  for (;;)
  {
    /* Read events until end of file or the first unreadable/invalid event. */
    while ((ev= Log_event::read_log_event(first_round ? first_log : &log,
                                          fdle, opt_master_verify_checksum))
           && ev->is_valid())
    {
      enum Log_event_type typ= ev->get_type_code();
      switch (typ)
      {
      case XID_EVENT:
      {
        if (do_xa)
        {
          /* Copy the XID into mem_root so it outlives the event object. */
          Xid_log_event *xev=(Xid_log_event *)ev;
          uchar *x= (uchar *) memdup_root(&mem_root, (uchar*) &xev->xid,
                                          sizeof(xev->xid));
          if (!x || my_hash_insert(&xids, x))
            goto err2;
        }
        break;
      }
      case BINLOG_CHECKPOINT_EVENT:
        /*
          Only checkpoints in the latest file matter; a later checkpoint
          event overwrites an earlier one, so the last one found wins.
        */
        if (first_round && do_xa)
        {
          size_t dir_len;
          Binlog_checkpoint_log_event *cev= (Binlog_checkpoint_log_event *)ev;
          if (cev->binlog_file_len >= FN_REFLEN)
            sql_print_warning("Incorrect binlog checkpoint event with too "
                              "long file name found.");
          else
          {
            /*
              Note that we cannot use make_log_name() here, as we have not yet
              initialised MYSQL_BIN_LOG::log_file_name.
            */
            /* Directory part of last_log_name + file name from the event. */
            dir_len= dirname_length(last_log_name);
            strmake(strnmov(binlog_checkpoint_name, last_log_name, dir_len),
                    cev->binlog_file_name, FN_REFLEN - 1 - dir_len);
            binlog_checkpoint_found= true;
          }
        }
        break;
      case GTID_LIST_EVENT:
        if (first_round)
        {
          Gtid_list_log_event *glev= (Gtid_list_log_event *)ev;

          /* Initialise the binlog state from the Gtid_list event. */
          if (rpl_global_gtid_binlog_state.load(glev->list, glev->count))
            goto err2;
        }
        break;

#ifdef HAVE_REPLICATION
      case GTID_EVENT:
        if (first_round)
        {
          Gtid_log_event *gev= (Gtid_log_event *)ev;

          /* Update the binlog state with any GTID logged after Gtid_list. */
          /*
            The GTID is only remembered here; it is applied to the binlog
            state further down, once an event ending its group is seen.
          */
          last_gtid.domain_id= gev->domain_id;
          last_gtid.server_id= gev->server_id;
          last_gtid.seq_no= gev->seq_no;
          last_gtid_standalone=
            ((gev->flags2 & Gtid_log_event::FL_STANDALONE) ? true : false);
          last_gtid_valid= true;
        }
        break;
#endif

      case START_ENCRYPTION_EVENT:
        {
          if (fdle->start_decryption((Start_encryption_log_event*) ev))
            goto err2;
        }
        break;

      default:
        /* Nothing. */
        break;
      }

#ifdef HAVE_REPLICATION
      /*
        Apply the pending GTID when its event group is complete: for a
        standalone GTID at the first event that is not part of a group, for
        a transactional one at an XID, XA PREPARE, or a COMMIT/ROLLBACK query
        event. A GTID whose group never completes is not applied.
      */
      if (last_gtid_valid &&
          ((last_gtid_standalone && !ev->is_part_of_group(typ)) ||
           (!last_gtid_standalone &&
            (typ == XID_EVENT ||
             typ == XA_PREPARE_LOG_EVENT ||
             (LOG_EVENT_IS_QUERY(typ) &&
              (((Query_log_event *)ev)->is_commit() ||
               ((Query_log_event *)ev)->is_rollback()))))))
      {
        if (rpl_global_gtid_binlog_state.update_nolock(&last_gtid, false))
          goto err2;
        last_gtid_valid= false;
      }
#endif

      delete ev;
      ev= NULL;
    }

    /* Without XA recovery only the first (latest) file is scanned. */
    if (!do_xa)
      break;
    /*
      If the last binlog checkpoint event points to an older log, we have to
      scan all logs from there also, to get all possible XIDs to recover.

      If there was no binlog checkpoint event at all, this means the log was
      written by an older version of MariaDB (or MySQL) - these always have an
      (implicit) binlog checkpoint event at the start of the last binlog file.
    */
    if (first_round)
    {
      if (!binlog_checkpoint_found)
        break;
      first_round= false;
      DBUG_EXECUTE_IF("xa_recover_expect_master_bin_000004",
          if (0 != strcmp("./master-bin.000004", binlog_checkpoint_name) &&
              0 != strcmp(".\\master-bin.000004", binlog_checkpoint_name))
            DBUG_SUICIDE();
        );
      /* Position the index cursor on the checkpoint file. */
      if (find_log_pos(linfo, binlog_checkpoint_name, 1))
      {
        sql_print_error("Binlog file '%s' not found in binlog index, needed "
                        "for recovery. Aborting.", binlog_checkpoint_name);
        goto err2;
      }
    }
    else
    {
      /* Done with the older file opened in the previous iteration. */
      end_io_cache(&log);
      mysql_file_close(file, MYF(MY_WME));
      file= -1;
    }

    /* The latest file was already scanned in the first round. */
    if (!strcmp(linfo->log_file_name, last_log_name))
      break;                                    // No more files to do
    if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0)
    {
      sql_print_error("%s", errmsg);
      goto err2;
    }
    /*
      We do not need to read the Format_description_log_event of other binlog
      files. It is not possible for a binlog checkpoint to span multiple
      binlog files written by different versions of the server. So we can use
      the first one read for reading from all binlog files.
    */
    /* Advance the cursor now so the next iteration sees the following file. */
    if (find_next_log(linfo, 1))
    {
      sql_print_error("Error reading binlog files during recovery. Aborting.");
      goto err2;
    }
    /*
      NOTE(review): presumably drops the decryption state of the previous
      file so that this file's own START_ENCRYPTION event sets it up again -
      confirm against Format_description_log_event::reset_crypto().
    */
    fdle->reset_crypto();
  }

  if (do_xa)
  {
    if (ha_recover(&xids))
      goto err2;

    free_root(&mem_root, MYF(0));
    my_hash_free(&xids);
  }
  return 0;

err2:
  /* ev may still hold the event being processed when the error occurred. */
  delete ev;
  /* file is >= 0 only while an older binlog file is open. */
  if (file >= 0)
  {
    end_io_cache(&log);
    mysql_file_close(file, MYF(MY_WME));
  }
  if (do_xa)
  {
    free_root(&mem_root, MYF(0));
    my_hash_free(&xids);
  }
err1:
  sql_print_error("Crash recovery failed. Either correct the problem "
                  "(if it's, for example, out of memory error) and restart, "
                  "or delete (or rename) binary log and start mysqld with "
                  "--tc-heuristic-recover={commit|rollback}");
  return 1;
}
10665 
10666 
10667 int
do_binlog_recovery(const char * opt_name,bool do_xa_recovery)10668 MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
10669 {
10670   LOG_INFO log_info;
10671   const char *errmsg;
10672   IO_CACHE    log;
10673   File        file;
10674   Log_event  *ev= 0;
10675   Format_description_log_event fdle(BINLOG_VERSION);
10676   char        log_name[FN_REFLEN];
10677   int error;
10678 
10679   if (unlikely((error= find_log_pos(&log_info, NullS, 1))))
10680   {
10681     /*
10682       If there are no binlog files (LOG_INFO_EOF), then we still try to read
10683       the .state file to restore the binlog state. This allows to copy a server
10684       to provision a new one without copying the binlog files (except the
10685       master-bin.state file) and still preserve the correct binlog state.
10686     */
10687     if (error != LOG_INFO_EOF)
10688       sql_print_error("find_log_pos() failed (error: %d)", error);
10689     else
10690     {
10691       error= read_state_from_file();
10692       if (error == 2)
10693       {
10694         /*
10695           No binlog files and no binlog state is not an error (eg. just initial
10696           server start after fresh installation).
10697         */
10698         error= 0;
10699       }
10700     }
10701     return error;
10702   }
10703 
10704   if (! fdle.is_valid())
10705     return 1;
10706 
10707   do
10708   {
10709     strmake_buf(log_name, log_info.log_file_name);
10710   } while (!(error= find_next_log(&log_info, 1)));
10711 
10712   if (error !=  LOG_INFO_EOF)
10713   {
10714     sql_print_error("find_log_pos() failed (error: %d)", error);
10715     return error;
10716   }
10717 
10718   if ((file= open_binlog(&log, log_name, &errmsg)) < 0)
10719   {
10720     sql_print_error("%s", errmsg);
10721     return 1;
10722   }
10723 
10724   if ((ev= Log_event::read_log_event(&log, &fdle,
10725                                      opt_master_verify_checksum)) &&
10726       ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
10727   {
10728     if (ev->flags & LOG_EVENT_BINLOG_IN_USE_F)
10729     {
10730       sql_print_information("Recovering after a crash using %s", opt_name);
10731       error= recover(&log_info, log_name, &log,
10732                      (Format_description_log_event *)ev, do_xa_recovery);
10733     }
10734     else
10735     {
10736       error= read_state_from_file();
10737       if (unlikely(error == 2))
10738       {
10739         /*
10740           The binlog exists, but the .state file is missing. This is normal if
10741           this is the first master start after a major upgrade to 10.0 (with
10742           GTID support).
10743 
10744           However, it could also be that the .state file was lost somehow, and
10745           in this case it could be a serious issue, as we would set the wrong
10746           binlog state in the next binlog file to be created, and GTID
10747           processing would be corrupted. A common way would be copying files
10748           from an old server to a new one and forgetting the .state file.
10749 
10750           So in this case, we want to try to recover the binlog state by
10751           scanning the last binlog file (but we do not need any XA recovery).
10752 
10753           ToDo: We could avoid one scan at first start after major upgrade, by
10754           detecting that there is no GTID_LIST event at the start of the
10755           binlog file, and stopping the scan in that case.
10756         */
10757         error= recover(&log_info, log_name, &log,
10758                        (Format_description_log_event *)ev, false);
10759       }
10760     }
10761   }
10762 
10763   delete ev;
10764   end_io_cache(&log);
10765   mysql_file_close(file, MYF(MY_WME));
10766 
10767   return error;
10768 }
10769 
10770 
#ifdef INNODB_COMPATIBILITY_HOOKS
/*
  Report the binlog file name and offset recorded for the transaction this
  connection is currently committing.

  Only valid when called from a storage engine's commit_ordered() or
  commit() method. The position is stored inside the THD, so no locking is
  needed.

  If binary logging is off, or the THD has no binlog cache manager,
  *out_file is set to NULL and *out_pos to 0.
*/
void
mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
{
  binlog_cache_mngr *mngr= NULL;

  if (opt_bin_log)
    mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  if (!mngr)
  {
    *out_file= NULL;
    *out_pos= 0;
    return;
  }

  *out_file= mngr->last_commit_pos_file;
  *out_pos= (ulonglong)(mngr->last_commit_pos_offset);
}
#endif /* INNODB_COMPATIBILITY_HOOKS */
10799 
10800 
10801 static void
binlog_checksum_update(MYSQL_THD thd,struct st_mysql_sys_var * var,void * var_ptr,const void * save)10802 binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var,
10803                        void *var_ptr, const void *save)
10804 {
10805   ulong value=  *((ulong *)save);
10806   bool check_purge= false;
10807   ulong UNINIT_VAR(prev_binlog_id);
10808 
10809   mysql_mutex_lock(mysql_bin_log.get_log_lock());
10810   if(mysql_bin_log.is_open())
10811   {
10812     prev_binlog_id= mysql_bin_log.current_binlog_id;
10813     if (binlog_checksum_options != value)
10814       mysql_bin_log.checksum_alg_reset= (enum_binlog_checksum_alg)value;
10815     if (mysql_bin_log.rotate(true, &check_purge))
10816       check_purge= false;
10817   }
10818   else
10819   {
10820     binlog_checksum_options= value;
10821   }
10822   DBUG_ASSERT(binlog_checksum_options == value);
10823   mysql_bin_log.checksum_alg_reset= BINLOG_CHECKSUM_ALG_UNDEF;
10824   mysql_mutex_unlock(mysql_bin_log.get_log_lock());
10825   if (check_purge)
10826     mysql_bin_log.checkpoint_and_purge(prev_binlog_id);
10827 }
10828 
10829 
show_binlog_vars(THD * thd,SHOW_VAR * var,void *,system_status_var * status_var,enum_var_type)10830 static int show_binlog_vars(THD *thd, SHOW_VAR *var, void *,
10831                             system_status_var *status_var, enum_var_type)
10832 {
10833   mysql_bin_log.set_status_variables(thd);
10834   var->type= SHOW_ARRAY;
10835   var->value= (char *)&binlog_status_vars_detail;
10836   return 0;
10837 }
10838 
/*
  Status variable table handed to the binlog plugin declaration: a single
  SHOW_FUNC entry, "Binlog", resolved through show_binlog_vars(), followed
  by the NullS terminator row.
*/
static SHOW_VAR binlog_status_vars_top[]= {
  {"Binlog", (char *) &show_binlog_vars, SHOW_FUNC},
  {NullS, NullS, SHOW_LONG}
};
10843 
/*
  Read-only boolean plugin variable "optimize_thread_scheduling", backed by
  opt_optimize_thread_scheduling. No check or update callback is installed;
  the default is 1 (on).
*/
static MYSQL_SYSVAR_BOOL(
  optimize_thread_scheduling,
  opt_optimize_thread_scheduling,
  PLUGIN_VAR_READONLY,
  "Run fast part of group commit in a single thread, to optimize kernel "
  "thread scheduling. On by default. Disable to run each transaction in group "
  "commit in its own thread, which can be slower at very high concurrency. "
  "This option is mostly for testing one algorithm versus the other, and it "
  "should not normally be necessary to change it.",
  NULL,
  NULL,
  1);
10856 
/*
  Enum plugin variable "checksum", backed by binlog_checksum_options, with
  its values taken from binlog_checksum_typelib. Changes go through
  binlog_checksum_update(); the default is BINLOG_CHECKSUM_ALG_CRC32.
*/
static MYSQL_SYSVAR_ENUM(
  checksum,
  binlog_checksum_options,
  PLUGIN_VAR_RQCMDARG,
  "Type of BINLOG_CHECKSUM_ALG. Include checksum for "
  "log events in the binary log",
  NULL,
  binlog_checksum_update,
  BINLOG_CHECKSUM_ALG_CRC32,
  &binlog_checksum_typelib);
10867 
/*
  NULL-terminated list of the system variables of the binlog plugin,
  referenced from the plugin declaration.
*/
static struct st_mysql_sys_var *binlog_sys_vars[]=
{
  MYSQL_SYSVAR(optimize_thread_scheduling),
  MYSQL_SYSVAR(checksum),
  NULL
};
10874 
10875 
10876 /*
10877   Copy out the non-directory part of binlog position filename for the
10878   `binlog_snapshot_file' status variable, same way as it is done for
10879   SHOW BINLOG STATUS.
10880 */
10881 static void
set_binlog_snapshot_file(const char * src)10882 set_binlog_snapshot_file(const char *src)
10883 {
10884   size_t dir_len = dirname_length(src);
10885   strmake_buf(binlog_snapshot_file, src + dir_len);
10886 }
10887 
10888 /*
10889   Copy out current values of status variables, for SHOW STATUS or
10890   information_schema.global_status.
10891 
10892   This is called only under LOCK_all_status_vars, so we can fill in a static array.
10893 */
10894 void
set_status_variables(THD * thd)10895 TC_LOG_BINLOG::set_status_variables(THD *thd)
10896 {
10897   binlog_cache_mngr *cache_mngr;
10898 
10899   if (thd && opt_bin_log)
10900     cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
10901   else
10902     cache_mngr= 0;
10903 
10904   bool have_snapshot= (cache_mngr && cache_mngr->last_commit_pos_file[0] != 0);
10905   mysql_mutex_lock(&LOCK_commit_ordered);
10906   binlog_status_var_num_commits= this->num_commits;
10907   binlog_status_var_num_group_commits= this->num_group_commits;
10908   if (!have_snapshot)
10909   {
10910     set_binlog_snapshot_file(last_commit_pos_file);
10911     binlog_snapshot_position= last_commit_pos_offset;
10912   }
10913   mysql_mutex_unlock(&LOCK_commit_ordered);
10914   mysql_mutex_lock(&LOCK_prepare_ordered);
10915   binlog_status_group_commit_trigger_count= this->group_commit_trigger_count;
10916   binlog_status_group_commit_trigger_timeout= this->group_commit_trigger_timeout;
10917   binlog_status_group_commit_trigger_lock_wait= this->group_commit_trigger_lock_wait;
10918   mysql_mutex_unlock(&LOCK_prepare_ordered);
10919 
10920   if (have_snapshot)
10921   {
10922     set_binlog_snapshot_file(cache_mngr->last_commit_pos_file);
10923     binlog_snapshot_position= cache_mngr->last_commit_pos_offset;
10924   }
10925 }
10926 
10927 
10928 /*
10929   Find the Gtid_list_log_event at the start of a binlog.
10930 
10931   NULL for ok, non-NULL error message for error.
10932 
10933   If ok, then the event is returned in *out_gtid_list. This can be NULL if we
10934   get back to binlogs written by old server version without GTID support. If
10935   so, it means we have reached the point to start from, as no GTID events can
10936   exist in earlier binlogs.
10937 */
10938 const char *
get_gtid_list_event(IO_CACHE * cache,Gtid_list_log_event ** out_gtid_list)10939 get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
10940 {
10941   Format_description_log_event init_fdle(BINLOG_VERSION);
10942   Format_description_log_event *fdle;
10943   Log_event *ev;
10944   const char *errormsg = NULL;
10945 
10946   *out_gtid_list= NULL;
10947 
10948   if (!(ev= Log_event::read_log_event(cache, &init_fdle,
10949                                       opt_master_verify_checksum)) ||
10950       ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
10951   {
10952     if (ev)
10953       delete ev;
10954     return "Could not read format description log event while looking for "
10955       "GTID position in binlog";
10956   }
10957 
10958   fdle= static_cast<Format_description_log_event *>(ev);
10959 
10960   for (;;)
10961   {
10962     Log_event_type typ;
10963 
10964     ev= Log_event::read_log_event(cache, fdle, opt_master_verify_checksum);
10965     if (!ev)
10966     {
10967       errormsg= "Could not read GTID list event while looking for GTID "
10968         "position in binlog";
10969       break;
10970     }
10971     typ= ev->get_type_code();
10972     if (typ == GTID_LIST_EVENT)
10973       break;                                    /* Done, found it */
10974     if (typ == START_ENCRYPTION_EVENT)
10975     {
10976       if (fdle->start_decryption((Start_encryption_log_event*) ev))
10977         errormsg= "Could not set up decryption for binlog.";
10978     }
10979     delete ev;
10980     if (typ == ROTATE_EVENT || typ == STOP_EVENT ||
10981         typ == FORMAT_DESCRIPTION_EVENT || typ == START_ENCRYPTION_EVENT)
10982       continue;                                 /* Continue looking */
10983 
10984     /* We did not find any Gtid_list_log_event, must be old binlog. */
10985     ev= NULL;
10986     break;
10987   }
10988 
10989   delete fdle;
10990   *out_gtid_list= static_cast<Gtid_list_log_event *>(ev);
10991   return errormsg;
10992 }
10993 
10994 
/*
  Storage engine descriptor referenced by the binlog plugin declaration;
  it only carries the handlerton interface version.
*/
struct st_mysql_storage_engine binlog_storage_engine=
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
10997 
/*
  Plugin declaration for the "binlog" pseudo storage engine. It is
  initialised by binlog_init(), has no deinit function, and exposes
  binlog_status_vars_top and binlog_sys_vars.
*/
maria_declare_plugin(binlog)
{
  MYSQL_STORAGE_ENGINE_PLUGIN,
  &binlog_storage_engine,
  "binlog",
  "MySQL AB",
  "This is a pseudo storage engine to represent the binlog in a transaction",
  PLUGIN_LICENSE_GPL,
  binlog_init, /* Plugin Init */
  NULL, /* Plugin Deinit */
  0x0100 /* 1.0 */,
  binlog_status_vars_top,     /* status variables                */
  binlog_sys_vars,            /* system variables                */
  "1.0",                      /* string version */
  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
}
maria_declare_plugin_end;
11015 
11016 #ifdef WITH_WSREP
11017 #include "wsrep_mysqld.h"
11018 
wsrep_get_trans_cache(THD * thd)11019 IO_CACHE *wsrep_get_trans_cache(THD * thd)
11020 {
11021   DBUG_ASSERT(binlog_hton->slot != HA_SLOT_UNDEF);
11022   binlog_cache_mngr *cache_mngr = (binlog_cache_mngr*)
11023     thd_get_ha_data(thd, binlog_hton);
11024   if (cache_mngr)
11025     return cache_mngr->get_binlog_cache_log(true);
11026 
11027   WSREP_DEBUG("binlog cache not initialized, conn: %llu",
11028 	      thd->thread_id);
11029   return NULL;
11030 }
11031 
wsrep_thd_binlog_trx_reset(THD * thd)11032 void wsrep_thd_binlog_trx_reset(THD * thd)
11033 {
11034   DBUG_ENTER("wsrep_thd_binlog_trx_reset");
11035   WSREP_DEBUG("wsrep_thd_binlog_reset");
11036   /*
11037     todo: fix autocommit select to not call the caller
11038   */
11039   binlog_cache_mngr *const cache_mngr=
11040     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
11041   if (cache_mngr)
11042   {
11043     cache_mngr->reset(false, true);
11044     if (!cache_mngr->stmt_cache.empty())
11045     {
11046       WSREP_DEBUG("pending events in stmt cache, sql: %s", thd->query());
11047       cache_mngr->stmt_cache.reset();
11048     }
11049   }
11050   thd->reset_binlog_for_next_statement();
11051   DBUG_VOID_RETURN;
11052 }
11053 
wsrep_thd_binlog_stmt_rollback(THD * thd)11054 void wsrep_thd_binlog_stmt_rollback(THD * thd)
11055 {
11056   DBUG_ENTER("wsrep_thd_binlog_stmt_rollback");
11057   WSREP_DEBUG("wsrep_thd_binlog_stmt_rollback");
11058   binlog_cache_mngr *const cache_mngr=
11059     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
11060   if (cache_mngr)
11061   {
11062     thd->binlog_remove_pending_rows_event(TRUE, TRUE);
11063     cache_mngr->stmt_cache.reset();
11064   }
11065   DBUG_VOID_RETURN;
11066 }
11067 
wsrep_register_binlog_handler(THD * thd,bool trx)11068 void wsrep_register_binlog_handler(THD *thd, bool trx)
11069 {
11070   DBUG_ENTER("register_binlog_handler");
11071   /*
11072     If this is the first call to this function while processing a statement,
11073     the transactional cache does not have a savepoint defined. So, in what
11074     follows:
11075       . an implicit savepoint is defined;
11076       . callbacks are registered;
11077       . binary log is set as read/write.
11078 
11079     The savepoint allows for truncating the trx-cache transactional changes
11080     fail. Callbacks are necessary to flush caches upon committing or rolling
11081     back a statement or a transaction. However, notifications do not happen
11082     if the binary log is set as read/write.
11083   */
11084   binlog_cache_mngr *cache_mngr=
11085     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
11086   /* cache_mngr may be missing e.g. in mtr test ev51914.test */
11087   if (cache_mngr)
11088   {
11089     /*
11090       Set an implicit savepoint in order to be able to truncate a trx-cache.
11091     */
11092     if (cache_mngr->trx_cache.get_prev_position() == MY_OFF_T_UNDEF)
11093     {
11094       my_off_t pos= 0;
11095       binlog_trans_log_savepos(thd, &pos);
11096       cache_mngr->trx_cache.set_prev_position(pos);
11097     }
11098 
11099     /*
11100       Set callbacks in order to be able to call commmit or rollback.
11101     */
11102     if (trx)
11103       trans_register_ha(thd, TRUE, binlog_hton, 0);
11104     trans_register_ha(thd, FALSE, binlog_hton, 0);
11105 
11106     /*
11107       Set the binary log as read/write otherwise callbacks are not called.
11108     */
11109     thd->ha_data[binlog_hton->slot].ha_info[0].set_trx_read_write();
11110   }
11111   DBUG_VOID_RETURN;
11112 }
11113 
11114 #endif /* WITH_WSREP */
11115