1 /* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 #include "sql/binlog.h"
24 
25 #include "my_config.h"
26 
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <limits.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 
33 #include "lex_string.h"
34 #include "map_helpers.h"
35 #include "my_alloc.h"
36 #include "my_loglevel.h"
37 #include "my_macros.h"
38 #include "my_systime.h"
39 #include "my_thread.h"
40 #include "sql/check_stack.h"
41 #include "sql/clone_handler.h"
42 #include "sql_string.h"
43 #include "template_utils.h"
44 #ifdef HAVE_UNISTD_H
45 #include <unistd.h>
46 #endif
47 #include <algorithm>
48 #include <list>
49 #include <map>
50 #include <new>
51 #include <queue>
52 #include <sstream>
53 #include <string>
54 
55 #include "dur_prop.h"
56 #include "libbinlogevents/include/compression/base.h"
57 #include "libbinlogevents/include/compression/iterator.h"
58 #include "libbinlogevents/include/control_events.h"
59 #include "libbinlogevents/include/debug_vars.h"
60 #include "libbinlogevents/include/rows_event.h"
61 #include "libbinlogevents/include/statement_events.h"
62 #include "libbinlogevents/include/table_id.h"
63 #include "mf_wcomp.h"    // wild_one, wild_many
64 #include "mutex_lock.h"  // Mutex_lock
65 #include "my_base.h"
66 #include "my_bitmap.h"
67 #include "my_byteorder.h"
68 #include "my_compiler.h"
69 #include "my_dbug.h"
70 #include "my_dir.h"
71 #include "my_sqlcommand.h"
72 #include "my_stacktrace.h"  // my_safe_print_system_time
73 #include "my_thread_local.h"
74 #include "mysql/components/services/log_builtins.h"
75 #include "mysql/plugin.h"
76 #include "mysql/psi/mysql_file.h"
77 #include "mysql/service_mysql_alloc.h"
78 #include "mysql/thread_type.h"
79 #include "mysqld_error.h"
80 #include "partition_info.h"
81 #include "prealloced_array.h"
82 #include "sql/binlog/global.h"
83 #include "sql/binlog/tools/iterators.h"
84 #include "sql/binlog_ostream.h"
85 #include "sql/binlog_reader.h"
86 #include "sql/create_field.h"
87 #include "sql/current_thd.h"
88 #include "sql/debug_sync.h"  // DEBUG_SYNC
89 #include "sql/derror.h"      // ER_THD
90 #include "sql/discrete_interval.h"
91 #include "sql/field.h"
92 #include "sql/handler.h"
93 #include "sql/item_func.h"  // user_var_entry
94 #include "sql/key.h"
95 #include "sql/log.h"
96 #include "sql/log_event.h"           // Rows_log_event
97 #include "sql/mysqld.h"              // sync_binlog_period ...
98 #include "sql/mysqld_thd_manager.h"  // Global_THD_manager
99 #include "sql/protocol.h"
100 #include "sql/psi_memory_key.h"
101 #include "sql/query_options.h"
102 #include "sql/rpl_filter.h"
103 #include "sql/rpl_gtid.h"
104 #include "sql/rpl_handler.h"  // RUN_HOOK
105 #include "sql/rpl_mi.h"       // Master_info
106 #include "sql/rpl_record.h"
107 #include "sql/rpl_rli.h"      // Relay_log_info
108 #include "sql/rpl_rli_pdb.h"  // Slave_worker
109 #include "sql/rpl_slave.h"
110 #include "sql/rpl_slave_commit_order_manager.h"  // Commit_order_manager
111 #include "sql/rpl_transaction_ctx.h"
112 #include "sql/rpl_trx_boundary_parser.h"  // Transaction_boundary_parser
113 #include "sql/rpl_utility.h"
114 #include "sql/sql_backup_lock.h"  // is_instance_backup_locked
115 #include "sql/sql_base.h"         // find_temporary_table
116 #include "sql/sql_bitmap.h"
117 #include "sql/sql_class.h"  // THD
118 #include "sql/sql_const.h"
119 #include "sql/sql_data_change.h"
120 #include "sql/sql_error.h"
121 #include "sql/sql_lex.h"
122 #include "sql/sql_list.h"
123 #include "sql/sql_parse.h"  // sqlcom_can_generate_row_events
124 #include "sql/sql_show.h"   // append_identifier
125 #include "sql/system_variables.h"
126 #include "sql/table.h"
127 #include "sql/transaction_info.h"
128 #include "sql/xa.h"
129 #include "sql_partition.h"
130 #include "thr_lock.h"
131 
132 class Item;
133 
134 using binary_log::checksum_crc32;
135 using std::list;
136 using std::max;
137 using std::min;
138 using std::string;
139 
140 #define FLAGSTR(V, F) ((V) & (F) ? #F " " : "")
141 #define YESNO(X) ((X) ? "yes" : "no")
142 
143 /**
144   @defgroup Binary_Log Binary Log
145   @{
146  */
147 
148 #define MY_OFF_T_UNDEF (~(my_off_t)0UL)
149 
150 /*
151   Constants required for the limit unsafe warnings suppression
152  */
153 // seconds after which the limit unsafe warnings suppression will be activated
154 #define LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT 50
155 // number of limit unsafe warnings after which the suppression will be activated
156 #define LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT 50
157 
158 static ulonglong limit_unsafe_suppression_start_time = 0;
159 static bool unsafe_warning_suppression_is_activated = false;
160 static int limit_unsafe_warning_count = 0;
161 
162 static handlerton *binlog_hton;
163 bool opt_binlog_order_commits = true;
164 
165 const char *log_bin_index = nullptr;
166 const char *log_bin_basename = nullptr;
167 
168 /* Size for IO_CACHE buffer for binlog & relay log */
169 ulong rpl_read_size;
170 
171 MYSQL_BIN_LOG mysql_bin_log(&sync_binlog_period);
172 
173 static int binlog_init(void *p);
174 static int binlog_start_trans_and_stmt(THD *thd, Log_event *start_event);
175 static int binlog_close_connection(handlerton *hton, THD *thd);
176 static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv);
177 static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv);
178 static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
179                                                       THD *thd);
180 static int binlog_commit(handlerton *hton, THD *thd, bool all);
181 static int binlog_rollback(handlerton *hton, THD *thd, bool all);
182 static int binlog_prepare(handlerton *hton, THD *thd, bool all);
183 static xa_status_code binlog_xa_commit(handlerton *hton, XID *xid);
184 static xa_status_code binlog_xa_rollback(handlerton *hton, XID *xid);
185 static void exec_binlog_error_action_abort(const char *err_string);
186 static bool binlog_recover(Binlog_file_reader *binlog_file_reader,
187                            my_off_t *valid_pos);
188 static void binlog_prepare_row_images(const THD *thd, TABLE *table);
189 static bool is_loggable_xa_prepare(THD *thd);
190 
normalize_binlog_name(char * to,const char * from,bool is_relay_log)191 bool normalize_binlog_name(char *to, const char *from, bool is_relay_log) {
192   DBUG_TRACE;
193   bool error = false;
194   char buff[FN_REFLEN];
195   char *ptr = const_cast<char *>(from);
196   char *opt_name = is_relay_log ? opt_relay_logname : opt_bin_logname;
197 
198   DBUG_ASSERT(from);
199 
200   /* opt_name is not null and not empty and from is a relative path */
201   if (opt_name && opt_name[0] && from && !test_if_hard_path(from)) {
202     // take the path from opt_name
203     // take the filename from from
204     char log_dirpart[FN_REFLEN], log_dirname[FN_REFLEN];
205     size_t log_dirpart_len, log_dirname_len;
206     dirname_part(log_dirpart, opt_name, &log_dirpart_len);
207     dirname_part(log_dirname, from, &log_dirname_len);
208 
209     /* log may be empty => relay-log or log-bin did not
210         hold paths, just filename pattern */
211     if (log_dirpart_len > 0) {
212       /* create the new path name */
213       if (fn_format(buff, from + log_dirname_len, log_dirpart, "",
214                     MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH)) == nullptr) {
215         error = true;
216         goto end;
217       }
218 
219       ptr = buff;
220     }
221   }
222 
223   DBUG_ASSERT(ptr);
224   if (ptr) {
225     size_t length = strlen(ptr);
226 
227     // Strips the CR+LF at the end of log name and \0-terminates it.
228     if (length && ptr[length - 1] == '\n') {
229       ptr[length - 1] = 0;
230       length--;
231       if (length && ptr[length - 1] == '\r') {
232         ptr[length - 1] = 0;
233         length--;
234       }
235     }
236     if (!length) {
237       error = true;
238       goto end;
239     }
240     strmake(to, ptr, length);
241   }
242 end:
243   return error;
244 }
245 
246 /**
247    Logical binlog file which wraps and hides the detail of lower layer storage
248    implementation. Binlog code just use this class to control real storage
249  */
250 class MYSQL_BIN_LOG::Binlog_ofile : public Basic_ostream {
251  public:
~Binlog_ofile()252   ~Binlog_ofile() override {
253     DBUG_TRACE;
254     close();
255     return;
256   }
257 
258   /**
259      Opens the binlog file. It opens the lower layer storage.
260 
261      @param[in] log_file_key  The PSI_file_key for this stream
262      @param[in] binlog_name  The file to be opened
263      @param[in] flags  The flags used by IO_CACHE.
264      @param[in] existing True if opening the file, false if creating a new one.
265 
266      @retval false  Success
267      @retval true  Error
268   */
open(PSI_file_key log_file_key,const char * binlog_name,myf flags,bool existing=false)269   bool open(
270 #ifdef HAVE_PSI_INTERFACE
271       PSI_file_key log_file_key,
272 #endif
273       const char *binlog_name, myf flags, bool existing = false) {
274     DBUG_TRACE;
275     DBUG_ASSERT(m_pipeline_head == nullptr);
276 
277 #ifndef DBUG_OFF
278     {
279 #ifndef HAVE_PSI_INTERFACE
280       PSI_file_key log_file_key = PSI_NOT_INSTRUMENTED;
281 #endif
282       MY_STAT info;
283       if (!mysql_file_stat(log_file_key, binlog_name, &info, MYF(0))) {
284         DBUG_ASSERT(existing == !(my_errno() == ENOENT));
285         set_my_errno(0);
286       }
287     }
288 #endif
289 
290     std::unique_ptr<IO_CACHE_ostream> file_ostream(new IO_CACHE_ostream);
291     if (file_ostream->open(log_file_key, binlog_name, flags)) return true;
292 
293     m_pipeline_head = std::move(file_ostream);
294 
295     /* Setup encryption for new files if needed */
296     if (!existing && rpl_encryption.is_enabled()) {
297       std::unique_ptr<Binlog_encryption_ostream> encrypted_ostream(
298           new Binlog_encryption_ostream());
299       if (encrypted_ostream->open(std::move(m_pipeline_head))) return true;
300       m_encrypted_header_size = encrypted_ostream->get_header_size();
301       m_pipeline_head = std::move(encrypted_ostream);
302     }
303 
304     return false;
305   }
306 
307   /**
308     Opens an existing binlog file. It opens the lower layer storage reusing the
309     existing file password if needed.
310 
311     @param[in] log_file_key The PSI_file_key for this stream
312     @param[in] binlog_name The file to be opened
313     @param[in] flags The flags used by IO_CACHE.
314 
315     @retval std::unique_ptr A Binlog_ofile object pointer.
316     @retval nullptr Error.
317   */
open_existing(PSI_file_key log_file_key,const char * binlog_name,myf flags)318   static std::unique_ptr<Binlog_ofile> open_existing(
319 #ifdef HAVE_PSI_INTERFACE
320       PSI_file_key log_file_key,
321 #endif
322       const char *binlog_name, myf flags) {
323     DBUG_TRACE;
324     std::unique_ptr<Rpl_encryption_header> header;
325     unsigned char magic[BINLOG_MAGIC_SIZE];
326 
327     /* Open a simple istream to read the magic from the file */
328     IO_CACHE_istream istream;
329     if (istream.open(key_file_binlog, key_file_binlog_cache, binlog_name,
330                      MYF(MY_WME | MY_DONT_CHECK_FILESIZE), rpl_read_size))
331       return nullptr;
332     if (istream.read(magic, BINLOG_MAGIC_SIZE) != BINLOG_MAGIC_SIZE)
333       return nullptr;
334 
335     DBUG_ASSERT(Rpl_encryption_header::ENCRYPTION_MAGIC_SIZE ==
336                 BINLOG_MAGIC_SIZE);
337     /* Identify the file type by the magic to get the encryption header */
338     if (memcmp(magic, Rpl_encryption_header::ENCRYPTION_MAGIC,
339                BINLOG_MAGIC_SIZE) == 0) {
340       header = Rpl_encryption_header::get_header(&istream);
341       if (header == nullptr) return nullptr;
342     } else if (memcmp(magic, BINLOG_MAGIC, BINLOG_MAGIC_SIZE) != 0) {
343       return nullptr;
344     }
345 
346     /* Open the binlog_ofile */
347     std::unique_ptr<Binlog_ofile> ret_ofile(new Binlog_ofile);
348     if (ret_ofile->open(
349 #ifdef HAVE_PSI_INTERFACE
350             log_file_key,
351 #endif
352             binlog_name, flags, true)) {
353       return nullptr;
354     }
355 
356     if (header != nullptr) {
357       /* Add the encryption stream on top of IO_CACHE */
358       std::unique_ptr<Binlog_encryption_ostream> encrypted_ostream(
359           new Binlog_encryption_ostream);
360       ret_ofile->m_encrypted_header_size = header->get_header_size();
361       encrypted_ostream->open(std::move(ret_ofile->m_pipeline_head),
362                               std::move(header));
363       ret_ofile->m_pipeline_head = std::move(encrypted_ostream);
364       ret_ofile->set_encrypted();
365     }
366     return ret_ofile;
367   }
368 
close()369   void close() {
370     m_pipeline_head.reset(nullptr);
371     m_position = 0;
372     m_encrypted_header_size = 0;
373   }
374 
375   /**
376      Writes data into storage and maintains binlog position.
377 
378      @param[in] buffer  the data will be written
379      @param[in] length  the length of the data
380 
381      @retval false  Success
382      @retval true  Error
383   */
write(const unsigned char * buffer,my_off_t length)384   bool write(const unsigned char *buffer, my_off_t length) override {
385     DBUG_ASSERT(m_pipeline_head != nullptr);
386 
387     if (m_pipeline_head->write(buffer, length)) return true;
388 
389     m_position += length;
390     return false;
391   }
392 
393   /**
394      Updates some bytes in the binlog file. If is only used for clearing
395      LOG_EVENT_BINLOG_IN_USE_F.
396 
397      @param[in] buffer  the data will be written
398      @param[in] length  the length of the data
399      @param[in] offset  the offset of the bytes will be updated
400 
401      @retval false  Success
402      @retval true  Error
403   */
update(const unsigned char * buffer,my_off_t length,my_off_t offset)404   bool update(const unsigned char *buffer, my_off_t length, my_off_t offset) {
405     DBUG_ASSERT(m_pipeline_head != nullptr);
406     return m_pipeline_head->seek(offset) ||
407            m_pipeline_head->write(buffer, length);
408   }
409 
410   /**
411      Truncates some data at the end of the binlog file.
412 
413      @param[in] offset  where the binlog file will be truncated to.
414 
415      @retval false  Success
416      @retval true  Error
417   */
truncate(my_off_t offset)418   bool truncate(my_off_t offset) {
419     DBUG_ASSERT(m_pipeline_head != nullptr);
420 
421     if (m_pipeline_head->truncate(offset)) return true;
422     m_position = offset;
423     return false;
424   }
425 
flush()426   bool flush() { return m_pipeline_head->flush(); }
sync()427   bool sync() { return m_pipeline_head->sync(); }
flush_and_sync()428   bool flush_and_sync() { return flush() || sync(); }
position()429   my_off_t position() { return m_position; }
is_empty()430   bool is_empty() { return position() == 0; }
is_open()431   bool is_open() { return m_pipeline_head != nullptr; }
432   /**
433     Returns the encrypted header size of the binary log file.
434 
435     @retval 0 The file is not encrypted.
436     @retval >0 The encryption header size.
437   */
get_encrypted_header_size()438   int get_encrypted_header_size() { return m_encrypted_header_size; }
439   /**
440     Returns the real file size.
441 
442     While position() returns the "file size" from the plain binary log events
443     stream point of view, this function considers the encryption header when it
444     exists.
445 
446     @return The real file size considering the encryption header.
447   */
get_real_file_size()448   my_off_t get_real_file_size() { return m_position + m_encrypted_header_size; }
449   /**
450     Get the pipeline head.
451 
452     @retval  Returns the pipeline head or nullptr.
453   */
get_pipeline_head()454   std::unique_ptr<Truncatable_ostream> get_pipeline_head() {
455     return std::move(m_pipeline_head);
456   }
457   /**
458     Check if the log file is encrypted.
459 
460     @retval  True if the log file is encrypted.
461     @retval  False if the log file is not encrypted.
462   */
is_encrypted()463   bool is_encrypted() { return m_encrypted; }
464   /**
465     Set that the log file is encrypted.
466   */
set_encrypted()467   void set_encrypted() { m_encrypted = true; }
468 
469  private:
470   my_off_t m_position = 0;
471   int m_encrypted_header_size = 0;
472   std::unique_ptr<Truncatable_ostream> m_pipeline_head;
473   bool m_encrypted = false;
474 };
475 
476 /**
477   Helper class to switch to a new thread and then go back to the previous one,
478   when the object is destroyed using RAII.
479 
480   This class is used to temporarily switch to another session (THD
481   structure). It will set up thread specific "globals" correctly
482   so that the POSIX thread looks exactly like the session attached to.
483   However, PSI_thread info is not touched as it is required to show
  the actual physical view in PFS instrumentation i.e., it should
485   depict as the real thread doing the work instead of thread it switched
486   to.
487 
488   On destruction, the original session (which is supplied to the
489   constructor) will be re-attached automatically. For example, with
490   this code, the value of @c current_thd will be the same before and
491   after execution of the code.
492 
493   @code
494   {
495     for (int i = 0 ; i < count ; ++i)
496     {
497       // here we are attached to current_thd
498       // [...]
499       Thd_backup_and_restore switch_thd(current_thd, other_thd[i]);
500       // [...]
501       // here we are attached to other_thd[i]
502       // [...]
503     }
504     // here we are attached to current_thd
505   }
506   @endcode
507 
508   @warning The class is not designed to be inherited from.
509  */
510 
511 class Thd_backup_and_restore {
512  public:
513   /**
514     Try to attach the POSIX thread to a session.
515 
516     @param[in] backup_thd    The thd to restore to when object is destructed.
517     @param[in] new_thd       The thd to attach to.
518    */
519 
Thd_backup_and_restore(THD * backup_thd,THD * new_thd)520   Thd_backup_and_restore(THD *backup_thd, THD *new_thd)
521       : m_backup_thd(backup_thd),
522         m_new_thd(new_thd),
523         m_new_thd_old_real_id(new_thd->real_id),
524         m_new_thd_old_thread_stack(new_thd->thread_stack) {
525     DBUG_ASSERT(m_backup_thd != nullptr && m_new_thd != nullptr);
526     // Reset the state of the current thd.
527     m_backup_thd->restore_globals();
528 
529     m_new_thd->thread_stack = m_backup_thd->thread_stack;
530     m_new_thd->store_globals();
531   }
532 
533   /**
534       Restores to previous thd.
535    */
~Thd_backup_and_restore()536   ~Thd_backup_and_restore() {
537     /*
538       Restore the global variables of the thd we previously attached to,
539       to its original state. In other words, detach the m_new_thd.
540     */
541     m_new_thd->restore_globals();
542     m_new_thd->real_id = m_new_thd_old_real_id;
543     m_new_thd->thread_stack = m_new_thd_old_thread_stack;
544 
545     // Reset the global variables to the original state.
546     m_backup_thd->store_globals();
547   }
548 
549  private:
550   THD *m_backup_thd;
551   THD *m_new_thd;
552   my_thread_t m_new_thd_old_real_id;
553   const char *m_new_thd_old_thread_stack;
554 };
555 
556 /**
557   Caches for non-transactional and transactional data before writing
558   it to the binary log.
559 
560   @todo All the access functions for the flags suggest that the
561   encapsuling is not done correctly, so try to move any logic that
562   requires access to the flags into the cache.
563 */
564 class binlog_cache_data {
565  public:
binlog_cache_data(bool trx_cache_arg,ulong * ptr_binlog_cache_use_arg,ulong * ptr_binlog_cache_disk_use_arg)566   binlog_cache_data(bool trx_cache_arg, ulong *ptr_binlog_cache_use_arg,
567                     ulong *ptr_binlog_cache_disk_use_arg)
568       : m_pending(nullptr),
569         ptr_binlog_cache_use(ptr_binlog_cache_use_arg),
570         ptr_binlog_cache_disk_use(ptr_binlog_cache_disk_use_arg) {
571     flags.transactional = trx_cache_arg;
572   }
573 
open(my_off_t cache_size,my_off_t max_cache_size)574   bool open(my_off_t cache_size, my_off_t max_cache_size) {
575     return m_cache.open(cache_size, max_cache_size);
576   }
577 
get_cache()578   Binlog_cache_storage *get_cache() { return &m_cache; }
579   int finalize(THD *thd, Log_event *end_event);
580   int finalize(THD *thd, Log_event *end_event, XID_STATE *xs);
581   int flush(THD *thd, my_off_t *bytes, bool *wrote_xid);
582   int write_event(Log_event *event);
get_event_counter()583   size_t get_event_counter() { return event_counter; }
get_compressed_size()584   size_t get_compressed_size() { return m_compressed_size; }
get_decompressed_size()585   size_t get_decompressed_size() { return m_decompressed_size; }
get_compression_type()586   binary_log::transaction::compression::type get_compression_type() {
587     return m_compression_type;
588   }
589 
set_compressed_size(size_t s)590   void set_compressed_size(size_t s) { m_compressed_size = s; }
set_decompressed_size(size_t s)591   void set_decompressed_size(size_t s) { m_decompressed_size = s; }
set_compression_type(binary_log::transaction::compression::type t)592   void set_compression_type(binary_log::transaction::compression::type t) {
593     m_compression_type = t;
594   }
595 
~binlog_cache_data()596   virtual ~binlog_cache_data() {
597     DBUG_ASSERT(is_binlog_empty());
598     m_cache.close();
599   }
600 
is_binlog_empty() const601   bool is_binlog_empty() const {
602     DBUG_PRINT("debug", ("%s_cache - pending: 0x%llx, bytes: %llu",
603                          (flags.transactional ? "trx" : "stmt"),
604                          (ulonglong)pending(), (ulonglong)m_cache.length()));
605     return pending() == nullptr && m_cache.is_empty();
606   }
607 
is_finalized() const608   bool is_finalized() const { return flags.finalized; }
609 
pending() const610   Rows_log_event *pending() const { return m_pending; }
611 
set_pending(Rows_log_event * const pending)612   void set_pending(Rows_log_event *const pending) { m_pending = pending; }
613 
set_incident(void)614   void set_incident(void) { flags.incident = true; }
615 
has_incident(void) const616   bool has_incident(void) const { return flags.incident; }
617 
has_xid() const618   bool has_xid() const {
619     // There should only be an XID event if we are transactional
620     DBUG_ASSERT((flags.transactional && flags.with_xid) || !flags.with_xid);
621     return flags.with_xid;
622   }
623 
is_trx_cache() const624   bool is_trx_cache() const { return flags.transactional; }
625 
get_byte_position() const626   my_off_t get_byte_position() const { return m_cache.length(); }
627 
cache_state_checkpoint(my_off_t pos_to_checkpoint)628   void cache_state_checkpoint(my_off_t pos_to_checkpoint) {
629     // We only need to store the cache state for pos > 0
630     if (pos_to_checkpoint) {
631       cache_state state;
632       state.with_rbr = flags.with_rbr;
633       state.with_sbr = flags.with_sbr;
634       state.with_start = flags.with_start;
635       state.with_end = flags.with_end;
636       state.with_content = flags.with_content;
637       state.event_counter = event_counter;
638       cache_state_map[pos_to_checkpoint] = state;
639     }
640   }
641 
cache_state_rollback(my_off_t pos_to_rollback)642   void cache_state_rollback(my_off_t pos_to_rollback) {
643     if (pos_to_rollback) {
644       std::map<my_off_t, cache_state>::iterator it;
645       it = cache_state_map.find(pos_to_rollback);
646       if (it != cache_state_map.end()) {
647         flags.with_rbr = it->second.with_rbr;
648         flags.with_sbr = it->second.with_sbr;
649         flags.with_start = it->second.with_start;
650         flags.with_end = it->second.with_end;
651         flags.with_content = it->second.with_content;
652         event_counter = it->second.event_counter;
653       } else
654         DBUG_ASSERT(it == cache_state_map.end());
655     }
656     // Rolling back to pos == 0 means cleaning up the cache.
657     else {
658       flags.with_rbr = false;
659       flags.with_sbr = false;
660       flags.with_start = false;
661       flags.with_end = false;
662       flags.with_content = false;
663       event_counter = 0;
664     }
665   }
666 
667   /**
668      Reset the cache to unused state when the transaction is finished. It
669      drops all data in the cache and clears the flags of the transaction state.
670   */
reset()671   virtual void reset() {
672     compute_statistics();
673     remove_pending_event();
674 
675     if (m_cache.reset()) {
676       LogErr(WARNING_LEVEL, ER_BINLOG_CANT_RESIZE_CACHE);
677     }
678 
679     flags.incident = false;
680     flags.with_xid = false;
681     flags.immediate = false;
682     flags.finalized = false;
683     flags.with_sbr = false;
684     flags.with_rbr = false;
685     flags.with_start = false;
686     flags.with_end = false;
687     flags.with_content = false;
688 
689     /*
690       The truncate function calls reinit_io_cache that calls my_b_flush_io_cache
691       which may increase disk_writes. This breaks the disk_writes use by the
692       binary log which aims to compute the ratio between in-memory cache usage
693       and disk cache usage. To avoid this undesirable behavior, we reset the
694       variable after truncating the cache.
695     */
696     cache_state_map.clear();
697     event_counter = 0;
698     m_compressed_size = 0;
699     m_decompressed_size = 0;
700     m_compression_type = binary_log::transaction::compression::NONE;
701     DBUG_ASSERT(is_binlog_empty());
702   }
703 
704   /**
705     Returns information about the cache content with respect to
706     the binlog_format of the events.
707 
708     This will be used to set a flag on GTID_LOG_EVENT stating that the
709     transaction may have SBR statements or not, but the binlog dump
710     will show this flag as "rbr_only" when it is not set. That's why
711     an empty transaction should return true below, or else an empty
712     transaction would be assumed as "rbr_only" even not having RBR
713     events.
714 
715     When dumping a binary log content using mysqlbinlog client program,
716     for any transaction assumed as "rbr_only" it will be printed a
717     statement changing the transaction isolation level to READ COMMITTED.
718     It doesn't make sense to have an empty transaction "requiring" this
719     isolation level change.
720 
721     @return true  The cache have SBR events or is empty.
722     @return false The cache contains a transaction with no SBR events.
723    */
may_have_sbr_stmts()724   bool may_have_sbr_stmts() { return flags.with_sbr || !flags.with_rbr; }
725 
726   /**
727     Check if the binlog cache contains an empty transaction, which has
728     two binlog events "BEGIN" and "COMMIT".
729 
730     @return true  The binlog cache contains an empty transaction.
731     @return false Otherwise.
732   */
has_empty_transaction()733   bool has_empty_transaction() {
734     /*
735       The empty transaction has two events in trx/stmt binlog cache
736       and no changes: one is a transaction start and other is a transaction
737       end (there should be no SBR changing content and no RBR events).
738     */
739     if (flags.with_start &&   // Has transaction start statement
740         flags.with_end &&     // Has transaction end statement
741         !flags.with_content)  // Has no other content than START/END
742     {
743       DBUG_ASSERT(event_counter == 2);  // Two events in the cache only
744       DBUG_ASSERT(!flags.with_sbr);     // No statements changing content
745       DBUG_ASSERT(!flags.with_rbr);     // No rows changing content
746       DBUG_ASSERT(!flags.immediate);    // Not a DDL
747       DBUG_ASSERT(
748           !flags.with_xid);  // Not a XID trx and not an atomic DDL Query
749       return true;
750     }
751     return false;
752   }
753 
754   /**
755     Check if the binlog cache is empty or contains an empty transaction,
756     which has two binlog events "BEGIN" and "COMMIT".
757 
758     @return true  The binlog cache is empty or contains an empty transaction.
759     @return false Otherwise.
760   */
is_empty_or_has_empty_transaction()761   bool is_empty_or_has_empty_transaction() {
762     return is_binlog_empty() || has_empty_transaction();
763   }
764 
765  protected:
766   /*
767     This structure should have all cache variables/flags that should be restored
768     when a ROLLBACK TO SAVEPOINT statement be executed.
769   */
770   struct cache_state {
771     bool with_sbr;
772     bool with_rbr;
773     bool with_start;
774     bool with_end;
775     bool with_content;
776     size_t event_counter;
777   };
778   /*
779     For every SAVEPOINT used, we will store a cache_state for the current
780     binlog cache position. So, if a ROLLBACK TO SAVEPOINT is used, we can
781     restore the cache_state values after truncating the binlog cache.
782   */
783   std::map<my_off_t, cache_state> cache_state_map;
784   /*
785     In order to compute the transaction size (because of possible extra checksum
786     bytes), we need to keep track of how many events are in the binlog cache.
787   */
788   size_t event_counter = 0;
789 
790   size_t m_compressed_size = 0;
791   size_t m_decompressed_size = 0;
792   binary_log::transaction::compression::type m_compression_type =
793       binary_log::transaction::compression::type::NONE;
794   /*
795     It truncates the cache to a certain position. This includes deleting the
796     pending event. It corresponds to rollback statement or rollback to
797     a savepoint. It doesn't change transaction state.
798    */
truncate(my_off_t pos)799   void truncate(my_off_t pos) {
800     DBUG_PRINT("info", ("truncating to position %lu", (ulong)pos));
801     remove_pending_event();
802 
803     // TODO: check the return value.
804     (void)m_cache.truncate(pos);
805   }
806 
807   /**
808      Flush pending event to the cache buffer.
809    */
flush_pending_event(THD * thd)810   int flush_pending_event(THD *thd) {
811     if (m_pending) {
812       m_pending->set_flags(Rows_log_event::STMT_END_F);
813       if (int error = write_event(m_pending)) return error;
814       thd->clear_binlog_table_maps();
815     }
816     return 0;
817   }
818 
819   /**
820     Remove the pending event.
821    */
remove_pending_event()822   int remove_pending_event() {
823     delete m_pending;
824     m_pending = nullptr;
825     return 0;
826   }
  /*
    Status bits of this cache. Every member is a one-bit bool bit-field;
    the single instance is the 'flags' member declared at the end.
  */
  struct Flags {
    /*
      Defines if this is either a trx-cache or stmt-cache, respectively, a
      transactional or non-transactional cache.
    */
    bool transactional : 1;

    /*
      This indicates that some events did not get into the cache and most likely
      it is corrupted.
    */
    bool incident : 1;

    /*
      This indicates that the cache should be written without BEGIN/END.
    */
    bool immediate : 1;

    /*
      This flag indicates that the buffer was finalized and has to be
      flushed to disk.
     */
    bool finalized : 1;

    /*
      This indicates that either the cache contains an XID event, or it's
      an atomic DDL Query-log-event. In the latter case the flag is set up
      on the statement level, namely when the Query-log-event is cached
      at a time when the DDL transaction is not committing.
      The flag therefore gets reset when the cache is cleaned due to
      the statement rollback, e.g. in case of a DDL post-caching execution
      error.
      Any statement scope flag among other things must consider its
      reset policy when the statement is rolled back.
    */
    bool with_xid : 1;

    /*
      This indicates that the cache contains statements changing content.
    */
    bool with_sbr : 1;

    /*
      This indicates that the cache contains RBR events changing content.
    */
    bool with_rbr : 1;

    /*
      This indicates that the cache contains a transaction start statement.
    */
    bool with_start : 1;

    /*
      This indicates that the cache contains a transaction end event.
    */
    bool with_end : 1;

    /*
      This indicates that the cache contains content other than START/END.
    */
    bool with_content : 1;
  } flags;
889 
  /*
    NOTE(review): only declared here; the body is not part of this class
    definition. Judging by the name it compresses the cache contents, but
    confirm that and the meaning of the bool result against the definition.
  */
  virtual bool compress(THD *);

 private:
  /*
    Storage for byte data. This binlog_cache_data will serialize
    events into bytes and put them into m_cache.
  */
  Binlog_cache_storage m_cache;

  /*
    Pending binrows event. This event is the event where the rows are currently
    written. flush_pending_event() serializes it into m_cache and
    remove_pending_event() deletes it.
   */
  Rows_log_event *m_pending;
904 
905   /**
906     This function computes binlog cache and disk usage.
907   */
compute_statistics()908   void compute_statistics() {
909     if (!is_binlog_empty()) {
910       (*ptr_binlog_cache_use)++;
911       if (m_cache.disk_writes() != 0) (*ptr_binlog_cache_disk_use)++;
912     }
913   }
914 
  /*
    Stores a pointer to the status variable that keeps track of the in-memory
    cache usage. This corresponds to either
      . binlog_cache_use or binlog_stmt_cache_use.
    compute_statistics() increments the pointed-to counter when the cache is
    not empty.
  */
  ulong *ptr_binlog_cache_use;

  /*
    Stores a pointer to the status variable that keeps track of the disk
    cache usage. This corresponds to either
      . binlog_cache_disk_use or binlog_stmt_cache_disk_use.
    compute_statistics() increments the pointed-to counter when the cache is
    not empty and m_cache reports disk writes.
  */
  ulong *ptr_binlog_cache_disk_use;

  /*
    Copy assignment and copy construction are declared in the private
    section, so code outside the class cannot copy a cache.
  */
  binlog_cache_data &operator=(const binlog_cache_data &info);
  binlog_cache_data(const binlog_cache_data &info);
931 };
932 
933 class binlog_stmt_cache_data : public binlog_cache_data {
934  public:
binlog_stmt_cache_data(bool trx_cache_arg,ulong * ptr_binlog_cache_use_arg,ulong * ptr_binlog_cache_disk_use_arg)935   binlog_stmt_cache_data(bool trx_cache_arg, ulong *ptr_binlog_cache_use_arg,
936                          ulong *ptr_binlog_cache_disk_use_arg)
937       : binlog_cache_data(trx_cache_arg, ptr_binlog_cache_use_arg,
938                           ptr_binlog_cache_disk_use_arg) {}
939 
940   using binlog_cache_data::finalize;
941 
942   int finalize(THD *thd);
943 };
944 
finalize(THD * thd)945 int binlog_stmt_cache_data::finalize(THD *thd) {
946   if (flags.immediate) {
947     if (int error = finalize(thd, nullptr)) return error;
948   } else {
949     Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"), false, false, true,
950                             0, true);
951     if (int error = finalize(thd, &end_evt)) return error;
952   }
953   return 0;
954 }
955 
956 class binlog_trx_cache_data : public binlog_cache_data {
957  public:
binlog_trx_cache_data(bool trx_cache_arg,ulong * ptr_binlog_cache_use_arg,ulong * ptr_binlog_cache_disk_use_arg)958   binlog_trx_cache_data(bool trx_cache_arg, ulong *ptr_binlog_cache_use_arg,
959                         ulong *ptr_binlog_cache_disk_use_arg)
960       : binlog_cache_data(trx_cache_arg, ptr_binlog_cache_use_arg,
961                           ptr_binlog_cache_disk_use_arg),
962         m_cannot_rollback(false),
963         before_stmt_pos(MY_OFF_T_UNDEF) {}
964 
reset()965   void reset() {
966     DBUG_TRACE;
967     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
968     m_cannot_rollback = false;
969     before_stmt_pos = MY_OFF_T_UNDEF;
970     binlog_cache_data::reset();
971     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
972     return;
973   }
974 
cannot_rollback() const975   bool cannot_rollback() const { return m_cannot_rollback; }
976 
set_cannot_rollback()977   void set_cannot_rollback() { m_cannot_rollback = true; }
978 
get_prev_position() const979   my_off_t get_prev_position() const { return before_stmt_pos; }
980 
set_prev_position(my_off_t pos)981   void set_prev_position(my_off_t pos) {
982     DBUG_TRACE;
983     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
984     before_stmt_pos = pos;
985     cache_state_checkpoint(before_stmt_pos);
986     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
987     return;
988   }
989 
restore_prev_position()990   void restore_prev_position() {
991     DBUG_TRACE;
992     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
993     binlog_cache_data::truncate(before_stmt_pos);
994     cache_state_rollback(before_stmt_pos);
995     before_stmt_pos = MY_OFF_T_UNDEF;
996     /*
997       Binlog statement rollback clears with_xid now as the atomic DDL statement
998       marker which can be set as early as at event creation and caching.
999     */
1000     flags.with_xid = false;
1001     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
1002     return;
1003   }
1004 
restore_savepoint(my_off_t pos)1005   void restore_savepoint(my_off_t pos) {
1006     DBUG_TRACE;
1007     DBUG_PRINT("enter", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
1008     binlog_cache_data::truncate(pos);
1009     if (pos <= before_stmt_pos) before_stmt_pos = MY_OFF_T_UNDEF;
1010     cache_state_rollback(pos);
1011     DBUG_PRINT("return", ("before_stmt_pos: %llu", (ulonglong)before_stmt_pos));
1012     return;
1013   }
1014 
1015   using binlog_cache_data::truncate;
1016 
1017   int truncate(THD *thd, bool all);
1018 
1019  private:
1020   /*
1021     It will be set true if any statement which cannot be rolled back safely
1022     is put in trx_cache.
1023   */
1024   bool m_cannot_rollback;
1025 
1026   /*
1027     Binlog position before the start of the current statement.
1028   */
1029   my_off_t before_stmt_pos;
1030 
1031   binlog_trx_cache_data &operator=(const binlog_trx_cache_data &info);
1032   binlog_trx_cache_data(const binlog_trx_cache_data &info);
1033 };
1034 
1035 class binlog_cache_mngr {
1036  public:
binlog_cache_mngr(ulong * ptr_binlog_stmt_cache_use_arg,ulong * ptr_binlog_stmt_cache_disk_use_arg,ulong * ptr_binlog_cache_use_arg,ulong * ptr_binlog_cache_disk_use_arg)1037   binlog_cache_mngr(ulong *ptr_binlog_stmt_cache_use_arg,
1038                     ulong *ptr_binlog_stmt_cache_disk_use_arg,
1039                     ulong *ptr_binlog_cache_use_arg,
1040                     ulong *ptr_binlog_cache_disk_use_arg)
1041       : stmt_cache(false, ptr_binlog_stmt_cache_use_arg,
1042                    ptr_binlog_stmt_cache_disk_use_arg),
1043         trx_cache(true, ptr_binlog_cache_use_arg,
1044                   ptr_binlog_cache_disk_use_arg),
1045         has_logged_xid(false) {}
1046 
init()1047   bool init() {
1048     return stmt_cache.open(binlog_stmt_cache_size,
1049                            max_binlog_stmt_cache_size) ||
1050            trx_cache.open(binlog_cache_size, max_binlog_cache_size);
1051   }
1052 
get_binlog_cache_data(bool is_transactional)1053   binlog_cache_data *get_binlog_cache_data(bool is_transactional) {
1054     if (is_transactional)
1055       return &trx_cache;
1056     else
1057       return &stmt_cache;
1058   }
1059 
get_stmt_cache()1060   Binlog_cache_storage *get_stmt_cache() { return stmt_cache.get_cache(); }
get_trx_cache()1061   Binlog_cache_storage *get_trx_cache() { return trx_cache.get_cache(); }
1062   /**
1063     Convenience method to check if both caches are empty.
1064    */
is_binlog_empty() const1065   bool is_binlog_empty() const {
1066     return stmt_cache.is_binlog_empty() && trx_cache.is_binlog_empty();
1067   }
1068 
1069   /*
1070     clear stmt_cache and trx_cache if they are not empty
1071   */
reset()1072   void reset() {
1073     if (!stmt_cache.is_binlog_empty()) stmt_cache.reset();
1074     if (!trx_cache.is_binlog_empty()) trx_cache.reset();
1075   }
1076 
1077 #ifndef DBUG_OFF
dbug_any_finalized() const1078   bool dbug_any_finalized() const {
1079     return stmt_cache.is_finalized() || trx_cache.is_finalized();
1080   }
1081 #endif
1082 
1083   /*
1084     Convenience method to flush both caches to the binary log.
1085 
1086     @param bytes_written Pointer to variable that will be set to the
1087                          number of bytes written for the flush.
1088     @param wrote_xid     Pointer to variable that will be set to @c
1089                          true if any XID event was written to the
1090                          binary log. Otherwise, the variable will not
1091                          be touched.
1092     @return Error code on error, zero if no error.
1093    */
flush(THD * thd,my_off_t * bytes_written,bool * wrote_xid)1094   int flush(THD *thd, my_off_t *bytes_written, bool *wrote_xid) {
1095     my_off_t stmt_bytes = 0;
1096     my_off_t trx_bytes = 0;
1097     DBUG_ASSERT(stmt_cache.has_xid() == 0);
1098     int error = stmt_cache.flush(thd, &stmt_bytes, wrote_xid);
1099     if (error) return error;
1100     DEBUG_SYNC(thd, "after_flush_stm_cache_before_flush_trx_cache");
1101     error = trx_cache.flush(thd, &trx_bytes, wrote_xid);
1102     if (error) return error;
1103     *bytes_written = stmt_bytes + trx_bytes;
1104     return 0;
1105   }
1106 
1107   /**
1108     Check if at least one of transacaction and statement binlog caches
1109     contains an empty transaction, other one is empty or contains an
1110     empty transaction.
1111 
1112     @return true  At least one of transacaction and statement binlog
1113                   caches an empty transaction, other one is emptry
1114                   or contains an empty transaction.
1115     @return false Otherwise.
1116   */
has_empty_transaction()1117   bool has_empty_transaction() {
1118     return (trx_cache.is_empty_or_has_empty_transaction() &&
1119             stmt_cache.is_empty_or_has_empty_transaction() &&
1120             !is_binlog_empty());
1121   }
1122 
1123   binlog_stmt_cache_data stmt_cache;
1124   binlog_trx_cache_data trx_cache;
1125   /*
1126     The bool flag is for preventing do_binlog_xa_commit_rollback()
1127     execution twice which can happen for "external" xa commit/rollback.
1128   */
1129   bool has_logged_xid;
1130 
1131  private:
1132   binlog_cache_mngr &operator=(const binlog_cache_mngr &info);
1133   binlog_cache_mngr(const binlog_cache_mngr &info);
1134 };
1135 
thd_get_cache_mngr(const THD * thd)1136 static binlog_cache_mngr *thd_get_cache_mngr(const THD *thd) {
1137   /*
1138     If opt_bin_log is not set, binlog_hton->slot == -1 and hence
1139     thd_get_ha_data(thd, hton) segfaults.
1140   */
1141   DBUG_ASSERT(opt_bin_log);
1142   return (binlog_cache_mngr *)thd_get_ha_data(thd, binlog_hton);
1143 }
1144 
1145 /**
1146   Checks if the BINLOG_CACHE_SIZE's value is greater than MAX_BINLOG_CACHE_SIZE.
1147   If this happens, the BINLOG_CACHE_SIZE is set to MAX_BINLOG_CACHE_SIZE.
1148 */
check_binlog_cache_size(THD * thd)1149 void check_binlog_cache_size(THD *thd) {
1150   if (binlog_cache_size > max_binlog_cache_size) {
1151     if (thd) {
1152       push_warning_printf(
1153           thd, Sql_condition::SL_WARNING, ER_BINLOG_CACHE_SIZE_GREATER_THAN_MAX,
1154           ER_THD(thd, ER_BINLOG_CACHE_SIZE_GREATER_THAN_MAX),
1155           (ulong)binlog_cache_size, (ulong)max_binlog_cache_size);
1156     } else {
1157       LogErr(WARNING_LEVEL, ER_BINLOG_CACHE_SIZE_TOO_LARGE, binlog_cache_size,
1158              (ulong)max_binlog_cache_size);
1159     }
1160     binlog_cache_size = static_cast<ulong>(max_binlog_cache_size);
1161   }
1162 }
1163 
1164 /**
1165   Checks if the BINLOG_STMT_CACHE_SIZE's value is greater than
1166   MAX_BINLOG_STMT_CACHE_SIZE. If this happens, the BINLOG_STMT_CACHE_SIZE is set
1167   to MAX_BINLOG_STMT_CACHE_SIZE.
1168 */
check_binlog_stmt_cache_size(THD * thd)1169 void check_binlog_stmt_cache_size(THD *thd) {
1170   if (binlog_stmt_cache_size > max_binlog_stmt_cache_size) {
1171     if (thd) {
1172       push_warning_printf(
1173           thd, Sql_condition::SL_WARNING,
1174           ER_BINLOG_STMT_CACHE_SIZE_GREATER_THAN_MAX,
1175           ER_THD(thd, ER_BINLOG_STMT_CACHE_SIZE_GREATER_THAN_MAX),
1176           (ulong)binlog_stmt_cache_size, (ulong)max_binlog_stmt_cache_size);
1177     } else {
1178       LogErr(WARNING_LEVEL, ER_BINLOG_STMT_CACHE_SIZE_TOO_LARGE,
1179              binlog_stmt_cache_size, (ulong)max_binlog_stmt_cache_size);
1180     }
1181     binlog_stmt_cache_size = static_cast<ulong>(max_binlog_stmt_cache_size);
1182   }
1183 }
1184 
1185 /**
1186  Check whether binlog_hton has valid slot and enabled
1187 */
binlog_enabled()1188 bool binlog_enabled() {
1189   return (binlog_hton && binlog_hton->slot != HA_SLOT_UNDEF);
1190 }
1191 
1192 /*
1193  Save position of binary log transaction cache.
1194 
1195  SYNPOSIS
1196    binlog_trans_log_savepos()
1197 
1198    thd      The thread to take the binlog data from
1199    pos      Pointer to variable where the position will be stored
1200 
1201  DESCRIPTION
1202 
1203    Save the current position in the binary log transaction cache into
1204    the variable pointed to by 'pos'
1205 */
1206 
binlog_trans_log_savepos(THD * thd,my_off_t * pos)1207 static void binlog_trans_log_savepos(THD *thd, my_off_t *pos) {
1208   DBUG_TRACE;
1209   DBUG_ASSERT(pos != nullptr);
1210   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(thd);
1211   DBUG_ASSERT(mysql_bin_log.is_open());
1212   *pos = cache_mngr->trx_cache.get_byte_position();
1213   DBUG_PRINT("return", ("position: %lu", (ulong)*pos));
1214   cache_mngr->trx_cache.cache_state_checkpoint(*pos);
1215 }
1216 
binlog_dummy_recover(handlerton *,XA_recover_txn *,uint,MEM_ROOT *)1217 static int binlog_dummy_recover(handlerton *, XA_recover_txn *, uint,
1218                                 MEM_ROOT *) {
1219   return 0;
1220 }
1221 
1222 /**
1223   Auxiliary class to copy serialized events to the binary log and
1224   correct some of the fields that are not known until just before
1225   writing the event.
1226 
1227   This class allows feeding events in parts, so it is practical to use
1228   in do_write_cache() which reads events from an IO_CACHE where events
1229   may span mutiple cache pages.
1230 
1231   The following fields are fixed before writing the event:
1232   - end_log_pos is set
1233   - the checksum is computed if checksums are enabled
1234   - the length is incremented by the checksum size if checksums are enabled
1235 */
1236 class Binlog_event_writer : public Basic_ostream {
1237   MYSQL_BIN_LOG::Binlog_ofile *m_binlog_file;
1238   bool have_checksum;
1239   ha_checksum initial_checksum;
1240   ha_checksum checksum;
1241   uint32 end_log_pos;
1242   uchar header[LOG_EVENT_HEADER_LEN];
1243   my_off_t header_len = 0;
1244   uint32 event_len = 0;
1245 
1246  public:
1247   /**
1248     Constructs a new Binlog_event_writer. Should be called once before
1249     starting to flush the transaction or statement cache to the
1250     binlog.
1251 
1252     @param binlog_file to write to.
1253   */
Binlog_event_writer(MYSQL_BIN_LOG::Binlog_ofile * binlog_file)1254   Binlog_event_writer(MYSQL_BIN_LOG::Binlog_ofile *binlog_file)
1255       : m_binlog_file(binlog_file),
1256         have_checksum(binlog_checksum_options !=
1257                       binary_log::BINLOG_CHECKSUM_ALG_OFF),
1258         initial_checksum(my_checksum(0L, nullptr, 0)),
1259         checksum(initial_checksum),
1260         end_log_pos(binlog_file->position()) {
1261     // Simulate checksum error
1262     if (DBUG_EVALUATE_IF("fault_injection_crc_value", 1, 0)) checksum--;
1263   }
1264 
update_header()1265   void update_header() {
1266     event_len = uint4korr(header + EVENT_LEN_OFFSET);
1267 
1268     // Increase end_log_pos
1269     end_log_pos += event_len;
1270 
1271     // Update event length if it has checksum
1272     if (have_checksum) {
1273       int4store(header + EVENT_LEN_OFFSET, event_len + BINLOG_CHECKSUM_LEN);
1274       end_log_pos += BINLOG_CHECKSUM_LEN;
1275     }
1276 
1277     // Store end_log_pos
1278     int4store(header + LOG_POS_OFFSET, end_log_pos);
1279     // update the checksum
1280     if (have_checksum) checksum = my_checksum(checksum, header, header_len);
1281   }
1282 
write(const unsigned char * buffer,my_off_t length)1283   bool write(const unsigned char *buffer, my_off_t length) {
1284     DBUG_TRACE;
1285 
1286     while (length > 0) {
1287       /* Write event header into binlog */
1288       if (event_len == 0) {
1289         /* data in the buf may be smaller than header size.*/
1290         uint32 header_incr =
1291             std::min<uint32>(LOG_EVENT_HEADER_LEN - header_len, length);
1292 
1293         memcpy(header + header_len, buffer, header_incr);
1294         header_len += header_incr;
1295         buffer += header_incr;
1296         length -= header_incr;
1297 
1298         if (header_len == LOG_EVENT_HEADER_LEN) {
1299           update_header();
1300           if (m_binlog_file->write(header, header_len)) return true;
1301 
1302           event_len -= header_len;
1303           header_len = 0;
1304         }
1305       } else {
1306         my_off_t write_bytes = std::min<uint64>(length, event_len);
1307 
1308         if (m_binlog_file->write(buffer, write_bytes)) return true;
1309 
1310         // update the checksum
1311         if (have_checksum)
1312           checksum = my_checksum(checksum, buffer, write_bytes);
1313 
1314         event_len -= write_bytes;
1315         length -= write_bytes;
1316         buffer += write_bytes;
1317 
1318         // The whole event is copied, now add the checksum
1319         if (have_checksum && event_len == 0) {
1320           uchar checksum_buf[BINLOG_CHECKSUM_LEN];
1321 
1322           int4store(checksum_buf, checksum);
1323           if (m_binlog_file->write(checksum_buf, BINLOG_CHECKSUM_LEN))
1324             return true;
1325           checksum = initial_checksum;
1326         }
1327       }
1328     }
1329     return false;
1330   }
1331   /**
1332     Returns true if per event checksum is enabled.
1333   */
is_checksum_enabled()1334   bool is_checksum_enabled() { return have_checksum; }
1335 };
1336 
1337 /*
1338   this function is mostly a placeholder.
1339   conceptually, binlog initialization (now mostly done in MYSQL_BIN_LOG::open)
1340   should be moved here.
1341 */
1342 
binlog_init(void * p)1343 static int binlog_init(void *p) {
1344   binlog_hton = (handlerton *)p;
1345   binlog_hton->state = opt_bin_log ? SHOW_OPTION_YES : SHOW_OPTION_NO;
1346   binlog_hton->db_type = DB_TYPE_BINLOG;
1347   binlog_hton->savepoint_offset = sizeof(my_off_t);
1348   binlog_hton->close_connection = binlog_close_connection;
1349   binlog_hton->savepoint_set = binlog_savepoint_set;
1350   binlog_hton->savepoint_rollback = binlog_savepoint_rollback;
1351   binlog_hton->savepoint_rollback_can_release_mdl =
1352       binlog_savepoint_rollback_can_release_mdl;
1353   binlog_hton->commit = binlog_commit;
1354   binlog_hton->commit_by_xid = binlog_xa_commit;
1355   binlog_hton->rollback = binlog_rollback;
1356   binlog_hton->rollback_by_xid = binlog_xa_rollback;
1357   binlog_hton->prepare = binlog_prepare;
1358   binlog_hton->recover = binlog_dummy_recover;
1359   binlog_hton->flags = HTON_NOT_USER_SELECTABLE | HTON_HIDDEN;
1360   return 0;
1361 }
1362 
binlog_deinit(void *)1363 static int binlog_deinit(void *) {
1364   /* Using binlog as TC after the binlog has been unloaded, won't work */
1365   if (tc_log == &mysql_bin_log) tc_log = nullptr;
1366   binlog_hton = nullptr;
1367   return 0;
1368 }
1369 
/*
  close_connection hook of binlog_hton (installed by binlog_init()).
  Detaches the session's binlog cache manager from the session and then
  destroys it. Always returns 0.
*/
static int binlog_close_connection(handlerton *, THD *thd) {
  DBUG_TRACE;
  binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(thd);
  // Both caches are expected to be empty when the connection goes away.
  DBUG_ASSERT(cache_mngr->is_binlog_empty());
  DBUG_PRINT("debug", ("Set ha_data slot %d to 0x%llx", binlog_hton->slot,
                       (ulonglong) nullptr));
  // Clear the session's ha_data slot before the object is destroyed.
  thd_set_ha_data(thd, binlog_hton, nullptr);
  // Explicit destructor call followed by my_free(): the memory is released
  // separately from the object's destruction.
  cache_mngr->~binlog_cache_mngr();
  my_free(cache_mngr);
  return 0;
}
1381 
write_event(Log_event * ev)1382 int binlog_cache_data::write_event(Log_event *ev) {
1383   DBUG_TRACE;
1384 
1385   if (ev != nullptr) {
1386     DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
1387                     { DBUG_SET("+d,simulate_file_write_error"); });
1388 
1389     if (binary_event_serialize(ev, &m_cache)) {
1390       DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending", {
1391         DBUG_SET("-d,simulate_file_write_error");
1392         DBUG_SET("-d,simulate_disk_full_at_flush_pending");
1393         /*
1394            after +d,simulate_file_write_error the local cache
1395            is in unsane state. Since -d,simulate_file_write_error
1396            revokes the first simulation do_write_cache()
1397            can't be run without facing an assert.
1398            So it's blocked with the following 2nd simulation:
1399         */
1400         DBUG_SET("+d,simulate_do_write_cache_failure");
1401       });
1402       return 1;
1403     }
1404     if (ev->get_type_code() == binary_log::XID_EVENT) flags.with_xid = true;
1405     if (ev->is_using_immediate_logging()) flags.immediate = true;
1406     /* DDL gets marked as xid-requiring at its caching. */
1407     if (is_atomic_ddl_event(ev)) flags.with_xid = true;
1408     /* With respect to the event type being written */
1409     if (ev->is_sbr_logging_format()) flags.with_sbr = true;
1410     if (ev->is_rbr_logging_format()) flags.with_rbr = true;
1411     /* With respect to empty transactions */
1412     if (ev->starts_group()) flags.with_start = true;
1413     if (ev->ends_group()) flags.with_end = true;
1414     if (!ev->starts_group() && !ev->ends_group()) flags.with_content = true;
1415     event_counter++;
1416     DBUG_PRINT("debug",
1417                ("event_counter= %lu", static_cast<ulong>(event_counter)));
1418   }
1419   return 0;
1420 }
1421 
assign_automatic_gtids_to_flush_group(THD * first_seen)1422 bool MYSQL_BIN_LOG::assign_automatic_gtids_to_flush_group(THD *first_seen) {
1423   DBUG_TRACE;
1424   bool error = false;
1425   bool is_global_sid_locked = false;
1426   rpl_sidno locked_sidno = 0;
1427 
1428   for (THD *head = first_seen; head; head = head->next_to_commit) {
1429     DBUG_ASSERT(head->variables.gtid_next.type != UNDEFINED_GTID);
1430 
1431     /* Generate GTID */
1432     if (head->variables.gtid_next.type == AUTOMATIC_GTID) {
1433       if (!is_global_sid_locked) {
1434         global_sid_lock->rdlock();
1435         is_global_sid_locked = true;
1436       }
1437       if (gtid_state->generate_automatic_gtid(
1438               head,
1439               head->get_transaction()->get_rpl_transaction_ctx()->get_sidno(),
1440               head->get_transaction()->get_rpl_transaction_ctx()->get_gno(),
1441               &locked_sidno) != RETURN_STATUS_OK) {
1442         head->commit_error = THD::CE_FLUSH_ERROR;
1443         error = true;
1444       }
1445     } else {
1446       DBUG_PRINT("info",
1447                  ("thd->variables.gtid_next.type=%d "
1448                   "thd->owned_gtid.sidno=%d",
1449                   head->variables.gtid_next.type, head->owned_gtid.sidno));
1450       if (head->variables.gtid_next.type == ASSIGNED_GTID)
1451         DBUG_ASSERT(head->owned_gtid.sidno > 0);
1452       else {
1453         DBUG_ASSERT(head->variables.gtid_next.type == ANONYMOUS_GTID);
1454         DBUG_ASSERT(head->owned_gtid.sidno == THD::OWNED_SIDNO_ANONYMOUS);
1455       }
1456     }
1457   }
1458 
1459   if (locked_sidno > 0) gtid_state->unlock_sidno(locked_sidno);
1460 
1461   if (is_global_sid_locked) global_sid_lock->unlock();
1462 
1463   return error;
1464 }
1465 
/**
  Write a transaction to the binary log: first the Gtid_log_event, then the
  contents of the statement or transaction cache.

  Before writing, the commit timestamps and server versions stored in the
  Gtid_log_event are determined, and the corresponding session variables
  are reset for the next transaction. After a successful write the
  transaction compression statistics are updated.

  @param thd Thread that is committing.
  @param cache_data The cache that is flushing.
  @param writer The events will be written to this Binlog_event_writer object.

  @retval false Success.
  @retval true Error.
*/
bool MYSQL_BIN_LOG::write_transaction(THD *thd, binlog_cache_data *cache_data,
                                      Binlog_event_writer *writer) {
  DBUG_TRACE;

  /*
    The GTID for the THD was assigned at
    assign_automatic_gtids_to_flush_group()
  */
  DBUG_ASSERT(thd->owned_gtid.sidno == THD::OWNED_SIDNO_ANONYMOUS ||
              thd->owned_gtid.sidno > 0);

  int64 sequence_number, last_committed;
  /* Generate logical timestamps for MTS */
  m_dependency_tracker.get_dependency(thd, sequence_number, last_committed);

  /*
    In case both the transaction cache and the statement cache are
    non-empty, both will be flushed in sequence and logged as
    different transactions. Then the second transaction must only
    be executed after the first one has committed. Therefore, we
    need to set last_committed for the second transaction equal to
    last_committed for the first transaction. This is done in
    binlog_cache_data::flush. binlog_cache_data::flush uses the
    condition trn_ctx->last_committed==SEQ_UNINIT to detect this
    situation, hence the need to set it here.
  */
  thd->get_transaction()->last_committed = SEQ_UNINIT;

  /*
    For delayed replication and also for the purpose of lag monitoring,
    we assume that the commit timestamp of the transaction is the time of
    executing this code (the time of writing the Gtid_log_event to the binary
    log).
  */
  ulonglong immediate_commit_timestamp = my_micro_time();

  /*
    When the original_commit_timestamp session variable is set to a value
    other than UNDEFINED_COMMIT_TIMESTAMP, it means that either the timestamp
    is known ( > 0 ) or the timestamp is not known ( == 0 ).
  */
  ulonglong original_commit_timestamp =
      thd->variables.original_commit_timestamp;
  /*
    When original_commit_timestamp == UNDEFINED_COMMIT_TIMESTAMP, we assume
    that:
    a) it is not known if this thread is a slave applier ( = 0 );
    b) this is a new transaction ( = immediate_commit_timestamp);
  */
  if (original_commit_timestamp == UNDEFINED_COMMIT_TIMESTAMP) {
    /*
      When applying a transaction using replication, assume that the
      original commit timestamp is not known (the transaction wasn't
      originated on the current server).
    */
    if (thd->slave_thread || thd->is_binlog_applier()) {
      original_commit_timestamp = 0;
    } else
    /* Assume that this transaction is original from this server */
    {
      DBUG_EXECUTE_IF("rpl_invalid_gtid_timestamp",
                      // add one hour to the commit timestamps
                      immediate_commit_timestamp += 3600000000;);
      original_commit_timestamp = immediate_commit_timestamp;
    }
  } else {
    // Clear the session variable to have cleared states for next transaction.
    thd->variables.original_commit_timestamp = UNDEFINED_COMMIT_TIMESTAMP;
  }

  if (thd->slave_thread) {
    // log warning if the replication timestamps are invalid
    if (original_commit_timestamp > immediate_commit_timestamp &&
        !thd->rli_slave->get_c_rli()->gtid_timestamps_warning_logged) {
      LogErr(WARNING_LEVEL, ER_INVALID_REPLICATION_TIMESTAMPS);
      thd->rli_slave->get_c_rli()->gtid_timestamps_warning_logged = true;
    } else {
      // Report once that the timestamps are valid again after a warning.
      if (thd->rli_slave->get_c_rli()->gtid_timestamps_warning_logged &&
          original_commit_timestamp <= immediate_commit_timestamp) {
        LogErr(WARNING_LEVEL, ER_RPL_TIMESTAMPS_RETURNED_TO_NORMAL);
        thd->rli_slave->get_c_rli()->gtid_timestamps_warning_logged = false;
      }
    }
  }

  uint32_t trx_immediate_server_version =
      do_server_version_int(::server_version);
  // Clear the session variable to have cleared states for next transaction.
  thd->variables.immediate_server_version = UNDEFINED_SERVER_VERSION;
  DBUG_EXECUTE_IF("fixed_server_version",
                  trx_immediate_server_version = 888888;);
  DBUG_EXECUTE_IF("gr_fixed_server_version",
                  trx_immediate_server_version = 777777;);

  /*
    When the original_server_version session variable is set to a value
    other than UNDEFINED_SERVER_VERSION, it means that either the
    server version is known or the server_version is not known
    (UNKNOWN_SERVER_VERSION).
  */
  uint32_t trx_original_server_version = thd->variables.original_server_version;

  /*
    When original_server_version == UNDEFINED_SERVER_VERSION, we assume
    that:
    a) it is not known if this thread is a slave applier ( = 0 );
    b) this is a new transaction ( = ::server_version);
  */
  if (trx_original_server_version == UNDEFINED_SERVER_VERSION) {
    /*
      When applying a transaction using replication, assume that the
      original server version is not known (the transaction wasn't
      originated on the current server).
    */
    if (thd->slave_thread || thd->is_binlog_applier()) {
      trx_original_server_version = UNKNOWN_SERVER_VERSION;
    } else
    /* Assume that this transaction is original from this server */
    {
      trx_original_server_version = trx_immediate_server_version;
    }
  } else {
    // Clear the session variable to have cleared states for next transaction.
    thd->variables.original_server_version = UNDEFINED_SERVER_VERSION;
  }
  Gtid_log_event gtid_event(
      thd, cache_data->is_trx_cache(), last_committed, sequence_number,
      cache_data->may_have_sbr_stmts(), original_commit_timestamp,
      immediate_commit_timestamp, trx_original_server_version,
      trx_immediate_server_version);

  // Set the transaction length, based on cache info
  gtid_event.set_trx_length_by_cache_size(cache_data->get_byte_position(),
                                          writer->is_checksum_enabled(),
                                          cache_data->get_event_counter());

  DBUG_PRINT("debug", ("cache_data->get_byte_position()= %llu",
                       cache_data->get_byte_position()));
  DBUG_PRINT("debug", ("cache_data->get_event_counter()= %lu",
                       static_cast<ulong>(cache_data->get_event_counter())));
  DBUG_PRINT("debug", ("writer->is_checksum_enabled()= %s",
                       YESNO(writer->is_checksum_enabled())));
  DBUG_PRINT("debug", ("gtid_event.get_event_length()= %lu",
                       static_cast<ulong>(gtid_event.get_event_length())));
  DBUG_PRINT("info",
             ("transaction_length= %llu", gtid_event.transaction_length));

  // The Gtid_log_event goes first; the cache is not written if this fails.
  bool ret = gtid_event.write(writer);
  if (ret) goto end;

  /*
    finally write the transaction data, if it was not compressed
    and written as part of the gtid event already
  */
  ret = mysql_bin_log.write_cache(thd, cache_data, writer);

  if (!ret) {
    // update stats if monitoring is active
    binlog::global_context.monitoring_context()
        .transaction_compression()
        .update(binlog::monitoring::log_type::BINARY,
                cache_data->get_compression_type(), thd->owned_gtid,
                gtid_event.immediate_commit_timestamp,
                cache_data->get_compressed_size(),
                cache_data->get_decompressed_size());
  }

end:
  return ret;
}
1647 
/**
  Perform the GTID-related work needed when the transaction of @c thd ends.

  The action depends on thd->owned_gtid:
  - sidno > 0: when opt_bin_log is off, or this is a slave thread and
    opt_log_slave_updates is off, the GTID is stored through
    gtid_state->save(); otherwise a BEGIN event is written to the
    transaction cache and mysql_bin_log.commit() is called.
  - sidno == THD::OWNED_SIDNO_ANONYMOUS, or
    thd->has_gtid_consistency_violation is set:
    gtid_state->update_on_commit() is called.
  - gtid_next is ASSIGNED_GTID while no GTID is owned:
    gtid_state->update_on_commit() is called.

  @param thd  The session whose transaction is ending.

  @retval 0  Success, or nothing had to be done.
  @retval 1  Saving the GTID, setting up the binlog cache data, writing the
             BEGIN event, or committing failed.
*/
int MYSQL_BIN_LOG::gtid_end_transaction(THD *thd) {
  DBUG_TRACE;

  DBUG_PRINT("info", ("query=%s", thd->query().str));

  if (thd->owned_gtid.sidno > 0) {
    DBUG_ASSERT(thd->variables.gtid_next.type == ASSIGNED_GTID);

    if (!opt_bin_log || (thd->slave_thread && !opt_log_slave_updates)) {
      /*
        If the binary log is disabled for this thread (either by
        log_bin=0 or sql_log_bin=0 or by log_slave_updates=0 for a
        slave thread), then the statement must not be written to the
        binary log.  In this case, we just save the GTID into the
        table directly.

        (This only happens for DDL, since DML will save the GTID into
        table and release ownership inside ha_commit_trans.)
      */
      if (gtid_state->save(thd) != 0) {
        // Saving failed: undo the GTID state changes and report the error.
        gtid_state->update_on_rollback(thd);
        return 1;
      } else if (!has_commit_order_manager(thd)) {
        /*
          The gtid_state->save implicitly performs the commit, in the following
          stack:
            Gtid_state::save ->
            Gtid_table_persistor::save ->
            Gtid_table_access_context::deinit ->
            System_table_access::close_table ->
            ha_commit_trans ->
            Relay_log_info::pre_commit ->
            Slave_worker::commit_positions(THD*) ->
            Slave_worker::commit_positions(THD*,Log_event*,...) ->
            Slave_worker::flush_info ->
            Rpl_info_handler::flush_info ->
            Rpl_info_table::do_flush_info ->
            Rpl_info_table_access::close_table ->
            System_table_access::close_table ->
            ha_commit_trans ->
            MYSQL_BIN_LOG::commit ->
            ha_commit_low

          If slave-preserve-commit-order is disabled, it does not call
          update_on_commit from this stack. The reason is as follows:

          In the normal case of MYSQL_BIN_LOG::commit, where the transaction is
          going to be written to the binary log, it invokes
          MYSQL_BIN_LOG::ordered_commit, which updates the GTID state (the call
          gtid_state->update_commit_group(first) in process_commit_stage_queue).
          However, when MYSQL_BIN_LOG::commit is invoked from this stack, it is
          because the transaction is not going to be written to the binary log,
          and then MYSQL_BIN_LOG::commit has a special case that calls
          ha_commit_low directly, skipping ordered_commit. Therefore, the GTID
          state is not updated in this stack.

          On the other hand, if slave-preserve-commit-order is enabled, the
          logic that orders commit carries out a subset of the binlog group
          commit from within ha_commit_low, and this includes updating the GTID
          state. In particular, there is the following call stack under
          ha_commit_low:

            ha_commit_low ->
            Commit_order_manager::wait_and_finish ->
            Commit_order_manager::finish ->
            Commit_order_manager::flush_engine_and_signal_threads ->
            Gtid_state::update_commit_group

          Therefore, it is necessary to call update_on_commit only in case we
          are not using slave-preserve-commit-order here.
        */
        gtid_state->update_on_commit(thd);
      }
    } else {
      /*
        If statement is supposed to be written to binlog, we write it
        to the binary log.  Inserting into table and releasing
        ownership will be done in the binlog commit handler.
      */

      /*
        thd->cache_mngr may be uninitialized if the first transaction
        executed by the client is empty.
      */
      if (thd->binlog_setup_trx_data()) return 1;
      binlog_cache_data *cache_data = &thd_get_cache_mngr(thd)->trx_cache;

      // Generate BEGIN event
      Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), true, false, true, 0,
                            true);
      DBUG_ASSERT(!qinfo.is_using_immediate_logging());

      /*
        Write BEGIN event and then commit (which will generate commit
        event and Gtid_log_event)
      */
      DBUG_PRINT("debug", ("Writing to trx_cache"));
      if (cache_data->write_event(&qinfo) || mysql_bin_log.commit(thd, true))
        return 1;
    }
  } else if (thd->owned_gtid.sidno == THD::OWNED_SIDNO_ANONYMOUS ||
             /*
               A transaction with an empty owned gtid should call
               end_gtid_violating_transaction(...) to clear the
               flag thd->has_gtid_consistency_violation in case
               it is set. It missed the clear in ordered_commit,
               because its binlog transaction cache is empty.
             */
             thd->has_gtid_consistency_violation)

  {
    gtid_state->update_on_commit(thd);
  } else if (thd->variables.gtid_next.type == ASSIGNED_GTID &&
             thd->owned_gtid_is_empty()) {
    // gtid_next was assigned but no GTID is owned by this session.
    DBUG_ASSERT(thd->has_gtid_consistency_violation == false);
    gtid_state->update_on_commit(thd);
  }

  return 0;
}
1768 
reencrypt_logs()1769 bool MYSQL_BIN_LOG::reencrypt_logs() {
1770   DBUG_TRACE;
1771 
1772   if (!is_open()) return false;
1773 
1774   std::string error_message;
1775   /* Gather the set of files to be accessed. */
1776   list<string> filename_list;
1777   LOG_INFO linfo;
1778   int error = 0;
1779   list<string>::reverse_iterator rit;
1780 
1781   /* Read binary/relay log file names from index file. */
1782   mysql_mutex_lock(&LOCK_index);
1783   for (error = find_log_pos(&linfo, nullptr, false); !error;
1784        error = find_next_log(&linfo, false)) {
1785     filename_list.push_back(string(linfo.log_file_name));
1786   }
1787   mysql_mutex_unlock(&LOCK_index);
1788   if (error != LOG_INFO_EOF ||
1789       DBUG_EVALUATE_IF("fail_to_open_index_file", true, false)) {
1790     error_message.assign("I/O error reading index file '");
1791     error_message.append(index_file_name);
1792     error_message.append("'");
1793     goto err;
1794   }
1795 
1796   rit = filename_list.rbegin();
1797   /* Skip the last binary/relay log. */
1798   if (rit != filename_list.rend()) rit++;
1799   /* Iterate backwards through binary/relay logs. */
1800   while (rit != filename_list.rend()) {
1801     const char *filename = rit->c_str();
1802     DBUG_EXECUTE_IF("purge_logs_during_reencryption", {
1803       purge_logs(filename, true, true /*need_lock_index=true*/,
1804                  true /*need_update_threads=true*/, nullptr, false);
1805     });
1806     MUTEX_LOCK(lock, &LOCK_index);
1807     std::unique_ptr<Binlog_ofile> ofile(
1808         Binlog_ofile::open_existing(key_file_binlog, filename, MYF(MY_WME)));
1809 
1810     if (ofile == nullptr ||
1811         DBUG_EVALUATE_IF("fail_to_open_log_file", true, false) ||
1812         DBUG_EVALUATE_IF("fail_to_read_index_file", true, false)) {
1813       /* If we can not open the log file, check if it exists in index file. */
1814       error = find_log_pos(&linfo, filename, false);
1815       DBUG_EXECUTE_IF("fail_to_read_index_file", error = LOG_INFO_IO;);
1816       if (error == LOG_INFO_EOF) {
1817         /* If it does not exist in index file, re-encryption has finished. */
1818         if (current_thd->is_error()) current_thd->clear_error();
1819         break;
1820       } else if (error == 0) {
1821         /* If it exists in index file, failed to open the log file. */
1822         error_message.assign("Failed to open log file '");
1823         error_message.append(filename);
1824         error_message.append("'");
1825         goto err;
1826       } else if (error == LOG_INFO_IO) {
1827         /* Failed to read index file. */
1828         error_message.assign("I/O error reading index file '");
1829         error_message.append(index_file_name);
1830         error_message.append("'");
1831         goto err;
1832       }
1833     }
1834 
1835     if (ofile->is_encrypted()) {
1836       std::unique_ptr<Truncatable_ostream> pipeline_head =
1837           ofile->get_pipeline_head();
1838       std::unique_ptr<Binlog_encryption_ostream> binlog_encryption_ostream(
1839           down_cast<Binlog_encryption_ostream *>(pipeline_head.release()));
1840 
1841       auto ret_value = binlog_encryption_ostream->reencrypt();
1842       if (ret_value.first) {
1843         error_message.assign("Failed to re-encrypt log file '");
1844         error_message.append(filename);
1845         error_message.append("': ");
1846         error_message.append(ret_value.second.c_str());
1847         goto err;
1848       }
1849     }
1850 
1851     rit++;
1852   }
1853 
1854   filename_list.clear();
1855 
1856   return false;
1857 
1858 err:
1859   if (current_thd->is_error()) current_thd->clear_error();
1860   my_error(ER_BINLOG_MASTER_KEY_ROTATION_FAIL_TO_REENCRYPT_LOG, MYF(0),
1861            error_message.c_str());
1862   filename_list.clear();
1863 
1864   return true;
1865 }
1866 
compress(THD * thd)1867 bool binlog_cache_data::compress(THD *thd) {
1868   DBUG_TRACE;
1869   auto error{false};
1870   auto ctype{binary_log::transaction::compression::type::NONE};
1871   auto uncompressed_size{m_cache.length()};
1872   auto size{uncompressed_size};
1873   auto &cctx{thd->rpl_thd_ctx.transaction_compression_ctx()};
1874   binary_log::transaction::compression::Compressor *compressor{nullptr};
1875 
1876   // no compression enabled (ctype == NONE at this point)
1877   if (thd->variables.binlog_trx_compression == false) goto end;
1878 
1879   // do not compress if there are incident events
1880   DBUG_EXECUTE_IF("binlog_compression_inject_incident", set_incident(););
1881   if (has_incident()) goto end;
1882 
1883   // do not compress if there are non-transactional changes
1884   if (thd->get_transaction()->has_modified_non_trans_table(
1885           Transaction_ctx::STMT) ||
1886       thd->get_transaction()->has_modified_non_trans_table(
1887           Transaction_ctx::SESSION))
1888     goto end;
1889 
1890   // do not compress if has SBR
1891   if (may_have_sbr_stmts()) goto end;
1892 
1893   // Unable to get a reference to a compressor, fallback to
1894   // non compressed
1895   if ((compressor = cctx.get_compressor(thd)) == nullptr) goto end;
1896 
1897   // compression is enabled and all pre-conditions checked.
1898   // now compress
1899   else {
1900     std::size_t old_capacity{0};
1901     unsigned char *buffer{nullptr};
1902     unsigned char *old_buffer{nullptr};
1903     Transaction_payload_log_event tple{thd};
1904     Compressed_ostream stream;
1905     PSI_stage_info old_stage;
1906 
1907     // set the thread stage to compressing transaction
1908     thd->enter_stage(&stage_binlog_transaction_compress, &old_stage, __func__,
1909                      __FILE__, __LINE__);
1910     // do we have enough compression buffer ? If not swap with a larger one
1911     std::tie(buffer, std::ignore, old_capacity) = compressor->get_buffer();
1912     if (old_capacity < size) {
1913       old_buffer = buffer;
1914       auto new_buffer = (unsigned char *)malloc(size);
1915       if (new_buffer)
1916         compressor->set_buffer(new_buffer, size);
1917       else {
1918         /* purecov: begin inspected */
1919         // OOM
1920         error = true;
1921         goto compression_end;
1922         /* purecov: end */
1923       }
1924     }
1925 
1926     ctype = compressor->compression_type_code();
1927 
1928     compressor->open();
1929 
1930     // inject the compressor in the output stream
1931     stream.set_compressor(compressor);
1932 
1933     // FIXME: innefficient, we should not copy caches around
1934     //        This should be fixed when we revamp the capture
1935     //        cache handling (and make this more geared towards
1936     //        possible enhancements, such as streaming the changes)
1937     //        Also, if the cache actually spills to disk, this may
1938     //        the impact may be amplified, since reiniting the
1939     //        causes a flush to disk
1940     if ((error = m_cache.copy_to(&stream))) goto compression_end;
1941 
1942     compressor->close();
1943 
1944     if ((error = m_cache.truncate(0))) goto compression_end;
1945     // Since we deleted all events from the cache, we also need to
1946     // reset event_counter.
1947     event_counter = 0;
1948 
1949     // fill in the new transport event
1950     std::tie(buffer, size, std::ignore) = compressor->get_buffer();
1951     tple.set_payload((const char *)buffer);
1952     tple.set_payload_size(size);
1953     tple.set_compression_type(ctype);
1954     tple.set_uncompressed_size(uncompressed_size);
1955 
1956     // write back the new cache contents
1957     error = write_event(&tple);
1958 
1959   compression_end:
1960     // revert back to the default buffer, so that we don't overuse memory
1961     if (old_buffer) {
1962       std::tie(buffer, std::ignore, std::ignore) = compressor->get_buffer();
1963       compressor->set_buffer(old_buffer, old_capacity);
1964       free(buffer);
1965     }
1966 
1967     // revert the stage if needed
1968     if (old_stage.m_key != 0) THD_STAGE_INFO(thd, old_stage);
1969   }
1970 
1971 end:
1972   if (!error) {
1973     set_compression_type(ctype);
1974     set_compressed_size(m_cache.length());
1975     set_decompressed_size(uncompressed_size);
1976   }
1977   return error;
1978 }
1979 
1980 /**
1981   This function finalizes the cache preparing for commit or rollback.
1982 
1983   The function just writes all the necessary events to the cache but
1984   does not flush the data to the binary log file. That is the role of
1985   the binlog_cache_data::flush function.
1986 
1987   @see binlog_cache_data::flush
1988 
1989   @param thd                The thread whose transaction should be flushed
1990   @param end_event          The end event either commit/rollback
1991 
1992   @return
1993     nonzero if an error pops up when flushing the cache.
1994 */
finalize(THD * thd,Log_event * end_event)1995 int binlog_cache_data::finalize(THD *thd, Log_event *end_event) {
1996   DBUG_TRACE;
1997   if (!is_binlog_empty()) {
1998     DBUG_ASSERT(!flags.finalized);
1999     if (int error = flush_pending_event(thd)) return error;
2000     if (int error = write_event(end_event)) return error;
2001     if (int error = this->compress(thd)) return error;
2002     DBUG_PRINT("debug", ("flags.finalized: %s", YESNO(flags.finalized)));
2003     flags.finalized = true;
2004   }
2005   return 0;
2006 }
2007 
2008 /**
2009    The method writes XA END query to XA-prepared transaction's cache
2010    and calls the "basic" finalize().
2011 
2012    @return error code, 0 success
2013 */
2014 
finalize(THD * thd,Log_event * end_event,XID_STATE * xs)2015 int binlog_cache_data::finalize(THD *thd, Log_event *end_event, XID_STATE *xs) {
2016   int error = 0;
2017   char buf[XID::ser_buf_size];
2018   char query[sizeof("XA END") + 1 + sizeof(buf)];
2019   int qlen = sprintf(query, "XA END %s", xs->get_xid()->serialize(buf));
2020   Query_log_event qev(thd, query, qlen, true, false, true, 0);
2021 
2022   if ((error = write_event(&qev))) return error;
2023 
2024   return finalize(thd, end_event);
2025 }
2026 
2027 /**
2028   Flush caches to the binary log.
2029 
2030   If the cache is finalized, the cache will be flushed to the binary
2031   log file. If the cache is not finalized, nothing will be done.
2032 
2033   If flushing fails for any reason, an error will be reported and the
2034   cache will be reset. Flushing can fail in two circumstances:
2035 
2036   - It was not possible to write the cache to the file. In this case,
2037     it does not make sense to keep the cache.
2038 
2039   - The cache was successfully written to disk but post-flush actions
2040     (such as binary log rotation) failed. In this case, the cache is
2041     already written to disk and there is no reason to keep it.
2042 
2043   @see binlog_cache_data::finalize
2044  */
flush(THD * thd,my_off_t * bytes_written,bool * wrote_xid)2045 int binlog_cache_data::flush(THD *thd, my_off_t *bytes_written,
2046                              bool *wrote_xid) {
2047   /*
2048     Doing a commit or a rollback including non-transactional tables,
2049     i.e., ending a transaction where we might write the transaction
2050     cache to the binary log.
2051 
2052     We can always end the statement when ending a transaction since
2053     transactions are not allowed inside stored functions. If they
2054     were, we would have to ensure that we're not ending a statement
2055     inside a stored function.
2056   */
2057   DBUG_TRACE;
2058   DBUG_PRINT("debug", ("flags.finalized: %s", YESNO(flags.finalized)));
2059   int error = 0;
2060   if (flags.finalized) {
2061     my_off_t bytes_in_cache = m_cache.length();
2062     Transaction_ctx *trn_ctx = thd->get_transaction();
2063 
2064     DBUG_PRINT("debug", ("bytes_in_cache: %llu", bytes_in_cache));
2065 
2066     trn_ctx->sequence_number = mysql_bin_log.m_dependency_tracker.step();
2067 
2068     /*
2069       In case of two caches the transaction is split into two groups.
2070       The 2nd group is considered to be a successor of the 1st rather
2071       than to have a common commit parent with it.
2072       Notice that due to a simple method of detection that the current is
2073       the 2nd cache being flushed, the very first few transactions may be logged
2074       sequentially (a next one is tagged as if a preceding one is its
2075       commit parent).
2076     */
2077     if (trn_ctx->last_committed == SEQ_UNINIT)
2078       trn_ctx->last_committed = trn_ctx->sequence_number - 1;
2079 
2080     /*
2081       The GTID is written prior to flushing the statement cache, if
2082       the transaction has written to the statement cache; and prior to
2083       flushing the transaction cache if the transaction has written to
2084       the transaction cache.  If GTIDs are enabled, then transactional
2085       and non-transactional updates cannot be mixed, so at most one of
2086       the caches can be non-empty, so just one GTID will be
2087       generated. If GTIDs are disabled, then no GTID is generated at
2088       all; if both the transactional cache and the statement cache are
2089       non-empty then we get two Anonymous_gtid_log_events, which is
2090       correct.
2091     */
2092     Binlog_event_writer writer(mysql_bin_log.get_binlog_file());
2093 
2094     /* The GTID ownership process might set the commit_error */
2095     error = (thd->commit_error == THD::CE_FLUSH_ERROR);
2096 
2097     DBUG_EXECUTE_IF("simulate_binlog_flush_error", {
2098       if (rand() % 3 == 0) {
2099         thd->commit_error = THD::CE_FLUSH_ERROR;
2100       }
2101     };);
2102 
2103     DBUG_EXECUTE_IF("fault_injection_reinit_io_cache_while_flushing_to_file",
2104                     { DBUG_SET("+d,fault_injection_reinit_io_cache"); });
2105 
2106     if (!error)
2107       if ((error = mysql_bin_log.write_transaction(thd, this, &writer)))
2108         thd->commit_error = THD::CE_FLUSH_ERROR;
2109 
2110     DBUG_EXECUTE_IF("fault_injection_reinit_io_cache_while_flushing_to_file",
2111                     { DBUG_SET("-d,fault_injection_reinit_io_cache"); });
2112 
2113     if (flags.with_xid && error == 0) *wrote_xid = true;
2114 
2115     /*
2116       Reset have to be after the if above, since it clears the
2117       with_xid flag
2118     */
2119     reset();
2120     if (bytes_written) *bytes_written = bytes_in_cache;
2121   }
2122   DBUG_ASSERT(!flags.finalized);
2123   return error;
2124 }
2125 
2126 /**
2127   This function truncates the transactional cache upon committing or rolling
2128   back either a transaction or a statement.
2129 
2130   @param thd        The thread whose transaction should be flushed
2131   @param all        @c true means truncate the transaction, otherwise the
2132                     statement must be truncated.
2133 
2134   @return
2135     nonzero if an error pops up when truncating the transactional cache.
2136 */
truncate(THD * thd,bool all)2137 int binlog_trx_cache_data::truncate(THD *thd, bool all) {
2138   DBUG_TRACE;
2139   int error = 0;
2140 
2141   DBUG_PRINT("info",
2142              ("thd->options={ %s %s}, transaction: %s",
2143               FLAGSTR(thd->variables.option_bits, OPTION_NOT_AUTOCOMMIT),
2144               FLAGSTR(thd->variables.option_bits, OPTION_BEGIN),
2145               all ? "all" : "stmt"));
2146 
2147   remove_pending_event();
2148 
2149   /*
2150     If rolling back an entire transaction or a single statement not
2151     inside a transaction, we reset the transaction cache.
2152     Even though formally the atomic DDL statement may not end multi-statement
2153     transaction the cache needs full resetting as there must
2154     be no other data in it but belonging to the DDL.
2155   */
2156   if (ending_trans(thd, all)) {
2157     if (has_incident()) {
2158       const char *err_msg =
2159           "Error happend while resetting the transaction "
2160           "cache for a rolled back transaction or a single "
2161           "statement not inside a transaction.";
2162       error = mysql_bin_log.write_incident(thd, true /*need_lock_log=true*/,
2163                                            err_msg);
2164     }
2165     reset();
2166   }
2167   /*
2168     If rolling back a statement in a transaction, we truncate the
2169     transaction cache to remove the statement.
2170   */
2171   else if (get_prev_position() != MY_OFF_T_UNDEF)
2172     restore_prev_position();
2173 
2174   thd->clear_binlog_table_maps();
2175 
2176   return error;
2177 }
2178 
get_xa_opt(THD * thd)2179 inline enum xa_option_words get_xa_opt(THD *thd) {
2180   enum xa_option_words xa_opt = XA_NONE;
2181   switch (thd->lex->sql_command) {
2182     case SQLCOM_XA_COMMIT:
2183       xa_opt =
2184           static_cast<Sql_cmd_xa_commit *>(thd->lex->m_sql_cmd)->get_xa_opt();
2185       break;
2186     default:
2187       break;
2188   }
2189 
2190   return xa_opt;
2191 }
2192 
2193 /**
2194    Predicate function yields true when XA transaction is
2195    being logged having a proper state ready for prepare or
2196    commit in one phase.
2197 
2198    @param thd    THD pointer of running transaction
2199    @return true  When the being prepared transaction should be binlogged,
2200            false otherwise.
2201 */
2202 
is_loggable_xa_prepare(THD * thd)2203 inline bool is_loggable_xa_prepare(THD *thd) {
2204   /*
2205     simulate_commit_failure is doing a trick with XID_STATE while
2206     the ongoing transaction is not XA, and therefore to be errored out,
2207     asserted below. In that case because of the
2208     latter fact the function returns @c false.
2209   */
2210   DBUG_EXECUTE_IF("simulate_commit_failure", {
2211     XID_STATE *xs = thd->get_transaction()->xid_state();
2212     DBUG_ASSERT((thd->is_error() && xs->get_state() == XID_STATE::XA_IDLE) ||
2213                 xs->get_state() == XID_STATE::XA_NOTR);
2214   });
2215 
2216   return DBUG_EVALUATE_IF(
2217       "simulate_commit_failure", false,
2218       thd->get_transaction()->xid_state()->has_state(XID_STATE::XA_IDLE));
2219 }
2220 
binlog_prepare(handlerton *,THD * thd,bool all)2221 static int binlog_prepare(handlerton *, THD *thd, bool all) {
2222   DBUG_TRACE;
2223   if (!all) {
2224     thd->get_transaction()->store_commit_parent(
2225         mysql_bin_log.m_dependency_tracker.get_max_committed_timestamp());
2226   }
2227 
2228   return all && is_loggable_xa_prepare(thd) ? mysql_bin_log.commit(thd, true)
2229                                             : 0;
2230 }
2231 
2232 /**
2233    Logging XA commit/rollback of a prepared transaction.
2234 
2235    The function is called at XA-commit or XA-rollback logging via
2236    two paths: the recovered-or-slave-applier or immediately through
2237    the  XA-prepared transaction connection itself.
2238    It fills in appropiate event in the statement cache whenever
2239    xid state is marked with is_binlogged() flag that indicates
2240    the prepared part of the transaction must've been logged.
2241 
2242    About early returns from the function.
2243    In the recovered-or-slave-applier case the function may be called
2244    for the 2nd time, which has_logged_xid monitors.
2245    ONE_PHASE option to XA-COMMIT is handled to skip
2246    writing XA-commit event now.
2247    And the final early return check is for the read-only XA that is
2248    not to be logged.
2249 
2250    @param thd          THD handle
2251    @param xid          a pointer to XID object that is serialized
2252    @param commit       when @c true XA-COMMIT is to be logged,
2253                        and @c false when it's XA-ROLLBACK.
2254    @return error code, 0 success
2255 */
2256 
do_binlog_xa_commit_rollback(THD * thd,XID * xid,bool commit)2257 inline int do_binlog_xa_commit_rollback(THD *thd, XID *xid, bool commit) {
2258   DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_COMMIT ||
2259               thd->lex->sql_command == SQLCOM_XA_ROLLBACK);
2260 
2261   XID_STATE *xid_state = thd->get_transaction()->xid_state();
2262   binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
2263 
2264   if (cache_mngr != nullptr && cache_mngr->has_logged_xid) return 0;
2265 
2266   if (get_xa_opt(thd) == XA_ONE_PHASE) return 0;
2267   if (!xid_state->is_binlogged())
2268     return 0;  // nothing was really logged at prepare
2269   if (thd->is_error() && DBUG_EVALUATE_IF("simulate_xa_rm_error", 0, 1))
2270     return 0;  // don't binlog if there are some errors.
2271 
2272   DBUG_ASSERT(!xid->is_null() ||
2273               !(thd->variables.option_bits & OPTION_BIN_LOG));
2274 
2275   char buf[XID::ser_buf_size];
2276   char query[(sizeof("XA ROLLBACK")) + 1 + sizeof(buf)];
2277   int qlen = sprintf(query, "XA %s %s", commit ? "COMMIT" : "ROLLBACK",
2278                      xid->serialize(buf));
2279   Query_log_event qinfo(thd, query, qlen, false, true, true, 0, false);
2280   return mysql_bin_log.write_event(&qinfo);
2281 }
2282 
2283 /**
2284    Logging XA commit/rollback of a prepared transaction in the case
2285    it was disconnected and resumed (recovered), or executed by a slave applier.
2286 
2287    @param thd         THD handle
2288    @param xid         a pointer to XID object
2289    @param commit      when @c true XA-COMMIT is logged, otherwise XA-ROLLBACK
2290 
2291    @return error code, 0 success
2292 */
2293 
binlog_xa_commit_or_rollback(THD * thd,XID * xid,bool commit)2294 inline xa_status_code binlog_xa_commit_or_rollback(THD *thd, XID *xid,
2295                                                    bool commit) {
2296   int error = 0;
2297 
2298 #ifndef DBUG_OFF
2299   {
2300     binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
2301     DBUG_ASSERT(!cache_mngr || !cache_mngr->has_logged_xid);
2302   }
2303 #endif
2304   if (!(error = do_binlog_xa_commit_rollback(thd, xid, commit))) {
2305     /*
2306       Error can't be propagated naturally via result.
2307       A grand-caller has to access to it through thd's da.
2308       todo:
2309       Bug #20488921 ERROR PROPAGATION DOES FULLY WORK IN XA
2310       stands in the way of implementing a failure simulation
2311       for XA PREPARE/COMMIT/ROLLBACK.
2312     */
2313     binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
2314 
2315     if (cache_mngr) cache_mngr->has_logged_xid = true;
2316     if (commit)
2317       error = mysql_bin_log.commit(thd, true);
2318     else
2319       error = mysql_bin_log.rollback(thd, true);
2320     if (cache_mngr) cache_mngr->has_logged_xid = false;
2321   }
2322 
2323   return error == TC_LOG::RESULT_SUCCESS ? XA_OK : XAER_RMERR;
2324 }
2325 
binlog_xa_commit(handlerton *,XID * xid)2326 static xa_status_code binlog_xa_commit(handlerton *, XID *xid) {
2327   return binlog_xa_commit_or_rollback(current_thd, xid, true);
2328 }
2329 
binlog_xa_rollback(handlerton *,XID * xid)2330 static xa_status_code binlog_xa_rollback(handlerton *, XID *xid) {
2331   return binlog_xa_commit_or_rollback(current_thd, xid, false);
2332 }
2333 
2334 /**
2335   When a fatal error occurs due to which binary logging becomes impossible and
2336   the user specified binlog_error_action= ABORT_SERVER the following function is
2337   invoked. This function pushes the appropriate error message to client and logs
2338   the same to server error log and then aborts the server.
2339 
2340   @param err_string          Error string which specifies the exact error
2341                              message from the caller.
2342 
2343   @retval
2344     none
2345 */
exec_binlog_error_action_abort(const char * err_string)2346 static void exec_binlog_error_action_abort(const char *err_string) {
2347   THD *thd = current_thd;
2348   /*
2349     When the code enters here it means that there was an error at higher layer
2350     and my_error function could have been invoked to let the client know what
2351     went wrong during the execution.
2352 
2353     But these errors will not let the client know that the server is going to
2354     abort. Even if we add an additional my_error function call at this point
2355     client will be able to see only the first error message that was set
2356     during the very first invocation of my_error function call.
2357 
2358     The advantage of having multiple my_error function calls are visible when
2359     the server is up and running and user issues SHOW WARNINGS or SHOW ERROR
2360     calls. In this special scenario server will be immediately aborted and
2361     user will not be able execute the above SHOW commands.
2362 
2363     Hence we clear the previous errors and push one critical error message to
2364     clients.
2365    */
2366   if (thd) {
2367     if (thd->is_error()) thd->clear_error();
2368     /*
2369       Send error to both client and to the server error log.
2370     */
2371     my_error(ER_BINLOG_LOGGING_IMPOSSIBLE, MYF(ME_FATALERROR), err_string);
2372   }
2373 
2374   LogErr(ERROR_LEVEL, ER_BINLOG_LOGGING_NOT_POSSIBLE, err_string);
2375   flush_error_log_messages();
2376 
2377   if (thd) thd->send_statement_status();
2378   abort();
2379 }
2380 
2381 /**
2382   This function is called once after each statement.
2383 
2384   @todo This function is currently not used any more and will
2385   eventually be eliminated. The real commit job is done in the
2386   MYSQL_BIN_LOG::commit function.
2387 
2388   @see MYSQL_BIN_LOG::commit
2389 
2390   @see handlerton::commit
2391 */
binlog_commit(handlerton *,THD *,bool)2392 static int binlog_commit(handlerton *, THD *, bool) {
2393   DBUG_TRACE;
2394   /*
2395     Nothing to do (any more) on commit.
2396    */
2397   return 0;
2398 }
2399 
2400 /**
2401   This function is called when a transaction or a statement is rolled back.
2402 
2403   @internal It is necessary to execute a rollback here if the
2404   transaction was rolled back because of executing a ROLLBACK TO
2405   SAVEPOINT command, but it is not used for normal rollback since
2406   MYSQL_BIN_LOG::rollback is called in that case.
2407 
2408   @todo Refactor code to introduce a <code>MYSQL_BIN_LOG::rollback(THD
2409   *thd, SAVEPOINT *sv)</code> function in @c TC_LOG and have that
2410   function execute the necessary work to rollback to a savepoint.
2411 
2412   @param thd   The client thread that executes the transaction.
2413   @param all   This is @c true if this is a real transaction rollback, and
2414                @false otherwise.
2415 
2416   @see handlerton::rollback
2417 */
binlog_rollback(handlerton *,THD * thd,bool all)2418 static int binlog_rollback(handlerton *, THD *thd, bool all) {
2419   DBUG_TRACE;
2420   int error = 0;
2421   if (thd->lex->sql_command == SQLCOM_ROLLBACK_TO_SAVEPOINT)
2422     error = mysql_bin_log.rollback(thd, all);
2423   return error;
2424 }
2425 
/**
  Write a rollback record of the transaction to the binary log.

  For binary log group commit, the rollback is separated into three
  parts:

  1. First part consists of filling the necessary caches and
     finalizing them (if they need to be finalized). After a cache is
     finalized, nothing can be added to the cache.

  2. Second part executes an ordered flush and commit. This will be
     done using the group commit functionality in @c ordered_commit.

     Since we roll back the transaction early, we call @c
     ordered_commit with the @c skip_commit flag set. The @c
     ha_commit_low call inside @c ordered_commit will then not be
     called.

  3. Third part checks any errors resulting from the flush and handles
     them appropriately.

  For XA ROLLBACK the engine rollback is deferred: it is executed at the
  @c end label, after the XA rollback has been handled by
  @c do_binlog_xa_commit_rollback.

  @see MYSQL_BIN_LOG::ordered_commit
  @see ha_commit_low
  @see ha_rollback_low

  @param thd Session to commit
  @param all This is @c true if this is a real transaction rollback, and
             @c false otherwise.

  @return Error code, or zero if there was no error. If the
          @c before_commit hook fails, @c RESULT_ABORTED is returned
          directly, without executing the code at the @c end label.
 */

int MYSQL_BIN_LOG::rollback(THD *thd, bool all) {
  int error = 0;
  /* Set when a cache was finalized, i.e. there is something to flush. */
  bool stuff_logged = false;
  binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
  /* Only assigned for XA ROLLBACK: true when the XA trx was not binlogged. */
  bool is_empty = false;

  DBUG_TRACE;
  DBUG_PRINT("enter",
             ("all: %s, cache_mngr: 0x%llx, thd->is_error: %s", YESNO(all),
              (ulonglong)cache_mngr, YESNO(thd->is_error())));
  /*
    Defer XA-transaction rollback until its XA-rollback event is recorded.
    When we are executing a ROLLBACK TO SAVEPOINT, we
    should only clear the caches since this function is called as part
    of the engine rollback.
    In other cases we roll back the transaction in the engines early
    since this will release locks and allow other transactions to
    start executing.
  */
  if (thd->lex->sql_command == SQLCOM_XA_ROLLBACK) {
    XID_STATE *xs = thd->get_transaction()->xid_state();

    DBUG_ASSERT(all || !xs->is_binlogged() ||
                (!xs->is_in_recovery() && thd->is_error()));
    /*
      Whenever cache_mngr is not initialized, the xa prepared
      transaction's binary logging status must not be set, unless the
      transaction is rolled back through an external connection which
      has binlogging switched off.
    */
    DBUG_ASSERT(cache_mngr || !xs->is_binlogged() ||
                !(is_open() && thd->variables.option_bits & OPTION_BIN_LOG));

    is_empty = !xs->is_binlogged();
    if ((error = do_binlog_xa_commit_rollback(thd, xs->get_xid(), false)))
      goto end;
    /* Fetch the cache manager again after the call above. */
    cache_mngr = thd_get_cache_mngr(thd);
  } else if (thd->lex->sql_command != SQLCOM_ROLLBACK_TO_SAVEPOINT)
    if ((error = ha_rollback_low(thd, all))) goto end;

  /*
    If there is no cache manager, or if there is nothing in the
    caches, there are no caches to roll back, so we're trivially done,
    except for XA ROLLBACK, which still has to run ha_rollback_low()
    at the `end' label.
  */
  if (cache_mngr == nullptr || cache_mngr->is_binlog_empty()) {
    goto end;
  }

  DBUG_PRINT("debug", ("all.cannot_safely_rollback(): %s, trx_cache_empty: %s",
                       YESNO(thd->get_transaction()->cannot_safely_rollback(
                           Transaction_ctx::SESSION)),
                       YESNO(cache_mngr->trx_cache.is_binlog_empty())));
  DBUG_PRINT("debug",
             ("stmt.cannot_safely_rollback(): %s, stmt_cache_empty: %s",
              YESNO(thd->get_transaction()->cannot_safely_rollback(
                  Transaction_ctx::STMT)),
              YESNO(cache_mngr->stmt_cache.is_binlog_empty())));

  /*
    If an incident event is set we do not flush the content of the statement
    cache because it may be corrupted.
  */
  if (cache_mngr->stmt_cache.has_incident()) {
    const char *err_msg =
        "The content of the statement cache is corrupted "
        "while writing a rollback record of the transaction "
        "to the binary log.";
    error = write_incident(thd, true /*need_lock_log=true*/, err_msg);
    cache_mngr->stmt_cache.reset();
  } else if (!cache_mngr->stmt_cache.is_binlog_empty()) {
    if (thd->lex->sql_command == SQLCOM_CREATE_TABLE &&
        thd->lex->select_lex->get_fields_list()->elements && /* With select */
        !(thd->lex->create_info->options & HA_LEX_CREATE_TMP_TABLE) &&
        thd->is_current_stmt_binlog_format_row()) {
      /*
        In row based binlog format, we reset the binlog statement cache
        when rolling back a single statement 'CREATE...SELECT' transaction,
        since the 'CREATE TABLE' event was put in the binlog statement cache.
      */
      cache_mngr->stmt_cache.reset();
    } else {
      if ((error = cache_mngr->stmt_cache.finalize(thd))) goto end;
      stuff_logged = true;
    }
  }

  if (ending_trans(thd, all)) {
    if (trans_cannot_safely_rollback(thd)) {
      const char xa_rollback_str[] = "XA ROLLBACK";
      /*
        sizeof(xa_rollback_str) and XID::ser_buf_size both account for a
        terminating `\0', so one of those two bytes is used for the `space'
        char that is needed in the xa case.
      */
      char query[sizeof(xa_rollback_str) + XID::ser_buf_size] = "ROLLBACK";
      XID_STATE *xs = thd->get_transaction()->xid_state();

      if (thd->lex->sql_command == SQLCOM_XA_ROLLBACK) {
        /* this block is relevant only for not prepared yet and "local" xa trx
         */
        DBUG_ASSERT(
            thd->get_transaction()->xid_state()->has_state(XID_STATE::XA_IDLE));
        DBUG_ASSERT(!cache_mngr->has_logged_xid);

        /*
          "XA ROLLBACK" plus the trailing space is exactly
          sizeof(xa_rollback_str) characters, so the serialized XID is
          written right after the space, over the `\0' left by sprintf.
        */
        sprintf(query, "%s ", xa_rollback_str);
        xs->get_xid()->serialize(query + sizeof(xa_rollback_str));
      }
      /*
        If the transaction is being rolled back and contains changes that
        cannot be rolled back, the trx-cache's content is flushed.
      */
      Query_log_event end_evt(thd, query, strlen(query), true, false, true, 0,
                              true);
      error = thd->lex->sql_command != SQLCOM_XA_ROLLBACK
                  ? cache_mngr->trx_cache.finalize(thd, &end_evt)
                  : cache_mngr->trx_cache.finalize(thd, &end_evt, xs);
      stuff_logged = true;
    } else {
      /*
        If the transaction is being rolled back and its changes can be
        rolled back, the trx-cache's content is truncated.
      */
      error = cache_mngr->trx_cache.truncate(thd, all);

      DBUG_EXECUTE_IF("ensure_binlog_cache_is_reset", {
        /* Assert that binlog cache is reset at rollback time. */
        DBUG_ASSERT(binlog_cache_is_reset);
        binlog_cache_is_reset = false;
      };);
    }
  } else {
    /*
      If a statement is being rolled back, it is necessary to know
      exactly why a statement may not be safely rolled back as in
      some specific situations the trx-cache can be truncated.

      If a temporary table is created or dropped, the trx-cache is not
      truncated. Note that if the stmt-cache is used, there is nothing
      to truncate in the trx-cache.

      If a non-transactional table is updated and the binlog format is
      statement, the trx-cache is not truncated. The trx-cache is used
      when the direct option is off and a transactional table has been
      updated before the current statement in the context of the
      current transaction. Note that if the stmt-cache is used there is
      nothing to truncate in the trx-cache.

      If other binlog formats are used, updates to non-transactional
      tables are written to the stmt-cache and trx-cache can be safely
      truncated, if necessary.
    */
    if (thd->get_transaction()->has_dropped_temp_table(Transaction_ctx::STMT) ||
        thd->get_transaction()->has_created_temp_table(Transaction_ctx::STMT) ||
        (thd->get_transaction()->has_modified_non_trans_table(
             Transaction_ctx::STMT) &&
         thd->variables.binlog_format == BINLOG_FORMAT_STMT)) {
      /*
        If the statement is being rolled back and dropped or created a
        temporary table or modified a non-transactional table and the
        statement-based replication is in use, the statement's changes
        in the trx-cache are preserved.
      */
      cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
    } else {
      /*
        Otherwise, the statement's changes in the trx-cache are
        truncated.
      */
      error = cache_mngr->trx_cache.truncate(thd, all);
    }
  }
  if (stuff_logged) {
    Transaction_ctx *trn_ctx = thd->get_transaction();
    trn_ctx->store_commit_parent(
        m_dependency_tracker.get_max_committed_timestamp());
  }

  DBUG_PRINT("debug", ("error: %d", error));
  /* Only go through the ordered flush when a cache was finalized cleanly. */
  if (error == 0 && stuff_logged) {
    if (RUN_HOOK(
            transaction, before_commit,
            (thd, all, thd_get_cache_mngr(thd)->get_trx_cache(),
             thd_get_cache_mngr(thd)->get_stmt_cache(),
             max<my_off_t>(max_binlog_cache_size, max_binlog_stmt_cache_size),
             false))) {
      // Reset the thread OK status before changing the outcome.
      if (thd->get_stmt_da()->is_ok())
        thd->get_stmt_da()->reset_diagnostics_area();
      my_error(ER_RUN_HOOK_ERROR, MYF(0), "before_commit");
      /* Note: this return bypasses the code at the `end' label. */
      return RESULT_ABORTED;
    }
#ifndef DBUG_OFF
    /*
      XA rollback is always accepted.
    */
    if (thd->get_transaction()
            ->get_rpl_transaction_ctx()
            ->is_transaction_rollback())
      DBUG_ASSERT(0);
#endif

    error = ordered_commit(thd, all, /* skip_commit */ true);
  }

  if (check_write_error(thd)) {
    /*
      "all == true" means that a "rollback statement" triggered the error and
      this function was called. However, this must not happen as a rollback
      is written directly to the binary log. And in auto-commit mode, a single
      statement that is rolled back has the flag all == false.
    */
    DBUG_ASSERT(!all);
    /*
      We reach this point if the effect of a statement did not properly get into
      a cache and needs to be rolled back.
    */
    error |= cache_mngr->trx_cache.truncate(thd, all);
  }

end:
  /* Deferred xa rollback to engines */
  if (!error && thd->lex->sql_command == SQLCOM_XA_ROLLBACK) {
    error = ha_rollback_low(thd, all);
    if (!error && !thd->is_error()) {
      /*
        XA-rollback ignores the gtid_state, if the transaction
        is empty.
      */
      if (is_empty && !thd->slave_thread) gtid_state->update_on_rollback(thd);
      /*
        XA-rollback commits the new gtid_state, if transaction
        is not empty.
      */
      else {
        gtid_state->update_on_commit(thd);
        /*
          Inform hook listeners that a XA ROLLBACK did commit, that
          is, did log a transaction to the binary log.
        */
        (void)RUN_HOOK(transaction, after_commit, (thd, all));
      }
    }
  }
  /*
    When a statement errors out in auto-commit mode it is rolled back
    implicitly, so the same should happen to its GTID.
  */
  if (!thd->in_active_multi_stmt_transaction())
    gtid_state->update_on_rollback(thd);

  /*
    TODO: some errors are overwritten, which may cause problems,
    fix it later.
  */
  DBUG_PRINT("return", ("error: %d", error));
  return error;
}
2715 
2716 /**
2717   @note
2718   How do we handle this (unlikely but legal) case:
2719   @verbatim
2720     [transaction] + [update to non-trans table] + [rollback to savepoint] ?
2721   @endverbatim
2722   The problem occurs when a savepoint is before the update to the
2723   non-transactional table. Then when there's a rollback to the savepoint, if we
2724   simply truncate the binlog cache, we lose the part of the binlog cache where
2725   the update is. If we want to not lose it, we need to write the SAVEPOINT
2726   command and the ROLLBACK TO SAVEPOINT command to the binlog cache. The latter
2727   is easy: it's just write at the end of the binlog cache, but the former
2728   should be *inserted* to the place where the user called SAVEPOINT. The
2729   solution is that when the user calls SAVEPOINT, we write it to the binlog
2730   cache (so no need to later insert it). As transactions are never intermixed
2731   in the binary log (i.e. they are serialized), we won't have conflicts with
2732   savepoint names when using mysqlbinlog or in the slave SQL thread.
2733   Then when ROLLBACK TO SAVEPOINT is called, if we updated some
2734   non-transactional table, we don't truncate the binlog cache but instead write
2735   ROLLBACK TO SAVEPOINT to it; otherwise we truncate the binlog cache (which
2736   will chop the SAVEPOINT command from the binlog cache, which is good as in
2737   that case there is no need to have it in the binlog).
2738 */
2739 
binlog_savepoint_set(handlerton *,THD * thd,void * sv)2740 static int binlog_savepoint_set(handlerton *, THD *thd, void *sv) {
2741   DBUG_TRACE;
2742   int error = 1;
2743 
2744   String log_query;
2745   if (log_query.append(STRING_WITH_LEN("SAVEPOINT ")))
2746     return error;
2747   else
2748     append_identifier(thd, &log_query, thd->lex->ident.str,
2749                       thd->lex->ident.length);
2750 
2751   int errcode = query_error_code(thd, thd->killed == THD::NOT_KILLED);
2752   Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(), true,
2753                         false, true, errcode);
2754   /*
2755     We cannot record the position before writing the statement
2756     because a rollback to a savepoint (.e.g. consider it "S") would
2757     prevent the savepoint statement (i.e. "SAVEPOINT S") from being
2758     written to the binary log despite the fact that the server could
2759     still issue other rollback statements to the same savepoint (i.e.
2760     "S").
2761     Given that the savepoint is valid until the server releases it,
2762     ie, until the transaction commits or it is released explicitly,
2763     we need to log it anyway so that we don't have "ROLLBACK TO S"
2764     or "RELEASE S" without the preceding "SAVEPOINT S" in the binary
2765     log.
2766   */
2767   if (!(error = mysql_bin_log.write_event(&qinfo)))
2768     binlog_trans_log_savepos(thd, (my_off_t *)sv);
2769 
2770   return error;
2771 }
2772 
binlog_savepoint_rollback(handlerton *,THD * thd,void * sv)2773 static int binlog_savepoint_rollback(handlerton *, THD *thd, void *sv) {
2774   DBUG_TRACE;
2775   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(thd);
2776   my_off_t pos = *(my_off_t *)sv;
2777   DBUG_ASSERT(pos != ~(my_off_t)0);
2778 
2779   /*
2780     Write ROLLBACK TO SAVEPOINT to the binlog cache if we have updated some
2781     non-transactional table. Otherwise, truncate the binlog cache starting
2782     from the SAVEPOINT command.
2783   */
2784   if (trans_cannot_safely_rollback(thd)) {
2785     String log_query;
2786     if (log_query.append(STRING_WITH_LEN("ROLLBACK TO ")))
2787       return 1;
2788     else {
2789       /*
2790         Before writing identifier to the binlog, make sure to
2791         quote the identifier properly so as to prevent any SQL
2792         injection on the slave.
2793       */
2794       append_identifier(thd, &log_query, thd->lex->ident.str,
2795                         thd->lex->ident.length);
2796     }
2797 
2798     int errcode = query_error_code(thd, thd->killed == THD::NOT_KILLED);
2799     Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(), true,
2800                           false, true, errcode);
2801     return mysql_bin_log.write_event(&qinfo);
2802   }
2803   // Otherwise, we truncate the cache
2804   cache_mngr->trx_cache.restore_savepoint(pos);
2805   /*
2806     When a SAVEPOINT is executed inside a stored function/trigger we force the
2807     pending event to be flushed with a STMT_END_F flag and clear the table maps
2808     as well to ensure that following DMLs will have a clean state to start
2809     with. ROLLBACK inside a stored routine has to finalize possibly existing
2810     current row-based pending event with cleaning up table maps. That ensures
2811     that following DMLs will have a clean state to start with.
2812    */
2813   if (thd->in_sub_stmt) thd->clear_binlog_table_maps();
2814   return 0;
2815 }
2816 
2817 /**
2818    purge logs, master and slave sides both, related error code
2819    convertor.
2820    Called from @c purge_error_message(), @c MYSQL_BIN_LOG::reset_logs()
2821 
2822    @param  res  an error code as used by purging routines
2823 
2824    @return the user level error code ER_*
2825 */
purge_log_get_error_code(int res)2826 static uint purge_log_get_error_code(int res) {
2827   uint errcode = 0;
2828 
2829   switch (res) {
2830     case 0:
2831       break;
2832     case LOG_INFO_EOF:
2833       errcode = ER_UNKNOWN_TARGET_BINLOG;
2834       break;
2835     case LOG_INFO_IO:
2836       errcode = ER_IO_ERR_LOG_INDEX_READ;
2837       break;
2838     case LOG_INFO_INVALID:
2839       errcode = ER_BINLOG_PURGE_PROHIBITED;
2840       break;
2841     case LOG_INFO_SEEK:
2842       errcode = ER_FSEEK_FAIL;
2843       break;
2844     case LOG_INFO_MEM:
2845       errcode = ER_OUT_OF_RESOURCES;
2846       break;
2847     case LOG_INFO_FATAL:
2848       errcode = ER_BINLOG_PURGE_FATAL_ERR;
2849       break;
2850     case LOG_INFO_IN_USE:
2851       errcode = ER_LOG_IN_USE;
2852       break;
2853     case LOG_INFO_EMFILE:
2854       errcode = ER_BINLOG_PURGE_EMFILE;
2855       break;
2856     default:
2857       errcode = ER_LOG_PURGE_UNKNOWN_ERR;
2858       break;
2859   }
2860 
2861   return errcode;
2862 }
2863 
2864 /**
2865   Check whether binlog state allows to safely release MDL locks after
2866   rollback to savepoint.
2867 
2868   @param thd   The client thread that executes the transaction.
2869 
2870   @return true  - It is safe to release MDL locks.
2871           false - If it is not.
2872 */
binlog_savepoint_rollback_can_release_mdl(handlerton *,THD * thd)2873 static bool binlog_savepoint_rollback_can_release_mdl(handlerton *, THD *thd) {
2874   DBUG_TRACE;
2875   /**
2876     If we have not updated any non-transactional tables rollback
2877     to savepoint will simply truncate binlog cache starting from
2878     SAVEPOINT command. So it should be safe to release MDL acquired
2879     after SAVEPOINT command in this case.
2880   */
2881   return !trans_cannot_safely_rollback(thd);
2882 }
2883 
2884 /**
2885   Adjust log offset in the binary log file for all running slaves
2886   This class implements call back function for do_for_all_thd().
2887   It is called for each thd in thd list to adjust offset.
2888 */
2889 class Adjust_offset : public Do_THD_Impl {
2890  public:
Adjust_offset(my_off_t value)2891   Adjust_offset(my_off_t value) : m_purge_offset(value) {}
operator ()(THD * thd)2892   virtual void operator()(THD *thd) {
2893     LOG_INFO *linfo;
2894     mysql_mutex_lock(&thd->LOCK_thd_data);
2895     if ((linfo = thd->current_linfo)) {
2896       /*
2897         Index file offset can be less that purge offset only if
2898         we just started reading the index file. In that case
2899         we have nothing to adjust.
2900       */
2901       if (linfo->index_file_offset < m_purge_offset)
2902         linfo->fatal = (linfo->index_file_offset != 0);
2903       else
2904         linfo->index_file_offset -= m_purge_offset;
2905     }
2906     mysql_mutex_unlock(&thd->LOCK_thd_data);
2907   }
2908 
2909  private:
2910   my_off_t m_purge_offset;
2911 };
2912 
2913 /*
2914   Adjust the position pointer in the binary log file for all running slaves.
2915 
2916   SYNOPSIS
2917     adjust_linfo_offsets()
2918     purge_offset	Number of bytes removed from start of log index file
2919 
2920   NOTES
2921     - This is called when doing a PURGE when we delete lines from the
2922       index log file.
2923 
2924   REQUIREMENTS
2925     - Before calling this function, we have to ensure that no threads are
2926       using any binary log file before purge_offset.
2927 
2928   TODO
2929     - Inform the slave threads that they should sync the position
2930       in the binary log file with flush_relay_log_info.
2931       Now they sync is done for next read.
2932 */
adjust_linfo_offsets(my_off_t purge_offset)2933 static void adjust_linfo_offsets(my_off_t purge_offset) {
2934   Adjust_offset adjust_offset(purge_offset);
2935   Global_THD_manager::get_instance()->do_for_all_thd(&adjust_offset);
2936 }
2937 
2938 /**
2939   This class implements Call back function for do_for_all_thd().
2940   It is called for each thd in thd list to count
2941   threads using bin log file
2942 */
2943 
2944 class Log_in_use : public Do_THD_Impl {
2945  public:
Log_in_use(const char * value)2946   Log_in_use(const char *value) : m_log_name(value), m_count(0) {
2947     m_log_name_len = strlen(m_log_name) + 1;
2948   }
operator ()(THD * thd)2949   virtual void operator()(THD *thd) {
2950     LOG_INFO *linfo;
2951     mysql_mutex_lock(&thd->LOCK_thd_data);
2952     if ((linfo = thd->current_linfo)) {
2953       if (!strncmp(m_log_name, linfo->log_file_name, m_log_name_len)) {
2954         LogErr(WARNING_LEVEL, ER_BINLOG_FILE_BEING_READ_NOT_PURGED, m_log_name,
2955                thd->thread_id());
2956         m_count++;
2957       }
2958     }
2959     mysql_mutex_unlock(&thd->LOCK_thd_data);
2960   }
get_count()2961   int get_count() { return m_count; }
2962 
2963  private:
2964   const char *m_log_name;
2965   size_t m_log_name_len;
2966   int m_count;
2967 };
2968 
log_in_use(const char * log_name)2969 static int log_in_use(const char *log_name) {
2970   Log_in_use log_in_use(log_name);
2971 #ifndef DBUG_OFF
2972   if (current_thd)
2973     DEBUG_SYNC(current_thd, "purge_logs_after_lock_index_before_thread_count");
2974 #endif
2975   Global_THD_manager::get_instance()->do_for_all_thd(&log_in_use);
2976   return log_in_use.get_count();
2977 }
2978 
purge_error_message(THD * thd,int res)2979 static bool purge_error_message(THD *thd, int res) {
2980   uint errcode;
2981 
2982   if ((errcode = purge_log_get_error_code(res)) != 0) {
2983     my_error(errcode, MYF(0));
2984     return true;
2985   }
2986   my_ok(thd);
2987   return false;
2988 }
2989 
is_transaction_empty(THD * thd)2990 bool is_transaction_empty(THD *thd) {
2991   DBUG_TRACE;
2992   int rw_ha_count = check_trx_rw_engines(thd, Transaction_ctx::SESSION);
2993   rw_ha_count += check_trx_rw_engines(thd, Transaction_ctx::STMT);
2994   return rw_ha_count == 0;
2995 }
2996 
check_trx_rw_engines(THD * thd,Transaction_ctx::enum_trx_scope trx_scope)2997 int check_trx_rw_engines(THD *thd, Transaction_ctx::enum_trx_scope trx_scope) {
2998   DBUG_TRACE;
2999 
3000   int rw_ha_count = 0;
3001   Ha_trx_info *ha_list =
3002       (Ha_trx_info *)thd->get_transaction()->ha_trx_info(trx_scope);
3003 
3004   for (Ha_trx_info *ha_info = ha_list; ha_info; ha_info = ha_info->next()) {
3005     if (ha_info->is_trx_read_write()) ++rw_ha_count;
3006   }
3007   return rw_ha_count;
3008 }
3009 
is_empty_transaction_in_binlog_cache(const THD * thd)3010 bool is_empty_transaction_in_binlog_cache(const THD *thd) {
3011   DBUG_TRACE;
3012 
3013   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(thd);
3014   if (cache_mngr != nullptr && cache_mngr->has_empty_transaction()) {
3015     return true;
3016   }
3017 
3018   return false;
3019 }
3020 
3021 /**
3022   This function checks if a transactional table was updated by the
3023   current transaction.
3024 
3025   @param thd The client thread that executed the current statement.
3026   @return
3027     @c true if a transactional table was updated, @c false otherwise.
3028 */
trans_has_updated_trans_table(const THD * thd)3029 bool trans_has_updated_trans_table(const THD *thd) {
3030   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(thd);
3031 
3032   return (cache_mngr ? !cache_mngr->trx_cache.is_binlog_empty() : 0);
3033 }
3034 
3035 /**
3036   This function checks if a transactional table was updated by the
3037   current statement.
3038 
3039   @param ha_list Registered storage engine handler list.
3040   @return
3041     @c true if a transactional table was updated, @c false otherwise.
3042 */
stmt_has_updated_trans_table(Ha_trx_info * ha_list)3043 bool stmt_has_updated_trans_table(Ha_trx_info *ha_list) {
3044   const Ha_trx_info *ha_info;
3045   for (ha_info = ha_list; ha_info; ha_info = ha_info->next()) {
3046     if (ha_info->is_trx_read_write() && ha_info->ht() != binlog_hton)
3047       return (true);
3048   }
3049   return (false);
3050 }
3051 
3052 /**
3053   This function checks if a transaction, either a multi-statement
3054   or a single statement transaction is about to commit or not.
3055 
3056   @param thd The client thread that executed the current statement.
3057   @param all Committing a transaction (i.e. true) or a statement
3058              (i.e. false).
3059   @return
3060     @c true if committing a transaction, otherwise @c false.
3061 */
ending_trans(THD * thd,const bool all)3062 bool ending_trans(THD *thd, const bool all) {
3063   return (all || ending_single_stmt_trans(thd, all));
3064 }
3065 
3066 /**
3067   This function checks if a single statement transaction is about
3068   to commit or not.
3069 
3070   @param thd The client thread that executed the current statement.
3071   @param all Committing a transaction (i.e. true) or a statement
3072              (i.e. false).
3073   @return
3074     @c true if committing a single statement transaction, otherwise
3075     @c false.
3076 */
ending_single_stmt_trans(THD * thd,const bool all)3077 bool ending_single_stmt_trans(THD *thd, const bool all) {
3078   return (!all && !thd->in_multi_stmt_transaction_mode());
3079 }
3080 
3081 /**
3082   This function checks if a transaction cannot be rolled back safely.
3083 
3084   @param thd The client thread that executed the current statement.
3085   @return
3086     @c true if cannot be safely rolled back, @c false otherwise.
3087 */
trans_cannot_safely_rollback(const THD * thd)3088 bool trans_cannot_safely_rollback(const THD *thd) {
3089   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(thd);
3090 
3091   return cache_mngr->trx_cache.cannot_rollback();
3092 }
3093 
3094 /**
3095   This function checks if current statement cannot be rollded back safely.
3096 
3097   @param thd The client thread that executed the current statement.
3098   @return
3099     @c true if cannot be safely rolled back, @c false otherwise.
3100 */
stmt_cannot_safely_rollback(const THD * thd)3101 bool stmt_cannot_safely_rollback(const THD *thd) {
3102   return thd->get_transaction()->cannot_safely_rollback(Transaction_ctx::STMT);
3103 }
3104 
3105 /**
3106   Execute a PURGE BINARY LOGS TO @<log@> command.
3107 
3108   @param thd Pointer to THD object for the client thread executing the
3109   statement.
3110 
3111   @param to_log Name of the last log to purge.
3112 
3113   @retval false success
3114   @retval true failure
3115 */
purge_master_logs(THD * thd,const char * to_log)3116 bool purge_master_logs(THD *thd, const char *to_log) {
3117   char search_file_name[FN_REFLEN];
3118   if (!mysql_bin_log.is_open()) {
3119     my_ok(thd);
3120     return false;
3121   }
3122 
3123   mysql_bin_log.make_log_name(search_file_name, to_log);
3124   return purge_error_message(
3125       thd, mysql_bin_log.purge_logs(
3126                search_file_name, false, true /*need_lock_index=true*/,
3127                true /*need_update_threads=true*/, nullptr, false));
3128 }
3129 
3130 /**
3131   Execute a PURGE BINARY LOGS BEFORE @<date@> command.
3132 
3133   @param thd Pointer to THD object for the client thread executing the
3134   statement.
3135 
3136   @param purge_time Date before which logs should be purged.
3137 
3138   @retval false success
3139   @retval true failure
3140 */
purge_master_logs_before_date(THD * thd,time_t purge_time)3141 bool purge_master_logs_before_date(THD *thd, time_t purge_time) {
3142   if (!mysql_bin_log.is_open()) {
3143     my_ok(thd);
3144     return false;
3145   }
3146   return purge_error_message(
3147       thd, mysql_bin_log.purge_logs_before_date(purge_time, false));
3148 }
3149 
3150 /*
3151   Helper function to get the error code of the query to be binlogged.
3152  */
query_error_code(const THD * thd,bool not_killed)3153 int query_error_code(const THD *thd, bool not_killed) {
3154   int error;
3155 
3156   if (not_killed) {
3157     error = thd->is_error() ? thd->get_stmt_da()->mysql_errno() : 0;
3158 
3159     /* thd->get_stmt_da()->sql_errno() might be ER_SERVER_SHUTDOWN or
3160        ER_QUERY_INTERRUPTED, So here we need to make sure that error
3161        is not set to these errors when specified not_killed by the
3162        caller.
3163     */
3164     if (error == ER_SERVER_SHUTDOWN || error == ER_QUERY_INTERRUPTED) error = 0;
3165   } else
3166     error = thd->killed;
3167 
3168   return error;
3169 }
3170 
3171 /**
3172   Copy content of 'from' file from offset to 'to' file.
3173 
3174   - We do the copy outside of the IO_CACHE as the cache
3175   buffers would just make things slower and more complicated.
3176   In most cases the copy loop should only do one read.
3177 
3178   @param from          File to copy.
3179   @param to            File to copy to.
3180   @param offset        Offset in 'from' file.
3181 
3182 
3183   @retval
3184     0    ok
3185   @retval
3186     -1    error
3187 */
copy_file(IO_CACHE * from,IO_CACHE * to,my_off_t offset)3188 static bool copy_file(IO_CACHE *from, IO_CACHE *to, my_off_t offset) {
3189   int bytes_read;
3190   uchar io_buf[IO_SIZE * 2];
3191   DBUG_TRACE;
3192 
3193   mysql_file_seek(from->file, offset, MY_SEEK_SET, MYF(0));
3194   while (true) {
3195     if ((bytes_read = (int)mysql_file_read(from->file, io_buf, sizeof(io_buf),
3196                                            MYF(MY_WME))) < 0)
3197       goto err;
3198     if (DBUG_EVALUATE_IF("fault_injection_copy_part_file", 1, 0))
3199       bytes_read = bytes_read / 2;
3200     if (!bytes_read) break;  // end of file
3201     if (mysql_file_write(to->file, io_buf, bytes_read, MYF(MY_WME | MY_NABP)))
3202       goto err;
3203   }
3204 
3205   return false;
3206 
3207 err:
3208   return true;
3209 }
3210 
3211 /**
3212    Load data's io cache specific hook to be executed
3213    before a chunk of data is being read into the cache's buffer
3214    The fuction instantianates and writes into the binlog
3215    replication events along LOAD DATA processing.
3216 
3217    @param file  pointer to io-cache
3218    @retval 0 success
3219    @retval 1 failure
3220 */
log_loaded_block(IO_CACHE * file)3221 int log_loaded_block(IO_CACHE *file) {
3222   DBUG_TRACE;
3223   LOAD_FILE_INFO *lf_info;
3224   uint block_len;
3225   /* buffer contains position where we started last read */
3226   uchar *buffer = (uchar *)my_b_get_buffer_start(file);
3227   uint max_event_size = current_thd->variables.max_allowed_packet;
3228   lf_info = (LOAD_FILE_INFO *)file->arg;
3229   if (lf_info->thd->is_current_stmt_binlog_format_row()) return 0;
3230   if (lf_info->last_pos_in_file != HA_POS_ERROR &&
3231       lf_info->last_pos_in_file >= my_b_get_pos_in_file(file))
3232     return 0;
3233 
3234   for (block_len = (uint)(my_b_get_bytes_in_buffer(file)); block_len > 0;
3235        buffer += min(block_len, max_event_size),
3236       block_len -= min(block_len, max_event_size)) {
3237     lf_info->last_pos_in_file = my_b_get_pos_in_file(file);
3238     if (lf_info->logged_data_file) {
3239       Append_block_log_event a(lf_info->thd, lf_info->thd->db().str, buffer,
3240                                min(block_len, max_event_size),
3241                                lf_info->log_delayed);
3242       if (mysql_bin_log.write_event(&a)) return 1;
3243     } else {
3244       Begin_load_query_log_event b(lf_info->thd, lf_info->thd->db().str, buffer,
3245                                    min(block_len, max_event_size),
3246                                    lf_info->log_delayed);
3247       if (mysql_bin_log.write_event(&b)) return 1;
3248       lf_info->logged_data_file = true;
3249     }
3250   }
3251   return 0;
3252 }
3253 
3254 /* Helper function for SHOW BINLOG/RELAYLOG EVENTS */
3255 template <class BINLOG_FILE_READER>
show_binlog_events(THD * thd,MYSQL_BIN_LOG * binary_log)3256 bool show_binlog_events(THD *thd, MYSQL_BIN_LOG *binary_log) {
3257   Protocol *protocol = thd->get_protocol();
3258   List<Item> field_list;
3259   std::string errmsg;
3260   LOG_INFO linfo;
3261 
3262   DBUG_TRACE;
3263 
3264   DBUG_ASSERT(thd->lex->sql_command == SQLCOM_SHOW_BINLOG_EVENTS ||
3265               thd->lex->sql_command == SQLCOM_SHOW_RELAYLOG_EVENTS);
3266 
3267   if (binary_log->is_open()) {
3268     LEX_MASTER_INFO *lex_mi = &thd->lex->mi;
3269     SELECT_LEX_UNIT *unit = thd->lex->unit;
3270     ha_rows event_count, limit_start, limit_end;
3271     my_off_t pos =
3272         max<my_off_t>(BIN_LOG_HEADER_SIZE, lex_mi->pos);  // user-friendly
3273     char search_file_name[FN_REFLEN], *name;
3274     const char *log_file_name = lex_mi->log_file_name;
3275     Log_event *ev = nullptr;
3276 
3277     unit->set_limit(thd, thd->lex->current_select());
3278     limit_start = unit->offset_limit_cnt;
3279     limit_end = unit->select_limit_cnt;
3280 
3281     name = search_file_name;
3282     if (log_file_name)
3283       binary_log->make_log_name(search_file_name, log_file_name);
3284     else
3285       name = nullptr;  // Find first log
3286 
3287     linfo.index_file_offset = 0;
3288 
3289     if (binary_log->find_log_pos(&linfo, name, true /*need_lock_index=true*/)) {
3290       errmsg = "Could not find target log";
3291       goto err;
3292     }
3293 
3294     mysql_mutex_lock(&thd->LOCK_thd_data);
3295     thd->current_linfo = &linfo;
3296     mysql_mutex_unlock(&thd->LOCK_thd_data);
3297 
3298     BINLOG_FILE_READER binlog_file_reader(
3299         opt_master_verify_checksum,
3300         std::max(thd->variables.max_allowed_packet,
3301                  binlog_row_event_max_size + MAX_LOG_EVENT_HEADER));
3302 
3303     if (binlog_file_reader.open(linfo.log_file_name, pos)) {
3304       errmsg = binlog_file_reader.get_error_str();
3305       goto err;
3306     }
3307 
3308     /*
3309       Adjust the pos to the correct starting offset of an event after the
3310       specified position if it is an invalid starting offset.
3311     */
3312     pos = binlog_file_reader.position();
3313 
3314     /*
3315       For 'in-active' binlog file, it is safe to read all events in it. But
3316       for 'active' binlog file, it is only safe to read the events before
3317       get_binlog_end_pos().
3318 
3319       Binlog rotation may happen after calling is_active(). In this case,
3320       end_pos will NOT be set to 0 while the file is actually not 'active'.
3321       It is safe, since 'end_pos' still expresses a correct position.
3322     */
3323     my_off_t end_pos = binary_log->get_binlog_end_pos();
3324     if (!binary_log->is_active(linfo.log_file_name)) end_pos = 0;
3325 
3326     DEBUG_SYNC(thd, "after_show_binlog_event_found_file");
3327 
3328     /**
3329       Relaylog_file_reader and Binlog_file_reader are typedefs to
3330       Basic_binlog_file_reader whereas Relaylog_file_reader uses
3331       a Relaylog_ifile in the template instantiation and
3332       Binlog_file_reader uses a Binlog_ifile in the template
3333       instantiation.
3334 
3335       Binlog_ifile and Relaylog_ifile differ only in the open()
3336       member function and they both derive from Basic_binlog_ifile.
3337 
3338       Therefore, it is OK to cast to Binlog_file_reader here.
3339 
3340       TODO: in the future investigate if some refactoring is needed
3341             here. Perhaps make the Iterator itself templated.
3342      */
3343     binlog::tools::Iterator it(
3344         reinterpret_cast<Binlog_file_reader *>(&binlog_file_reader));
3345 
3346     /*
3347       Unpacked events shall copy their part of the buffer from uncompressed
3348       buffer (the cointainer, i.e., the buffer iterator goes out of scope
3349       once the events are inflated and put in a vector). However, it is
3350       unclear if the *buffer* from which events are deserialized is still
3351       needed for the porposes of displaying events in SHOW BINLOG/RELAYLOG
3352       EVENTS.
3353     */
3354     my_off_t last_log_pos = 0;
3355     for (event_count = 0, ev = it.begin(); ev != it.end();) {
3356       DEBUG_SYNC(thd, "wait_in_show_binlog_events_loop");
3357       if (event_count >= limit_start &&
3358           ev->net_send(protocol, linfo.log_file_name, pos)) {
3359         /* purecov: begin inspected */
3360         errmsg = "Net error";
3361         delete ev;
3362         ev = nullptr;
3363         goto err;
3364         /* purecov: end */
3365       }
3366       last_log_pos = ev->common_header->log_pos;
3367       delete ev;
3368       ev = nullptr;
3369       pos = binlog_file_reader.position();
3370 
3371       if (++event_count == limit_end) break;
3372       if ((ev = it.next()) == it.end()) break;
3373       if (it.has_error()) break;
3374       if (end_pos > 0 && pos >= end_pos &&
3375           (ev->common_header->log_pos != last_log_pos)) {
3376         delete ev;
3377         ev = nullptr;
3378         break;
3379       }
3380     }
3381 
3382     if (binlog_file_reader.has_fatal_error())
3383       errmsg = binlog_file_reader.get_error_str();
3384     else if (it.has_error())
3385       errmsg = it.get_error_message(); /* purecov: inspected */
3386     else
3387       errmsg = "";
3388   }
3389   // Check that linfo is still on the function scope.
3390   DEBUG_SYNC(thd, "after_show_binlog_events");
3391 
3392 err:
3393   if (!errmsg.empty()) {
3394     if (thd->lex->sql_command == SQLCOM_SHOW_RELAYLOG_EVENTS)
3395       my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SHOW RELAYLOG EVENTS",
3396                errmsg.c_str());
3397     else
3398       my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SHOW BINLOG EVENTS",
3399                errmsg.c_str());
3400   } else
3401     my_eof(thd);
3402 
3403   mysql_mutex_lock(&thd->LOCK_thd_data);
3404   thd->current_linfo = nullptr;
3405   mysql_mutex_unlock(&thd->LOCK_thd_data);
3406   return !errmsg.empty();
3407 }
3408 
show_binlog_events(THD * thd,MYSQL_BIN_LOG * binary_log)3409 bool show_binlog_events(THD *thd, MYSQL_BIN_LOG *binary_log) {
3410   if (binary_log->is_relay_log)
3411     return show_binlog_events<Relaylog_file_reader>(thd, binary_log);
3412   return show_binlog_events<Binlog_file_reader>(thd, binary_log);
3413 }
3414 
3415 /**
3416   Execute a SHOW BINLOG EVENTS statement.
3417 
3418   @param thd Pointer to THD object for the client thread executing the
3419   statement.
3420 
3421   @retval false success
3422   @retval true failure
3423 */
mysql_show_binlog_events(THD * thd)3424 bool mysql_show_binlog_events(THD *thd) {
3425   List<Item> field_list;
3426   DBUG_TRACE;
3427 
3428   DBUG_ASSERT(thd->lex->sql_command == SQLCOM_SHOW_BINLOG_EVENTS);
3429 
3430   Log_event::init_show_field_list(&field_list);
3431   if (thd->send_result_metadata(&field_list,
3432                                 Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
3433     return true;
3434 
3435   /*
3436     Wait for handlers to insert any pending information
3437     into the binlog.  For e.g. ndb which updates the binlog asynchronously
3438     this is needed so that the uses sees all its own commands in the binlog
3439   */
3440   ha_binlog_wait(thd);
3441 
3442   return show_binlog_events(thd, &mysql_bin_log);
3443 }
3444 
MYSQL_BIN_LOG(uint * sync_period,bool relay_log)3445 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period, bool relay_log)
3446     : name(nullptr),
3447       write_error(false),
3448       inited(false),
3449       m_binlog_file(new Binlog_ofile()),
3450       m_key_LOCK_log(key_LOG_LOCK_log),
3451       bytes_written(0),
3452       file_id(1),
3453       sync_period_ptr(sync_period),
3454       sync_counter(0),
3455       is_relay_log(relay_log),
3456       checksum_alg_reset(binary_log::BINLOG_CHECKSUM_ALG_UNDEF),
3457       relay_log_checksum_alg(binary_log::BINLOG_CHECKSUM_ALG_UNDEF),
3458       previous_gtid_set_relaylog(nullptr),
3459       is_rotating_caused_by_incident(false) {
3460   /*
3461     We don't want to initialize locks here as such initialization depends on
3462     safe_mutex (when using safe_mutex) which depends on MY_INIT(), which is
3463     called only in main(). Doing initialization here would make it happen
3464     before main().
3465   */
3466   index_file_name[0] = 0;
3467 }
3468 
~MYSQL_BIN_LOG()3469 MYSQL_BIN_LOG::~MYSQL_BIN_LOG() { delete m_binlog_file; }
3470 
3471 /* this is called only once */
3472 
cleanup()3473 void MYSQL_BIN_LOG::cleanup() {
3474   DBUG_TRACE;
3475   if (inited) {
3476     inited = false;
3477     close(LOG_CLOSE_INDEX | LOG_CLOSE_STOP_EVENT, true /*need_lock_log=true*/,
3478           true /*need_lock_index=true*/);
3479     mysql_mutex_destroy(&LOCK_log);
3480     mysql_mutex_destroy(&LOCK_index);
3481     mysql_mutex_destroy(&LOCK_commit);
3482     mysql_mutex_destroy(&LOCK_sync);
3483     mysql_mutex_destroy(&LOCK_binlog_end_pos);
3484     mysql_mutex_destroy(&LOCK_xids);
3485     mysql_cond_destroy(&update_cond);
3486     mysql_cond_destroy(&m_prep_xids_cond);
3487     if (!is_relay_log) {
3488       Commit_stage_manager::get_instance().deinit();
3489     }
3490   }
3491 
3492   delete m_binlog_file;
3493   m_binlog_file = nullptr;
3494 }
3495 
init_pthread_objects()3496 void MYSQL_BIN_LOG::init_pthread_objects() {
3497   DBUG_ASSERT(inited == 0);
3498   inited = true;
3499 
3500   mysql_mutex_init(m_key_LOCK_log, &LOCK_log, MY_MUTEX_INIT_SLOW);
3501   mysql_mutex_init(m_key_LOCK_index, &LOCK_index, MY_MUTEX_INIT_SLOW);
3502   mysql_mutex_init(m_key_LOCK_commit, &LOCK_commit, MY_MUTEX_INIT_FAST);
3503   mysql_mutex_init(m_key_LOCK_sync, &LOCK_sync, MY_MUTEX_INIT_FAST);
3504   mysql_mutex_init(m_key_LOCK_binlog_end_pos, &LOCK_binlog_end_pos,
3505                    MY_MUTEX_INIT_FAST);
3506   mysql_mutex_init(m_key_LOCK_xids, &LOCK_xids, MY_MUTEX_INIT_FAST);
3507   mysql_cond_init(m_key_update_cond, &update_cond);
3508   mysql_cond_init(m_key_prep_xids_cond, &m_prep_xids_cond);
3509   if (!is_relay_log) {
3510     Commit_stage_manager::get_instance().init(
3511         m_key_LOCK_flush_queue, m_key_LOCK_sync_queue, m_key_LOCK_commit_queue,
3512         m_key_LOCK_done, m_key_COND_done);
3513   }
3514 }
3515 
3516 /**
3517   Check if a string is a valid number.
3518 
3519   @param str			String to test
3520   @param res			Store value here
3521   @param allow_wildcards	Set to 1 if we should ignore '%' and '_'
3522 
3523   @note
3524     For the moment the allow_wildcards argument is not used
3525     Should be moved to some other file.
3526 
3527   @retval
3528     1	String is a number
3529   @retval
3530     0	String is not a number
3531 */
3532 
is_number(const char * str,ulong * res,bool allow_wildcards)3533 static bool is_number(const char *str, ulong *res, bool allow_wildcards) {
3534   int flag;
3535   const char *start;
3536   DBUG_TRACE;
3537 
3538   flag = 0;
3539   start = str;
3540   while (*str++ == ' ')
3541     ;
3542   if (*--str == '-' || *str == '+') str++;
3543   while (my_isdigit(files_charset_info, *str) ||
3544          (allow_wildcards && (*str == wild_many || *str == wild_one))) {
3545     flag = 1;
3546     str++;
3547   }
3548   if (*str == '.') {
3549     for (str++; my_isdigit(files_charset_info, *str) ||
3550                 (allow_wildcards && (*str == wild_many || *str == wild_one));
3551          str++, flag = 1)
3552       ;
3553   }
3554   if (*str != 0 || flag == 0) return false;
3555   if (res) *res = atol(start);
3556   return true; /* Number ok */
3557 } /* is_number */
3558 
3559 /**
3560   Find a unique filename for 'filename.#'.
3561 
3562   Set '#' to the highest existing log file extension plus one.
3563 
3564   This function will return nonzero if: (i) the generated name
3565   exceeds FN_REFLEN; (ii) if the number of extensions is exhausted;
3566   or (iii) some other error happened while examining the filesystem.
3567 
3568   @return
3569     nonzero if not possible to get unique filename.
3570 */
3571 
find_uniq_filename(char * name,uint32 new_index_number)3572 static int find_uniq_filename(char *name, uint32 new_index_number) {
3573   uint i;
3574   char buff[FN_REFLEN], ext_buf[FN_REFLEN];
3575   MY_DIR *dir_info = nullptr;
3576   struct fileinfo *file_info;
3577   ulong max_found = 0, next = 0, number = 0;
3578   size_t buf_length, length;
3579   char *start, *end;
3580   int error = 0;
3581   DBUG_TRACE;
3582 
3583   length = dirname_part(buff, name, &buf_length);
3584   start = name + length;
3585   end = strend(start);
3586 
3587   *end = '.';
3588   length = (size_t)(end - start + 1);
3589 
3590   if ((DBUG_EVALUATE_IF(
3591           "error_unique_log_filename", 1,
3592           !(dir_info =
3593                 my_dir(buff, MYF(MY_DONT_SORT)))))) {  // This shouldn't happen
3594     my_stpcpy(end, ".1");                              // use name+1
3595     return 1;
3596   }
3597   file_info = dir_info->dir_entry;
3598   for (i = dir_info->number_off_files; i--; file_info++) {
3599     if (strncmp(file_info->name, start, length) == 0 &&
3600         is_number(file_info->name + length, &number, false)) {
3601       max_found = std::max(max_found, number);
3602     }
3603   }
3604   my_dirend(dir_info);
3605 
3606   /* check if reached the maximum possible extension number */
3607   if (max_found >= MAX_LOG_UNIQUE_FN_EXT) {
3608     LogErr(ERROR_LEVEL, ER_BINLOG_FILE_EXTENSION_NUMBER_EXHAUSTED, max_found);
3609     error = 1;
3610     goto end;
3611   }
3612 
3613   if (new_index_number > 0) {
3614     /*
3615       If "new_index_number" was specified, this means we are handling a
3616       "RESET MASTER TO" command and the binary log was already purged
3617       so max_found should be 0.
3618     */
3619     DBUG_ASSERT(max_found == 0);
3620     next = new_index_number;
3621   } else
3622     next = max_found + 1;
3623   if (sprintf(ext_buf, "%06lu", next) < 0) {
3624     error = 1;
3625     goto end;
3626   }
3627   *end++ = '.';
3628 
3629   /*
3630     Check if the generated extension size + the file name exceeds the
3631     buffer size used. If one did not check this, then the filename might be
3632     truncated, resulting in error.
3633    */
3634   if (((strlen(ext_buf) + (end - name)) >= FN_REFLEN)) {
3635     LogErr(ERROR_LEVEL, ER_BINLOG_FILE_NAME_TOO_LONG, name, ext_buf,
3636            (strlen(ext_buf) + (end - name)));
3637     error = 1;
3638     goto end;
3639   }
3640 
3641   if (sprintf(end, "%06lu", next) < 0) {
3642     error = 1;
3643     goto end;
3644   }
3645 
3646   /* print warning if reaching the end of available extensions. */
3647   if (next > MAX_ALLOWED_FN_EXT_RESET_MASTER)
3648     LogErr(WARNING_LEVEL, ER_BINLOG_FILE_EXTENSION_NUMBER_RUNNING_LOW, next,
3649            (MAX_LOG_UNIQUE_FN_EXT - next));
3650 
3651 end:
3652   return error;
3653 }
3654 
generate_new_name(char * new_name,const char * log_name,uint32 new_index_number)3655 int MYSQL_BIN_LOG::generate_new_name(char *new_name, const char *log_name,
3656                                      uint32 new_index_number) {
3657   fn_format(new_name, log_name, mysql_data_home, "", 4);
3658   if (!fn_ext(log_name)[0]) {
3659     if (find_uniq_filename(new_name, new_index_number)) {
3660       if (current_thd != nullptr)
3661         my_printf_error(ER_NO_UNIQUE_LOGFILE,
3662                         ER_THD(current_thd, ER_NO_UNIQUE_LOGFILE),
3663                         MYF(ME_FATALERROR), log_name);
3664       LogErr(ERROR_LEVEL, ER_FAILED_TO_GENERATE_UNIQUE_LOGFILE, log_name);
3665       return 1;
3666     }
3667   }
3668   return 0;
3669 }
3670 
3671 /**
3672   @todo
3673   The following should be using fn_format();  We just need to
3674   first change fn_format() to cut the file name if it's too long.
3675 */
generate_name(const char * log_name,const char * suffix,char * buff)3676 const char *MYSQL_BIN_LOG::generate_name(const char *log_name,
3677                                          const char *suffix, char *buff) {
3678   if (!log_name || !log_name[0]) {
3679     if (is_relay_log || log_bin_supplied)
3680       strmake(buff, default_logfile_name, FN_REFLEN - strlen(suffix) - 1);
3681     else
3682       strmake(buff, default_binlogfile_name, FN_REFLEN - strlen(suffix) - 1);
3683 
3684     return (const char *)fn_format(buff, buff, "", suffix,
3685                                    MYF(MY_REPLACE_EXT | MY_REPLACE_DIR));
3686   }
3687   // get rid of extension to avoid problems
3688 
3689   const char *p = fn_ext(log_name);
3690   uint length = (uint)(p - log_name);
3691   strmake(buff, log_name, min<size_t>(length, FN_REFLEN - 1));
3692   return (const char *)buff;
3693 }
3694 
init_and_set_log_file_name(const char * log_name,const char * new_name,uint32 new_index_number)3695 bool MYSQL_BIN_LOG::init_and_set_log_file_name(const char *log_name,
3696                                                const char *new_name,
3697                                                uint32 new_index_number) {
3698   if (new_name && !my_stpcpy(log_file_name, new_name))
3699     return true;
3700   else if (!new_name &&
3701            generate_new_name(log_file_name, log_name, new_index_number))
3702     return true;
3703 
3704   return false;
3705 }
3706 
3707 /**
3708   Open the logfile and init IO_CACHE.
3709 
3710   @param log_file_key        The file instrumentation key for this file
3711   @param log_name            The name of the log to open
3712   @param new_name            The new name for the logfile.
3713                              NULL forces generate_new_name() to be called.
3714   @param new_index_number    The binary log file index number to start from
3715                              after the RESET MASTER TO command is called.
3716 
3717   @return true if error, false otherwise.
3718 */
3719 
open(PSI_file_key log_file_key,const char * log_name,const char * new_name,uint32 new_index_number)3720 bool MYSQL_BIN_LOG::open(PSI_file_key log_file_key, const char *log_name,
3721                          const char *new_name, uint32 new_index_number) {
3722   DBUG_TRACE;
3723   bool ret = false;
3724 
3725   write_error = false;
3726   myf flags = MY_WME | MY_NABP | MY_WAIT_IF_FULL;
3727   if (is_relay_log) flags = flags | MY_REPORT_WAITING_IF_FULL;
3728 
3729   if (!(name = my_strdup(key_memory_MYSQL_LOG_name, log_name, MYF(MY_WME)))) {
3730     goto err;
3731   }
3732 
3733   if (init_and_set_log_file_name(name, new_name, new_index_number) ||
3734       DBUG_EVALUATE_IF("fault_injection_init_name", 1, 0))
3735     goto err;
3736 
3737   db[0] = 0;
3738 
3739   /* Keep the key for reopen */
3740   m_log_file_key = log_file_key;
3741 
3742   /*
3743     LOCK_sync guarantees that no thread is calling m_binlog_file to sync data
3744     to disk when another thread is opening the new file
3745     (FLUSH LOG or RESET MASTER).
3746   */
3747   if (!is_relay_log) mysql_mutex_lock(&LOCK_sync);
3748 
3749   ret = m_binlog_file->open(log_file_key, log_file_name, flags);
3750 
3751   if (!is_relay_log) mysql_mutex_unlock(&LOCK_sync);
3752 
3753   if (ret) goto err;
3754 
3755   atomic_log_state = LOG_OPENED;
3756   return false;
3757 
3758 err:
3759   if (binlog_error_action == ABORT_SERVER) {
3760     exec_binlog_error_action_abort(
3761         "Either disk is full, file system is read only or "
3762         "there was an encryption error while opening the binlog. "
3763         "Aborting the server.");
3764   } else
3765     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_OPEN_FOR_LOGGING, log_name, errno);
3766 
3767   my_free(name);
3768   name = nullptr;
3769   atomic_log_state = LOG_CLOSED;
3770   return true;
3771 }
3772 
open_index_file(const char * index_file_name_arg,const char * log_name,bool need_lock_index)3773 bool MYSQL_BIN_LOG::open_index_file(const char *index_file_name_arg,
3774                                     const char *log_name,
3775                                     bool need_lock_index) {
3776   bool error = false;
3777   File index_file_nr = -1;
3778   if (need_lock_index)
3779     mysql_mutex_lock(&LOCK_index);
3780   else
3781     mysql_mutex_assert_owner(&LOCK_index);
3782 
3783   /*
3784     First open of this class instance
3785     Create an index file that will hold all file names uses for logging.
3786     Add new entries to the end of it.
3787   */
3788   myf opt = MY_UNPACK_FILENAME;
3789 
3790   if (my_b_inited(&index_file)) goto end;
3791 
3792   if (!index_file_name_arg) {
3793     index_file_name_arg = log_name;  // Use same basename for index file
3794     opt = MY_UNPACK_FILENAME | MY_REPLACE_EXT;
3795   }
3796   fn_format(index_file_name, index_file_name_arg, mysql_data_home, ".index",
3797             opt);
3798 
3799   if (set_crash_safe_index_file_name(index_file_name_arg)) {
3800     error = true;
3801     goto end;
3802   }
3803 
3804   /*
3805     We need move crash_safe_index_file to index_file if the index_file
3806     does not exist and crash_safe_index_file exists when mysqld server
3807     restarts.
3808   */
3809   if (my_access(index_file_name, F_OK) &&
3810       !my_access(crash_safe_index_file_name, F_OK) &&
3811       my_rename(crash_safe_index_file_name, index_file_name, MYF(MY_WME))) {
3812     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_MOVE_TMP_TO_INDEX,
3813            "MYSQL_BIN_LOG::open_index_file");
3814     error = true;
3815     goto end;
3816   }
3817 
3818   if ((index_file_nr = mysql_file_open(m_key_file_log_index, index_file_name,
3819                                        O_RDWR | O_CREAT, MYF(MY_WME))) < 0 ||
3820       mysql_file_sync(index_file_nr, MYF(MY_WME)) ||
3821       init_io_cache_ext(&index_file, index_file_nr, IO_SIZE, READ_CACHE,
3822                         mysql_file_seek(index_file_nr, 0L, MY_SEEK_END, MYF(0)),
3823                         false, MYF(MY_WME | MY_WAIT_IF_FULL),
3824                         m_key_file_log_index_cache) ||
3825       DBUG_EVALUATE_IF("fault_injection_openning_index", 1, 0)) {
3826     /*
3827       TODO: all operations creating/deleting the index file or a log, should
3828       call my_sync_dir() or my_sync_dir_by_file() to be durable.
3829       TODO: file creation should be done with mysql_file_create()
3830       not mysql_file_open().
3831     */
3832     if (index_file_nr >= 0) mysql_file_close(index_file_nr, MYF(0));
3833     error = true;
3834     goto end;
3835   }
3836 
3837   /*
3838     Sync the index by purging any binary log file that is not registered.
3839     In other words, either purge binary log files that were removed from
3840     the index but not purged from the file system due to a crash or purge
3841     any binary log file that was created but not register in the index
3842     due to a crash.
3843   */
3844 
3845   if (set_purge_index_file_name(index_file_name_arg) ||
3846       open_purge_index_file(false) ||
3847       purge_index_entry(nullptr, nullptr, false) || close_purge_index_file() ||
3848       DBUG_EVALUATE_IF("fault_injection_recovering_index", 1, 0)) {
3849     LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_SYNC_INDEX_FILE);
3850     error = true;
3851     goto end;
3852   }
3853 
3854 end:
3855   if (need_lock_index) mysql_mutex_unlock(&LOCK_index);
3856   return error;
3857 }
3858 
3859 /**
3860   Add the GTIDs from the given relaylog file and also
3861   update the IO thread transaction parser.
3862 
3863   @param filename Relaylog file to read from.
3864   @param retrieved_gtids Gtid_set to store the GTIDs found on the relaylog file.
3865   @param verify_checksum Set to true to verify event checksums.
3866   @param trx_parser The transaction boundary parser to be used in order to
3867   only add a GTID to the gtid_set after ensuring the transaction is fully
3868   stored on the relay log.
3869   @param partial_trx The trx_monitoring_info of the last incomplete transaction
3870   found in the relay log.
3871 
3872   @retval false The file was successfully read and all GTIDs from
3873   Previous_gtids and Gtid_log_event from complete transactions were added to
3874   the retrieved_set.
3875   @retval true There was an error during the procedure.
3876 */
read_gtids_and_update_trx_parser_from_relaylog(const char * filename,Gtid_set * retrieved_gtids,bool verify_checksum,Transaction_boundary_parser * trx_parser,Gtid_monitoring_info * partial_trx)3877 static bool read_gtids_and_update_trx_parser_from_relaylog(
3878     const char *filename, Gtid_set *retrieved_gtids, bool verify_checksum,
3879     Transaction_boundary_parser *trx_parser,
3880     Gtid_monitoring_info *partial_trx) {
3881   DBUG_TRACE;
3882   DBUG_PRINT("info", ("Opening file %s", filename));
3883 
3884   DBUG_ASSERT(retrieved_gtids != nullptr);
3885   DBUG_ASSERT(trx_parser != nullptr);
3886 #ifndef DBUG_OFF
3887   unsigned long event_counter = 0;
3888 #endif
3889   bool error = false;
3890 
3891   Relaylog_file_reader relaylog_file_reader(verify_checksum);
3892   if (relaylog_file_reader.open(filename)) {
3893     LogErr(ERROR_LEVEL, ER_BINLOG_FILE_OPEN_FAILED,
3894            relaylog_file_reader.get_error_str());
3895 
3896     /*
3897       As read_gtids_from_binlog() will not throw error on truncated
3898       relaylog files, we should do the same here in order to keep the
3899       current behavior.
3900     */
3901     if (relaylog_file_reader.get_error_type() ==
3902         Binlog_read_error::CANNOT_GET_FILE_PASSWORD)
3903       error = true;
3904     return error;
3905   }
3906 
3907   Log_event *ev = nullptr;
3908   bool seen_prev_gtids = false;
3909   ulong data_len = 0;
3910 
3911   while (!error && (ev = relaylog_file_reader.read_event_object()) != nullptr) {
3912     DBUG_PRINT("info", ("Read event of type %s", ev->get_type_str()));
3913 #ifndef DBUG_OFF
3914     event_counter++;
3915 #endif
3916 
3917     data_len = uint4korr(ev->temp_buf + EVENT_LEN_OFFSET);
3918 
3919     bool info_error{false};
3920     binary_log::Log_event_basic_info log_event_info;
3921     std::tie(info_error, log_event_info) = extract_log_event_basic_info(
3922         ev->temp_buf, data_len,
3923         relaylog_file_reader.format_description_event());
3924 
3925     if (info_error || trx_parser->feed_event(log_event_info, false)) {
3926       /*
3927         The transaction boundary parser found an error while parsing a
3928         sequence of events from the relaylog. As we don't know if the
3929         parsing has started from a reliable point (it might started in
3930         a relay log file that begins with the rest of a transaction
3931         that started in a previous relay log file), it is better to do
3932         nothing in this case. The boundary parser will fix itself once
3933         finding an event that represent a transaction boundary.
3934 
3935         Suppose the following relaylog:
3936 
3937          rl-bin.000011 | rl-bin.000012 | rl-bin.000013 | rl-bin-000014
3938         ---------------+---------------+---------------+---------------
3939          PREV_GTIDS    | PREV_GTIDS    | PREV_GTIDS    | PREV_GTIDS
3940          (empty)       | (UUID:1-2)    | (UUID:1-2)    | (UUID:1-2)
3941         ---------------+---------------+---------------+---------------
3942          XID           | QUERY(INSERT) | QUERY(INSERT) | XID
3943         ---------------+---------------+---------------+---------------
3944          GTID(UUID:2)  |
3945         ---------------+
3946          QUERY(CREATE  |
3947          TABLE t1 ...) |
3948         ---------------+
3949          GTID(UUID:3)  |
3950         ---------------+
3951          QUERY(BEGIN)  |
3952         ---------------+
3953 
3954         As it is impossible to determine the current Retrieved_Gtid_Set by only
3955         looking to the PREVIOUS_GTIDS on the last relay log file, and scanning
3956         events on it, we tried to find a relay log file that contains at least
3957         one GTID event during the backwards search.
3958 
3959         In the example, we will find a GTID only in rl-bin.000011, as the
3960         UUID:3 transaction was spanned across 4 relay log files.
3961 
3962         The transaction spanning can be caused by "FLUSH RELAY LOGS" commands
3963         on slave while it is queuing the transaction.
3964 
3965         So, in order to correctly add UUID:3 into Retrieved_Gtid_Set, we need
3966         to parse the relay log starting on the file we found the last GTID
3967         queued to know if the transaction was fully retrieved or not.
3968 
3969         Start scanning rl-bin.000011 after resetting the transaction parser
3970         will generate an error, as XID event is only expected inside a DML,
3971         but in this case, we can ignore this error and reset the parser.
3972       */
3973       trx_parser->reset();
3974       /*
3975         We also have to discard the GTID of the partial transaction that was
3976         not finished if there is one. This is needed supposing that an
3977         incomplete transaction was replicated with a GTID.
3978 
3979         GTID(1), QUERY(BEGIN), QUERY(INSERT), ANONYMOUS_GTID, QUERY(DROP ...)
3980 
3981         In the example above, without cleaning the partial_trx,
3982         the GTID(1) would be added to the Retrieved_Gtid_Set after the
3983         QUERY(DROP ...) event.
3984 
3985         GTID(1), QUERY(BEGIN), QUERY(INSERT), GTID(2), QUERY(DROP ...)
3986 
3987         In the example above the GTID(1) will also be discarded as the
3988         GTID(1) transaction is not complete.
3989       */
3990       if (partial_trx->is_processing_trx_set()) {
3991         DBUG_PRINT("info", ("Discarding Gtid(%d, %lld) as the transaction "
3992                             "wasn't complete and we found an error in the"
3993                             "transaction boundary parser.",
3994                             partial_trx->get_processing_trx_gtid()->sidno,
3995                             partial_trx->get_processing_trx_gtid()->gno));
3996         partial_trx->clear_processing_trx();
3997       }
3998     }
3999 
4000     switch (ev->get_type_code()) {
4001       case binary_log::FORMAT_DESCRIPTION_EVENT:
4002       case binary_log::ROTATE_EVENT:
4003         // do nothing; just accept this event and go to next
4004         break;
4005       case binary_log::PREVIOUS_GTIDS_LOG_EVENT: {
4006         seen_prev_gtids = true;
4007         // add events to sets
4008         Previous_gtids_log_event *prev_gtids_ev =
4009             (Previous_gtids_log_event *)ev;
4010         if (prev_gtids_ev->add_to_set(retrieved_gtids) != 0) {
4011           error = true;
4012           break;
4013         }
4014 #ifndef DBUG_OFF
4015         char *prev_buffer = prev_gtids_ev->get_str(nullptr, nullptr);
4016         DBUG_PRINT("info", ("Got Previous_gtids from file '%s': Gtid_set='%s'.",
4017                             filename, prev_buffer));
4018         my_free(prev_buffer);
4019 #endif
4020         break;
4021       }
4022       case binary_log::GTID_LOG_EVENT: {
4023         /* If we didn't find any PREVIOUS_GTIDS in this file */
4024         if (!seen_prev_gtids) {
4025           my_error(ER_BINLOG_LOGICAL_CORRUPTION, MYF(0), filename,
4026                    "The first global transaction identifier was read, but "
4027                    "no other information regarding identifiers existing "
4028                    "on the previous log files was found.");
4029           error = true;
4030           break;
4031         }
4032 
4033         Gtid_log_event *gtid_ev = (Gtid_log_event *)ev;
4034         rpl_sidno sidno = gtid_ev->get_sidno(retrieved_gtids->get_sid_map());
4035         ulonglong immediate_commit_timestamp =
4036             gtid_ev->immediate_commit_timestamp;
4037         longlong original_commit_timestamp = gtid_ev->original_commit_timestamp;
4038 
4039         if (sidno < 0) {
4040           error = true;
4041           break;
4042         } else {
4043           if (retrieved_gtids->ensure_sidno(sidno) != RETURN_STATUS_OK) {
4044             error = true;
4045             break;
4046           } else {
4047             Gtid gtid = {sidno, gtid_ev->get_gno()};
4048             /*
              As we are updating the transaction boundary parser while reading
4050               GTIDs from relay log files to fill the Retrieved_Gtid_Set, we
4051               should not add the GTID here as we don't know if the transaction
4052               is complete on the relay log yet.
4053             */
4054             partial_trx->start(gtid, original_commit_timestamp,
4055                                immediate_commit_timestamp);
4056           }
4057           DBUG_PRINT("info",
4058                      ("Found Gtid in relaylog file '%s': Gtid(%d, %lld).",
4059                       filename, sidno, gtid_ev->get_gno()));
4060         }
4061         break;
4062       }
4063       case binary_log::ANONYMOUS_GTID_LOG_EVENT:
4064       default:
4065         /*
          If we reached the end of a transaction after storing its GTID
4067           in partial_trx structure, it is time to add this GTID to the
4068           retrieved_gtids set because the transaction is complete and there is
4069           no need for asking this transaction again.
4070         */
4071         if (trx_parser->is_not_inside_transaction()) {
4072           if (partial_trx->is_processing_trx_set()) {
4073             const Gtid *fully_retrieved_gtid;
4074             fully_retrieved_gtid = partial_trx->get_processing_trx_gtid();
4075             DBUG_PRINT("info", ("Adding Gtid to Retrieved_Gtid_Set as the "
4076                                 "transaction was completed at "
4077                                 "relaylog file '%s': Gtid(%d, %lld).",
4078                                 filename, fully_retrieved_gtid->sidno,
4079                                 fully_retrieved_gtid->gno));
4080             retrieved_gtids->_add_gtid(*fully_retrieved_gtid);
4081             /*
4082              We don't need to update the last queued structure here. We just
4083              want to have the information about the partial transaction left in
4084              the relay log.
4085             */
4086             partial_trx->clear();
4087           }
4088         }
4089         break;
4090     }
4091     delete ev;
4092   }
4093 
4094   if (relaylog_file_reader.has_fatal_error()) {
4095     // This is not a fatal error; the log may just be truncated.
4096     // @todo but what other errors could happen? IO error?
4097     LogErr(WARNING_LEVEL, ER_BINLOG_ERROR_READING_GTIDS_FROM_RELAY_LOG, -1);
4098   }
4099 
4100 #ifndef DBUG_OFF
4101   LogErr(INFORMATION_LEVEL, ER_BINLOG_EVENTS_READ_FROM_RELAY_LOG_INFO,
4102          event_counter, filename);
4103 #endif
4104 
4105   return error;
4106 }
4107 
/**
  Result of scanning a single log file with read_gtids_from_binlog().
*/
enum enum_read_gtids_from_binlog_status {
  /// At least one (possibly anonymous, for relay logs) GTID event was found.
  GOT_GTIDS,
  /// A Previous_gtids_log_event was read, but no GTID event was seen before
  /// the scan stopped.
  GOT_PREVIOUS_GTIDS,
  /// Initial state of the scan: no GTID related event was found in the file.
  NO_GTIDS,
  /// The scan failed (e.g. a GTID could not be stored, or a Gtid_log_event
  /// was found without a preceding Previous_gtids_log_event).
  ERROR,
  /// The file could not be opened by the reader, for a reason other than a
  /// failure to obtain the file password.
  TRUNCATED
};
4115 /**
4116   Reads GTIDs from the given binlog file.
4117 
4118   @param filename File to read from.
4119   @param all_gtids If not NULL, then the GTIDs from the
4120   Previous_gtids_log_event and from all Gtid_log_events are stored in
4121   this object.
4122   @param prev_gtids If not NULL, then the GTIDs from the
4123   Previous_gtids_log_events are stored in this object.
4124   @param first_gtid If not NULL, then the first GTID information from the
4125   file will be stored in this object.
4126   @param sid_map The sid_map object to use in the rpl_sidno generation
4127   of the Gtid_log_event. If lock is needed in the sid_map, the caller
4128   must hold it.
4129   @param verify_checksum Set to true to verify event checksums.
4130   @param is_relay_log Set to true, if filename is a Relay Log, false if it is a
4131   Binary Log.
4132   @retval GOT_GTIDS The file was successfully read and it contains
4133   both Gtid_log_events and Previous_gtids_log_events.
4134   This is only possible if either all_gtids or first_gtid are not null.
4135   @retval GOT_PREVIOUS_GTIDS The file was successfully read and it
4136   contains Previous_gtids_log_events but no Gtid_log_events.
4137   For binary logs, if no all_gtids and no first_gtid are specified,
4138   this function will be done right after reading the PREVIOUS_GTIDS
4139   regardless of the rest of the content of the binary log file.
4140   @retval NO_GTIDS The file was successfully read and it does not
4141   contain GTID events.
4142   @retval ERROR Out of memory, or IO error, or malformed event
4143   structure, or the file is malformed (e.g., contains Gtid_log_events
4144   but no Previous_gtids_log_event).
4145   @retval TRUNCATED The file was truncated before the end of the
4146   first Previous_gtids_log_event.
4147 */
read_gtids_from_binlog(const char * filename,Gtid_set * all_gtids,Gtid_set * prev_gtids,Gtid * first_gtid,Sid_map * sid_map,bool verify_checksum,bool is_relay_log)4148 static enum_read_gtids_from_binlog_status read_gtids_from_binlog(
4149     const char *filename, Gtid_set *all_gtids, Gtid_set *prev_gtids,
4150     Gtid *first_gtid, Sid_map *sid_map, bool verify_checksum,
4151     bool is_relay_log) {
4152   DBUG_TRACE;
4153   DBUG_PRINT("info", ("Opening file %s", filename));
4154 
4155 #ifndef DBUG_OFF
4156   unsigned long event_counter = 0;
4157   /*
4158     We assert here that both all_gtids and prev_gtids, if specified,
4159     uses the same sid_map as the one passed as a parameter. This is just
4160     to ensure that, if the sid_map needed some lock and was locked by
4161     the caller, the lock applies to all the GTID sets this function is
4162     dealing with.
4163   */
4164   if (all_gtids) DBUG_ASSERT(all_gtids->get_sid_map() == sid_map);
4165   if (prev_gtids) DBUG_ASSERT(prev_gtids->get_sid_map() == sid_map);
4166 #endif
4167 
4168   Binlog_file_reader binlog_file_reader(verify_checksum);
4169   if (binlog_file_reader.open(filename)) {
4170     LogErr(ERROR_LEVEL, ER_BINLOG_FILE_OPEN_FAILED,
4171            binlog_file_reader.get_error_str());
4172     /*
4173       We need to revisit the recovery procedure for relay log
4174       files. Currently, it is called after this routine.
4175       /Alfranio
4176     */
4177     if (binlog_file_reader.get_error_type() ==
4178         Binlog_read_error::CANNOT_GET_FILE_PASSWORD)
4179       return ERROR;
4180     return TRUNCATED;
4181   }
4182 
4183   Log_event *ev = nullptr;
4184   enum_read_gtids_from_binlog_status ret = NO_GTIDS;
4185   bool done = false;
4186   bool seen_first_gtid = false;
4187   while (!done && (ev = binlog_file_reader.read_event_object()) != nullptr) {
4188 #ifndef DBUG_OFF
4189     event_counter++;
4190 #endif
4191     DBUG_PRINT("info", ("Read event of type %s", ev->get_type_str()));
4192     switch (ev->get_type_code()) {
4193       case binary_log::FORMAT_DESCRIPTION_EVENT:
4194       case binary_log::ROTATE_EVENT:
4195         // do nothing; just accept this event and go to next
4196         break;
4197       case binary_log::PREVIOUS_GTIDS_LOG_EVENT: {
4198         ret = GOT_PREVIOUS_GTIDS;
4199         // add events to sets
4200         Previous_gtids_log_event *prev_gtids_ev =
4201             (Previous_gtids_log_event *)ev;
4202         if (all_gtids != nullptr && prev_gtids_ev->add_to_set(all_gtids) != 0)
4203           ret = ERROR, done = true;
4204         else if (prev_gtids != nullptr &&
4205                  prev_gtids_ev->add_to_set(prev_gtids) != 0)
4206           ret = ERROR, done = true;
4207 #ifndef DBUG_OFF
4208         char *prev_buffer = prev_gtids_ev->get_str(nullptr, nullptr);
4209         DBUG_PRINT("info", ("Got Previous_gtids from file '%s': Gtid_set='%s'.",
4210                             filename, prev_buffer));
4211         my_free(prev_buffer);
4212 #endif
4213         /*
4214           If this is not a relay log, the previous_gtids were asked and no
4215           all_gtids neither first_gtid were asked, it is fine to consider the
4216           job as done.
4217         */
4218         if (!is_relay_log && prev_gtids != nullptr && all_gtids == nullptr &&
4219             first_gtid == nullptr)
4220           done = true;
4221         DBUG_EXECUTE_IF("inject_fault_bug16502579", {
4222           DBUG_PRINT("debug", ("PREVIOUS_GTIDS_LOG_EVENT found. "
4223                                "Injected ret=NO_GTIDS."));
4224           if (ret == GOT_PREVIOUS_GTIDS) {
4225             ret = NO_GTIDS;
4226             done = false;
4227           }
4228         });
4229         break;
4230       }
4231       case binary_log::GTID_LOG_EVENT: {
4232         if (ret != GOT_GTIDS) {
4233           if (ret != GOT_PREVIOUS_GTIDS) {
4234             /*
4235               Since this routine is run on startup, there may not be a
4236               THD instance. Therefore, ER(X) cannot be used.
4237              */
4238             const char *msg_fmt =
4239                 (current_thd != nullptr)
4240                     ? ER_THD(current_thd, ER_BINLOG_LOGICAL_CORRUPTION)
4241                     : ER_DEFAULT(ER_BINLOG_LOGICAL_CORRUPTION);
4242             my_printf_error(
4243                 ER_BINLOG_LOGICAL_CORRUPTION, msg_fmt, MYF(0), filename,
4244                 "The first global transaction identifier was read, but "
4245                 "no other information regarding identifiers existing "
4246                 "on the previous log files was found.");
4247             ret = ERROR, done = true;
4248             break;
4249           } else
4250             ret = GOT_GTIDS;
4251         }
4252         /*
4253           When this is a relaylog, we just check if the relay log contains at
4254           least one Gtid_log_event, so that we can distinguish the return values
4255           GOT_GTID and GOT_PREVIOUS_GTIDS. We don't need to read anything else
4256           from the relay log.
4257           When this is a binary log, if all_gtids is requested (i.e., NOT NULL),
4258           we should continue to read all gtids. If just first_gtid was
4259           requested, we will be done after storing this Gtid_log_event info on
4260           it.
4261         */
4262         if (is_relay_log) {
4263           ret = GOT_GTIDS, done = true;
4264         } else {
4265           Gtid_log_event *gtid_ev = (Gtid_log_event *)ev;
4266           rpl_sidno sidno = gtid_ev->get_sidno(sid_map);
4267           if (sidno < 0)
4268             ret = ERROR, done = true;
4269           else {
4270             if (all_gtids) {
4271               if (all_gtids->ensure_sidno(sidno) != RETURN_STATUS_OK)
4272                 ret = ERROR, done = true;
4273               all_gtids->_add_gtid(sidno, gtid_ev->get_gno());
4274               DBUG_PRINT("info", ("Got Gtid from file '%s': Gtid(%d, %lld).",
4275                                   filename, sidno, gtid_ev->get_gno()));
4276             }
4277 
4278             /* If the first GTID was requested, stores it */
4279             if (first_gtid && !seen_first_gtid) {
4280               first_gtid->set(sidno, gtid_ev->get_gno());
4281               seen_first_gtid = true;
4282               /* If the first_gtid was the only thing requested, we are done */
4283               if (all_gtids == nullptr) ret = GOT_GTIDS, done = true;
4284             }
4285           }
4286         }
4287         break;
4288       }
4289       case binary_log::ANONYMOUS_GTID_LOG_EVENT: {
4290         /*
4291           When this is a relaylog, we just check if it contains
4292           at least one Anonymous_gtid_log_event after initialization
4293           (FDs, Rotates and PREVIOUS_GTIDS), so that we can distinguish the
4294           return values GOT_GTID and GOT_PREVIOUS_GTIDS.
4295           We don't need to read anything else from the relay log.
4296         */
4297         if (is_relay_log) {
4298           ret = GOT_GTIDS;
4299           done = true;
4300           break;
4301         }
4302         DBUG_ASSERT(prev_gtids == nullptr
4303                         ? true
4304                         : all_gtids != nullptr || first_gtid != nullptr);
4305       }
4306       // Fall through.
4307       default:
4308         // if we found any other event type without finding a
4309         // previous_gtids_log_event, then the rest of this binlog
4310         // cannot contain gtids
4311         if (ret != GOT_GTIDS && ret != GOT_PREVIOUS_GTIDS) done = true;
4312         /*
4313           The GTIDs of the relaylog files will be handled later
4314           because of the possibility of transactions be spanned
4315           along distinct relaylog files.
4316           So, if we found an ordinary event without finding the
4317           GTID but we already found the PREVIOUS_GTIDS, this probably
4318           means that the event is from a transaction that started on
4319           previous relaylog file.
4320         */
4321         if (ret == GOT_PREVIOUS_GTIDS && is_relay_log) done = true;
4322         break;
4323     }
4324     delete ev;
4325     DBUG_PRINT("info", ("done=%d", done));
4326   }
4327 
4328   if (binlog_file_reader.has_fatal_error()) {
4329     // This is not a fatal error; the log may just be truncated.
4330 
4331     // @todo but what other errors could happen? IO error?
4332     LogErr(WARNING_LEVEL, ER_BINLOG_ERROR_READING_GTIDS_FROM_BINARY_LOG, -1);
4333   }
4334 
4335   if (all_gtids)
4336     all_gtids->dbug_print("all_gtids");
4337   else
4338     DBUG_PRINT("info", ("all_gtids==NULL"));
4339   if (prev_gtids)
4340     prev_gtids->dbug_print("prev_gtids");
4341   else
4342     DBUG_PRINT("info", ("prev_gtids==NULL"));
4343   if (first_gtid == nullptr)
4344     DBUG_PRINT("info", ("first_gtid==NULL"));
4345   else if (first_gtid->sidno == 0)
4346     DBUG_PRINT("info", ("first_gtid.sidno==0"));
4347   else
4348     first_gtid->dbug_print(sid_map, "first_gtid");
4349 
4350   DBUG_PRINT("info", ("returning %d", ret));
4351 #ifndef DBUG_OFF
4352   if (!is_relay_log && prev_gtids != nullptr && all_gtids == nullptr &&
4353       first_gtid == nullptr)
4354     LogErr(INFORMATION_LEVEL, ER_BINLOG_EVENTS_READ_FROM_BINLOG_INFO,
4355            event_counter, filename);
4356 #endif
4357   return ret;
4358 }
4359 
find_first_log_not_in_gtid_set(char * binlog_file_name,const Gtid_set * gtid_set,Gtid * first_gtid,const char ** errmsg)4360 bool MYSQL_BIN_LOG::find_first_log_not_in_gtid_set(char *binlog_file_name,
4361                                                    const Gtid_set *gtid_set,
4362                                                    Gtid *first_gtid,
4363                                                    const char **errmsg) {
4364   DBUG_TRACE;
4365   LOG_INFO linfo;
4366   auto log_index = this->get_log_index();
4367   std::list<std::string> filename_list = log_index.second;
4368   int error = log_index.first;
4369   list<string>::reverse_iterator rit;
4370   Gtid_set binlog_previous_gtid_set{gtid_set->get_sid_map()};
4371 
4372   if (error != LOG_INFO_EOF) {
4373     *errmsg =
4374         "Failed to read the binary log index file while "
4375         "looking for the oldest binary log that contains any GTID "
4376         "that is not in the given gtid set";
4377     error = -1;
4378     goto end;
4379   }
4380 
4381   if (filename_list.empty()) {
4382     *errmsg =
4383         "Could not find first log file name in binary log index file "
4384         "while looking for the oldest binary log that contains any GTID "
4385         "that is not in the given gtid set";
4386     error = -2;
4387     goto end;
4388   }
4389 
4390   /*
4391     Iterate over all the binary logs in reverse order, and read only
4392     the Previous_gtids_log_event, to find the first one, that is the
4393     subset of the given gtid set. Since every binary log begins with
4394     a Previous_gtids_log_event, that contains all GTIDs in all
4395     previous binary logs.
4396     We also ask for the first GTID in the binary log to know if we
4397     should send the FD event with the "created" field cleared or not.
4398   */
4399   DBUG_PRINT("info", ("Iterating backwards through binary logs, and reading "
4400                       "only the Previous_gtids_log_event, to find the first "
4401                       "one, that is the subset of the given gtid set."));
4402   rit = filename_list.rbegin();
4403   error = 0;
4404   while (rit != filename_list.rend()) {
4405     binlog_previous_gtid_set.clear();
4406     const char *filename = rit->c_str();
4407     DBUG_PRINT("info",
4408                ("Read Previous_gtids_log_event from filename='%s'", filename));
4409     switch (read_gtids_from_binlog(filename, nullptr, &binlog_previous_gtid_set,
4410                                    first_gtid,
4411                                    binlog_previous_gtid_set.get_sid_map(),
4412                                    opt_master_verify_checksum, is_relay_log)) {
4413       case ERROR:
4414         *errmsg =
4415             "Error reading header of binary log while looking for "
4416             "the oldest binary log that contains any GTID that is not in "
4417             "the given gtid set";
4418         error = -3;
4419         goto end;
4420       case NO_GTIDS:
4421         *errmsg =
4422             "Found old binary log without GTIDs while looking for "
4423             "the oldest binary log that contains any GTID that is not in "
4424             "the given gtid set";
4425         error = -4;
4426         goto end;
4427       case GOT_GTIDS:
4428       case GOT_PREVIOUS_GTIDS:
4429         if (binlog_previous_gtid_set.is_subset(gtid_set)) {
4430           strcpy(binlog_file_name, filename);
4431           /*
4432             Verify that the selected binlog is not the first binlog,
4433           */
4434           DBUG_EXECUTE_IF("slave_reconnect_with_gtid_set_executed",
4435                           DBUG_ASSERT(strcmp(filename_list.begin()->c_str(),
4436                                              binlog_file_name) != 0););
4437           goto end;
4438         }
4439       case TRUNCATED:
4440         break;
4441     }
4442 
4443     rit++;
4444   }
4445 
4446   if (rit == filename_list.rend()) {
4447     report_missing_gtids(&binlog_previous_gtid_set, gtid_set, errmsg);
4448     error = -5;
4449   }
4450 
4451 end:
4452   if (error) DBUG_PRINT("error", ("'%s'", *errmsg));
4453   filename_list.clear();
4454   DBUG_PRINT("info", ("returning %d", error));
4455   return error != 0 ? true : false;
4456 }
4457 
init_gtid_sets(Gtid_set * all_gtids,Gtid_set * lost_gtids,bool verify_checksum,bool need_lock,Transaction_boundary_parser * trx_parser,Gtid_monitoring_info * partial_trx,bool is_server_starting)4458 bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
4459                                    bool verify_checksum, bool need_lock,
4460                                    Transaction_boundary_parser *trx_parser,
4461                                    Gtid_monitoring_info *partial_trx,
4462                                    bool is_server_starting) {
4463   DBUG_TRACE;
4464   DBUG_PRINT(
4465       "info",
4466       ("lost_gtids=%p; so we are recovering a %s log; is_relay_log=%d",
4467        lost_gtids, lost_gtids == nullptr ? "relay" : "binary", is_relay_log));
4468 
4469   Checkable_rwlock *sid_lock =
4470       is_relay_log ? all_gtids->get_sid_map()->get_sid_lock() : global_sid_lock;
4471   /*
4472     If this is a relay log, we must have the IO thread Master_info trx_parser
4473     in order to correctly feed it with relay log events.
4474   */
4475 #ifndef DBUG_OFF
4476   if (is_relay_log) {
4477     DBUG_ASSERT(trx_parser != nullptr);
4478     DBUG_ASSERT(lost_gtids == nullptr);
4479   }
4480 #endif
4481 
4482   /*
4483     Acquires the necessary locks to ensure that logs are not either
4484     removed or updated when we are reading from it.
4485   */
4486   if (need_lock) {
4487     // We don't need LOCK_log if we are only going to read the initial
    // Previous_gtids_log_event and ignore the Gtid_log_events.
4489     if (all_gtids != nullptr) mysql_mutex_lock(&LOCK_log);
4490     mysql_mutex_lock(&LOCK_index);
4491     sid_lock->wrlock();
4492   } else {
4493     if (all_gtids != nullptr) mysql_mutex_assert_owner(&LOCK_log);
4494     mysql_mutex_assert_owner(&LOCK_index);
4495     sid_lock->assert_some_wrlock();
4496   }
4497 
4498   /* Initialize the sid_map to be used in read_gtids_from_binlog */
4499   Sid_map *sid_map = nullptr;
4500   if (all_gtids)
4501     sid_map = all_gtids->get_sid_map();
4502   else if (lost_gtids)
4503     sid_map = lost_gtids->get_sid_map();
4504 
4505   // Gather the set of files to be accessed.
4506   auto log_index = this->get_log_index(false);
4507   std::list<std::string> filename_list = log_index.second;
4508   int error = log_index.first;
4509   list<string>::iterator it;
4510   list<string>::reverse_iterator rit;
4511   bool reached_first_file = false;
4512 
4513   if (error != LOG_INFO_EOF) {
4514     DBUG_PRINT("error", ("Error reading %s index",
4515                          is_relay_log ? "relaylog" : "binlog"));
4516     goto end;
4517   }
4518   /*
4519     On server starting, one new empty binlog file is created and
4520     its file name is put into index file before initializing
4521     GLOBAL.GTID_EXECUTED AND GLOBAL.GTID_PURGED, it is not the
4522     last binlog file before the server restarts, so we remove
4523     its file name from filename_list.
4524   */
4525   if (is_server_starting && !is_relay_log && !filename_list.empty())
4526     filename_list.pop_back();
4527 
4528   error = 0;
4529   if (all_gtids != nullptr) {
4530     DBUG_PRINT("info", ("Iterating backwards through %s logs, "
4531                         "looking for the last %s log that contains "
4532                         "a Previous_gtids_log_event.",
4533                         is_relay_log ? "relay" : "binary",
4534                         is_relay_log ? "relay" : "binary"));
4535     // Iterate over all files in reverse order until we find one that
4536     // contains a Previous_gtids_log_event.
4537     rit = filename_list.rbegin();
4538     bool can_stop_reading = false;
4539     reached_first_file = (rit == filename_list.rend());
4540     DBUG_PRINT("info",
4541                ("filename='%s' reached_first_file=%d",
4542                 reached_first_file ? "" : rit->c_str(), reached_first_file));
4543     while (!can_stop_reading && !reached_first_file) {
4544       const char *filename = rit->c_str();
4545       DBUG_ASSERT(rit != filename_list.rend());
4546       rit++;
4547       reached_first_file = (rit == filename_list.rend());
4548       DBUG_PRINT("info", ("filename='%s' can_stop_reading=%d "
4549                           "reached_first_file=%d, ",
4550                           filename, can_stop_reading, reached_first_file));
4551       switch (read_gtids_from_binlog(
4552           filename, all_gtids, reached_first_file ? lost_gtids : nullptr,
4553           nullptr /* first_gtid */, sid_map, verify_checksum, is_relay_log)) {
4554         case ERROR: {
4555           error = 1;
4556           goto end;
4557         }
4558         case GOT_GTIDS: {
4559           can_stop_reading = true;
4560           break;
4561         }
4562         case GOT_PREVIOUS_GTIDS: {
4563           /*
4564             If this is a binlog file, it is enough to have GOT_PREVIOUS_GTIDS.
4565             If this is a relaylog file, we need to find at least one GTID to
4566             start parsing the relay log to add GTID of transactions that might
4567             have spanned in distinct relaylog files.
4568           */
4569           if (!is_relay_log) can_stop_reading = true;
4570           break;
4571         }
4572         case NO_GTIDS: {
4573           /*
4574             Mysql server iterates backwards through binary logs, looking for
4575             the last binary log that contains a Previous_gtids_log_event for
4576             gathering the set of gtid_executed on server start. This may take
4577             very long time if it has many binary logs and almost all of them
4578             are out of filesystem cache. So if the binlog_gtid_simple_recovery
4579             is enabled, and the last binary log does not contain any GTID
4580             event, do not read any more binary logs, GLOBAL.GTID_EXECUTED and
4581             GLOBAL.GTID_PURGED should be empty in the case.
4582           */
4583           if (binlog_gtid_simple_recovery && is_server_starting &&
4584               !is_relay_log) {
4585             DBUG_ASSERT(all_gtids->is_empty());
4586             DBUG_ASSERT(lost_gtids->is_empty());
4587             goto end;
4588           }
4589           /*FALLTHROUGH*/
4590         }
4591         case TRUNCATED: {
4592           break;
4593         }
4594       }
4595     }
4596 
4597     /*
4598       If we use GTIDs and have partial transactions on the relay log,
4599       must check if it ends on next relay log files.
4600       We also need to feed the boundary parser with the rest of the
4601       relay log to put it in the correct state before receiving new
4602       events from the master in the case of GTID auto positioning be
4603       disabled.
4604     */
4605     if (is_relay_log && filename_list.size() > 0) {
4606       /*
4607         Suppose the following relaylog:
4608 
4609          rl-bin.000001 | rl-bin.000002 | rl-bin.000003 | rl-bin-000004
4610         ---------------+---------------+---------------+---------------
4611          PREV_GTIDS    | PREV_GTIDS    | PREV_GTIDS    | PREV_GTIDS
4612          (empty)       | (UUID:1)      | (UUID:1)      | (UUID:1)
4613         ---------------+---------------+---------------+---------------
4614          GTID(UUID:1)  | QUERY(INSERT) | QUERY(INSERT) | XID
4615         ---------------+---------------+---------------+---------------
4616          QUERY(CREATE  |
4617          TABLE t1 ...) |
4618         ---------------+
4619          GTID(UUID:2)  |
4620         ---------------+
4621          QUERY(BEGIN)  |
4622         ---------------+
4623 
4624         As it is impossible to determine the current Retrieved_Gtid_Set by only
4625         looking to the PREVIOUS_GTIDS on the last relay log file, and scanning
4626         events on it, we tried to find a relay log file that contains at least
4627         one GTID event during the backwards search.
4628 
4629         In the example, we will find a GTID only in rl-bin.000001, as the
4630         UUID:2 transaction was spanned across 4 relay log files.
4631 
4632         The transaction spanning can be caused by "FLUSH RELAY LOGS" commands
4633         on slave while it is queuing the transaction.
4634 
4635         So, in order to correctly add UUID:2 into Retrieved_Gtid_Set, we need
4636         to parse the relay log starting on the file we found the last GTID
4637         queued to know if the transaction was fully retrieved or not.
4638       */
4639 
4640       /*
4641         Adjust the reverse iterator to point to the relaylog file we
4642         need to start parsing, as it was incremented after generating
4643         the relay log file name.
4644       */
4645       DBUG_ASSERT(rit != filename_list.rbegin());
4646       rit--;
4647       DBUG_ASSERT(rit != filename_list.rend());
4648       /* Reset the transaction parser before feeding it with events */
4649       trx_parser->reset();
4650       partial_trx->clear();
4651 
4652       DBUG_PRINT("info", ("Iterating forwards through relay logs, "
4653                           "updating the Retrieved_Gtid_Set and updating "
4654                           "IO thread trx parser before start."));
4655       for (it = find(filename_list.begin(), filename_list.end(), *rit);
4656            it != filename_list.end(); it++) {
4657         const char *filename = it->c_str();
4658         DBUG_PRINT("info", ("filename='%s'", filename));
4659         if (read_gtids_and_update_trx_parser_from_relaylog(
4660                 filename, all_gtids, true, trx_parser, partial_trx)) {
4661           error = 1;
4662           goto end;
4663         }
4664       }
4665     }
4666   }
4667   if (lost_gtids != nullptr && !reached_first_file) {
4668     /*
      This branch is only reachable by a binary log. The relay log
      doesn't need to get lost_gtids information.
4671 
4672       A 5.6 server sets GTID_PURGED by rotating the binary log.
4673 
4674       A 5.6 server that had recently enabled GTIDs and set GTID_PURGED
4675       would have a sequence of binary logs like:
4676 
4677       master-bin.N  : No PREVIOUS_GTIDS (GTID wasn't enabled)
4678       master-bin.N+1: Has an empty PREVIOUS_GTIDS and a ROTATE
4679                       (GTID was enabled on startup)
4680       master-bin.N+2: Has a PREVIOUS_GTIDS with the content set by a
4681                       SET @@GLOBAL.GTID_PURGED + has GTIDs of some
4682                       transactions.
4683 
4684       If this 5.6 server be upgraded to 5.7 keeping its binary log files,
4685       this routine will have to find the first binary log that contains a
4686       PREVIOUS_GTIDS + a GTID event to ensure that the content of the
4687       GTID_PURGED will be correctly set (assuming binlog_gtid_simple_recovery
4688       is not enabled).
4689     */
4690     DBUG_PRINT("info", ("Iterating forwards through binary logs, looking for "
4691                         "the first binary log that contains both a "
4692                         "Previous_gtids_log_event and a Gtid_log_event."));
4693     DBUG_ASSERT(!is_relay_log);
4694     for (it = filename_list.begin(); it != filename_list.end(); it++) {
4695       /*
4696         We should pass a first_gtid to read_gtids_from_binlog when
4697         binlog_gtid_simple_recovery is disabled, or else it will return
4698         right after reading the PREVIOUS_GTIDS event to avoid stall on
4699         reading the whole binary log.
4700       */
4701       Gtid first_gtid = {0, 0};
4702       const char *filename = it->c_str();
4703       DBUG_PRINT("info", ("filename='%s'", filename));
4704       switch (read_gtids_from_binlog(
4705           filename, nullptr, lost_gtids,
4706           binlog_gtid_simple_recovery ? nullptr : &first_gtid, sid_map,
4707           verify_checksum, is_relay_log)) {
4708         case ERROR: {
4709           error = 1;
4710           /*FALLTHROUGH*/
4711         }
4712         case GOT_GTIDS: {
4713           goto end;
4714         }
4715         case NO_GTIDS:
4716         case GOT_PREVIOUS_GTIDS: {
4717           /*
4718             Mysql server iterates forwards through binary logs, looking for
4719             the first binary log that contains both Previous_gtids_log_event
4720             and gtid_log_event for gathering the set of gtid_purged on server
4721             start. It also iterates forwards through binary logs, looking for
4722             the first binary log that contains both Previous_gtids_log_event
4723             and gtid_log_event for gathering the set of gtid_purged when
4724             purging binary logs. This may take very long time if it has many
4725             binary logs and almost all of them are out of filesystem cache.
4726             So if the binlog_gtid_simple_recovery is enabled, we just
4727             initialize GLOBAL.GTID_PURGED from the first binary log, do not
4728             read any more binary logs.
4729           */
4730           if (binlog_gtid_simple_recovery) goto end;
4731           /*FALLTHROUGH*/
4732         }
4733         case TRUNCATED: {
4734           break;
4735         }
4736       }
4737     }
4738   }
4739 end:
4740   if (all_gtids) all_gtids->dbug_print("all_gtids");
4741   if (lost_gtids) lost_gtids->dbug_print("lost_gtids");
4742   if (need_lock) {
4743     sid_lock->unlock();
4744     mysql_mutex_unlock(&LOCK_index);
4745     if (all_gtids != nullptr) mysql_mutex_unlock(&LOCK_log);
4746   }
4747   filename_list.clear();
4748   DBUG_PRINT("info", ("returning %d", error));
4749   return error != 0 ? true : false;
4750 }
4751 
/**
  Open a (new) binlog file.

  - Generate the log file name, register it in the purge index file, open
    the log file, write the initial events (format description and, where
    applicable, PREVIOUS_GTIDS) and, if the file was empty, append its name
    to the index file.
  - When calling this when the file is in use, you must hold locks
    on LOCK_log and LOCK_index.

  @param log_name         Base file name; forwarded to
                          init_and_set_log_file_name() and open().
  @param new_name         Forwarded together with log_name; when non-null it
                          is also the name reported if the file cannot be
                          used for logging.
  @param max_size_arg     Value stored in max_size.
  @param null_created_arg Copied into dont_set_created of the
                          Format_description_log_event written to the file.
  @param need_lock_index  Forwarded to add_log_to_index(),
                          purge_index_entry() and close(). May only be true
                          when need_sid_lock is also true (asserted).
  @param need_sid_lock    If true, the sid_lock is write-locked here while
                          the PREVIOUS_GTIDS event is built; otherwise
                          assert_some_wrlock() is called on it.
  @param extra_description_event
                          If non-null, this event is serialized into the new
                          file after 'created' is reset to 0 and the event is
                          marked artificial.
  @param new_index_number Forwarded to init_and_set_log_file_name() and
                          open().

  @retval false ok
  @retval true  error
*/

bool MYSQL_BIN_LOG::open_binlog(
    const char *log_name, const char *new_name, ulong max_size_arg,
    bool null_created_arg, bool need_lock_index, bool need_sid_lock,
    Format_description_log_event *extra_description_event,
    uint32 new_index_number) {
  // lock_index must be acquired *before* sid_lock.
  DBUG_ASSERT(need_sid_lock || !need_lock_index);
  DBUG_TRACE;
  DBUG_PRINT("enter", ("base filename: %s", log_name));

  mysql_mutex_assert_owner(get_log_lock());

  if (init_and_set_log_file_name(log_name, new_name, new_index_number)) {
    LogErr(ERROR_LEVEL, ER_BINLOG_CANT_GENERATE_NEW_FILE_NAME);
    return true;
  }

  DBUG_PRINT("info", ("generated filename: %s", log_file_name));

  DEBUG_SYNC(current_thd, "after_log_file_name_initialized");

  /*
    Record the new file name in the purge index file before the log file
    itself is created; the err path below uses that record
    (purge_index_entry) when a later step fails.
  */
  if (open_purge_index_file(true) ||
      register_create_index_entry(log_file_name) || sync_purge_index_file() ||
      DBUG_EVALUATE_IF("fault_injection_registering_index", 1, 0)) {
    /**
      @todo: although this was introduced to appease valgrind
      when injecting emulated faults using fault_injection_registering_index
      it may be good to consider what actually happens when
      open_purge_index_file succeeds but register or sync fails.

      Perhaps we might need the code below in MYSQL_BIN_LOG::cleanup
      for "real life" purposes as well?
    */
    DBUG_EXECUTE_IF("fault_injection_registering_index", {
      if (my_b_inited(&purge_index_file)) {
        end_io_cache(&purge_index_file);
        my_close(purge_index_file.file, MYF(0));
      }
    });

    LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_SYNC_INDEX_FILE_IN_OPEN);
    return true;
  }
  DBUG_EXECUTE_IF("crash_create_non_critical_before_update_index",
                  DBUG_SUICIDE(););

  write_error = false;

  /* open the main log file */
  if (open(m_key_file_log, log_name, new_name, new_index_number)) {
    close_purge_index_file();
    return true; /* all warnings issued */
  }

  max_size = max_size_arg;

  /* Set only when the file was empty and the magic header was written. */
  bool write_file_name_to_index_file = false;

  /* This must be before goto err. */
#ifndef DBUG_OFF
  binary_log_debug::debug_pretend_version_50034_in_binlog =
      DBUG_EVALUATE_IF("pretend_version_50034_in_binlog", true, false);
#endif
  Format_description_log_event s;

  if (m_binlog_file->is_empty()) {
    /*
      The binary log file was empty (probably newly created)
      This is the normal case and happens when the user doesn't specify
      an extension for the binary log files.
      In this case we write a standard header to it.
    */
    if (m_binlog_file->write(pointer_cast<const uchar *>(BINLOG_MAGIC),
                             BIN_LOG_HEADER_SIZE))
      goto err;
    bytes_written += BIN_LOG_HEADER_SIZE;
    write_file_name_to_index_file = true;
  }

  /*
    don't set LOG_EVENT_BINLOG_IN_USE_F for the relay log
  */
  if (!is_relay_log) {
    s.common_header->flags |= LOG_EVENT_BINLOG_IN_USE_F;
  }

  if (is_relay_log) {
    /* relay-log */
    if (relay_log_checksum_alg == binary_log::BINLOG_CHECKSUM_ALG_UNDEF) {
      /* inherit master's A descriptor if one has been received */
      if (opt_slave_sql_verify_checksum == 0)
        /* otherwise use slave's local preference of RL events verification */
        relay_log_checksum_alg = binary_log::BINLOG_CHECKSUM_ALG_OFF;
      else
        relay_log_checksum_alg =
            static_cast<enum_binlog_checksum_alg>(binlog_checksum_options);
    }
  }

  if (!s.is_valid()) goto err;
  s.dont_set_created = null_created_arg;
  /* Set LOG_EVENT_RELAY_LOG_F flag for relay log's FD */
  if (is_relay_log) s.set_relay_log_event();
  if (write_event_to_binlog(&s)) goto err;
  /*
    We need to revisit this code and improve it.
    See further comments in the mysqld.
    /Alfranio
  */
  if (current_thd) {
    Checkable_rwlock *sid_lock = nullptr;
    Gtid_set logged_gtids_binlog(global_sid_map, global_sid_lock);
    Gtid_set *previous_logged_gtids;

    /*
      Relay logs use the relay log's own previous GTID set and its sid_lock;
      binary logs use a local set (filled below) and the global sid_lock.
    */
    if (is_relay_log) {
      previous_logged_gtids = previous_gtid_set_relaylog;
      sid_lock = previous_gtid_set_relaylog->get_sid_map()->get_sid_lock();
    } else {
      previous_logged_gtids = &logged_gtids_binlog;
      sid_lock = global_sid_lock;
    }

    if (need_sid_lock)
      sid_lock->wrlock();
    else
      sid_lock->assert_some_wrlock();

    if (!is_relay_log) {
      const Gtid_set *executed_gtids = gtid_state->get_executed_gtids();
      const Gtid_set *gtids_only_in_table =
          gtid_state->get_gtids_only_in_table();
      /* logged_gtids_binlog= executed_gtids - gtids_only_in_table */
      if (logged_gtids_binlog.add_gtid_set(executed_gtids) !=
          RETURN_STATUS_OK) {
        /* Release the lock taken above before leaving through err. */
        if (need_sid_lock) sid_lock->unlock();
        goto err;
      }
      logged_gtids_binlog.remove_gtid_set(gtids_only_in_table);
    }
    DBUG_PRINT("info", ("Generating PREVIOUS_GTIDS for %s file.",
                        is_relay_log ? "relaylog" : "binlog"));
    Previous_gtids_log_event prev_gtids_ev(previous_logged_gtids);
    if (is_relay_log) prev_gtids_ev.set_relay_log_event();
    /* The event is built; the lock is released before it is written. */
    if (need_sid_lock) sid_lock->unlock();
    if (write_event_to_binlog(&prev_gtids_ev)) goto err;
  } else  // !(current_thd)
  {
    /*
      If the slave was configured before server restart, the server will
      generate a new relay log file without having current_thd, but this
      new relay log file must have a PREVIOUS_GTIDS event as we now
      generate the PREVIOUS_GTIDS event always.

      This is only needed for relay log files because the server will add
      the PREVIOUS_GTIDS of binary logs (when current_thd==NULL) after
      server's GTID initialization.

      During server's startup at mysqld_main(), from the binary/relay log
      initialization point of view, it will:
      1) Call init_server_components() that will generate a new binary log
         file but won't write the PREVIOUS_GTIDS event yet;
      2) Initialize server's GTIDs;
      3) Write the binary log PREVIOUS_GTIDS;
      4) Call init_slave() in where the new relay log file will be created
         after initializing relay log's Retrieved_Gtid_Set;
    */
    if (is_relay_log) {
      Sid_map *previous_gtid_sid_map =
          previous_gtid_set_relaylog->get_sid_map();
      Checkable_rwlock *sid_lock = previous_gtid_sid_map->get_sid_lock();

      if (need_sid_lock)
        sid_lock->wrlock();
      else
        sid_lock->assert_some_wrlock(); /* purecov: inspected */

      DBUG_PRINT("info", ("Generating PREVIOUS_GTIDS for relaylog file."));
      Previous_gtids_log_event prev_gtids_ev(previous_gtid_set_relaylog);
      prev_gtids_ev.set_relay_log_event();

      if (need_sid_lock) sid_lock->unlock();

      if (write_event_to_binlog(&prev_gtids_ev)) goto err;
    }
  }
  if (extra_description_event) {
    /*
      This is a relay log written to by the I/O slave thread.
      Write the event so that others can later know the format of this relay
      log.
      Note that this event is very close to the original event from the
      master (it has binlog version of the master, event types of the
      master), so this is suitable to parse the next relay log's event. It
      has been produced by
      Format_description_log_event::Format_description_log_event(char* buf,).
      Why don't we want to write the mi_description_event if this
      event is for format<4 (3.23 or 4.x): this is because in that case, the
      mi_description_event describes the data received from the
      master, but not the data written to the relay log (*conversion*),
      which is in format 4 (slave's).
    */
    /*
      Set 'created' to 0, so that in next relay logs this event does not
      trigger cleaning actions on the slave in
      Format_description_log_event::apply_event_impl().
    */
    extra_description_event->created = 0;
    /* Don't set log_pos in event header */
    extra_description_event->set_artificial_event();

    if (binary_event_serialize(extra_description_event, m_binlog_file))
      goto err;
    bytes_written += extra_description_event->common_header->data_written;
  }
  if (m_binlog_file->flush_and_sync()) goto err;

  if (write_file_name_to_index_file) {
    DBUG_EXECUTE_IF("crash_create_critical_before_update_index",
                    DBUG_SUICIDE(););
    DBUG_ASSERT(my_b_inited(&index_file) != 0);

    /*
      The new log file name is appended into crash safe index file after
      all the content of index file is copied into the crash safe index
      file. Then move the crash safe index file to index file.
    */
    DBUG_EXECUTE_IF("simulate_disk_full_on_open_binlog",
                    { DBUG_SET("+d,simulate_no_free_space_error"); });
    if (DBUG_EVALUATE_IF("fault_injection_updating_index", 1, 0) ||
        add_log_to_index((uchar *)log_file_name, strlen(log_file_name),
                         need_lock_index)) {
      DBUG_EXECUTE_IF("simulate_disk_full_on_open_binlog", {
        DBUG_SET("-d,simulate_file_write_error");
        DBUG_SET("-d,simulate_no_free_space_error");
        DBUG_SET("-d,simulate_disk_full_on_open_binlog");
      });
      goto err;
    }

    DBUG_EXECUTE_IF("crash_create_after_update_index", DBUG_SUICIDE(););
  }

  atomic_log_state = LOG_OPENED;
  /*
    At every rotate memorize the last transaction counter state to use it as
    offset at logging the transaction logical timestamps.
  */
  mysql_mutex_lock(&LOCK_slave_trans_dep_tracker);
  m_dependency_tracker.rotate();
  mysql_mutex_unlock(&LOCK_slave_trans_dep_tracker);

  close_purge_index_file();

  update_binlog_end_pos();
  return false;

err:
  /*
    Common failure exit: process the purge index entries registered above,
    then either abort the server or log the error and close the log,
    depending on binlog_error_action.
  */
  if (is_inited_purge_index_file())
    purge_index_entry(nullptr, nullptr, need_lock_index);
  close_purge_index_file();
  if (binlog_error_action == ABORT_SERVER) {
    exec_binlog_error_action_abort(
        "Either disk is full, file system is read only or "
        "there was an encryption error while opening the binlog. "
        "Aborting the server.");
  } else {
    LogErr(ERROR_LEVEL, ER_BINLOG_CANT_USE_FOR_LOGGING,
           (new_name) ? new_name : name, errno);
    close(LOG_CLOSE_INDEX, false, need_lock_index);
  }
  return true;
}
5038 
5039 /**
5040   Move crash safe index file to index file.
5041 
5042   @param need_lock_index If true, LOCK_index will be acquired;
5043   otherwise it should already be held.
5044 
5045   @retval 0 ok
5046   @retval -1 error
5047 */
move_crash_safe_index_file_to_index_file(bool need_lock_index)5048 int MYSQL_BIN_LOG::move_crash_safe_index_file_to_index_file(
5049     bool need_lock_index) {
5050   int error = 0;
5051   File fd = -1;
5052   DBUG_TRACE;
5053   int failure_trials = MYSQL_BIN_LOG::MAX_RETRIES_FOR_DELETE_RENAME_FAILURE;
5054   bool file_rename_status = false, file_delete_status = false;
5055   THD *thd = current_thd;
5056 
5057   if (need_lock_index)
5058     mysql_mutex_lock(&LOCK_index);
5059   else
5060     mysql_mutex_assert_owner(&LOCK_index);
5061 
5062   if (my_b_inited(&index_file)) {
5063     end_io_cache(&index_file);
5064     if (mysql_file_close(index_file.file, MYF(0)) < 0) {
5065       error = -1;
5066       LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_CLOSE_INDEX_FILE_WHILE_REBUILDING,
5067              index_file_name);
5068       /*
5069         Delete Crash safe index file here and recover the binlog.index
5070         state(index_file io_cache) from old binlog.index content.
5071        */
5072       mysql_file_delete(key_file_binlog_index, crash_safe_index_file_name,
5073                         MYF(0));
5074 
5075       goto recoverable_err;
5076     }
5077 
5078     /*
5079       Sometimes an outsider can lock index files for temporary viewing
5080       purpose. For eg: MEB locks binlog.index/relaylog.index to view
5081       the content of the file. During that small period of time, deletion
5082       of the file is not possible on some platforms(Eg: Windows)
5083       Server should retry the delete operation for few times instead of
5084       panicking immediately.
5085     */
5086     while ((file_delete_status == false) && (failure_trials > 0)) {
5087       if (DBUG_EVALUATE_IF("force_index_file_delete_failure", 1, 0)) break;
5088 
5089       DBUG_EXECUTE_IF("simulate_index_file_delete_failure", {
5090         /* This simulation causes the delete to fail */
5091         static char first_char = index_file_name[0];
5092         index_file_name[0] = 0;
5093         sql_print_information("Retrying delete");
5094         if (failure_trials == 1) index_file_name[0] = first_char;
5095       };);
5096       file_delete_status = !(mysql_file_delete(key_file_binlog_index,
5097                                                index_file_name, MYF(MY_WME)));
5098       --failure_trials;
5099       if (!file_delete_status) {
5100         my_sleep(1000);
5101         /* Clear the error before retrying. */
5102         if (failure_trials > 0) thd->clear_error();
5103       }
5104     }
5105 
5106     if (!file_delete_status) {
5107       error = -1;
5108       LogErr(ERROR_LEVEL,
5109              ER_BINLOG_FAILED_TO_DELETE_INDEX_FILE_WHILE_REBUILDING,
5110              index_file_name);
5111       /*
5112         Delete Crash safe file index file here and recover the binlog.index
5113         state(index_file io_cache) from old binlog.index content.
5114        */
5115       mysql_file_delete(key_file_binlog_index, crash_safe_index_file_name,
5116                         MYF(0));
5117 
5118       goto recoverable_err;
5119     }
5120   }
5121 
5122   DBUG_EXECUTE_IF("crash_create_before_rename_index_file", DBUG_SUICIDE(););
5123   /*
5124     Sometimes an outsider can lock index files for temporary viewing
5125     purpose. For eg: MEB locks binlog.index/relaylog.index to view
5126     the content of the file. During that small period of time, rename
5127     of the file is not possible on some platforms(Eg: Windows)
5128     Server should retry the rename operation for few times instead of panicking
5129     immediately.
5130   */
5131   failure_trials = MYSQL_BIN_LOG::MAX_RETRIES_FOR_DELETE_RENAME_FAILURE;
5132   while ((file_rename_status == false) && (failure_trials > 0)) {
5133     DBUG_EXECUTE_IF("simulate_crash_safe_index_file_rename_failure", {
5134       /* This simulation causes the rename to fail */
5135       static char first_char = index_file_name[0];
5136       index_file_name[0] = 0;
5137       sql_print_information("Retrying rename");
5138       if (failure_trials == 1) index_file_name[0] = first_char;
5139     };);
5140     file_rename_status =
5141         !(my_rename(crash_safe_index_file_name, index_file_name, MYF(MY_WME)));
5142     --failure_trials;
5143     if (!file_rename_status) {
5144       my_sleep(1000);
5145       /* Clear the error before retrying. */
5146       if (failure_trials > 0) thd->clear_error();
5147     }
5148   }
5149   if (!file_rename_status) {
5150     error = -1;
5151     LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_RENAME_INDEX_FILE_WHILE_REBUILDING,
5152            index_file_name);
5153     goto fatal_err;
5154   }
5155   DBUG_EXECUTE_IF("crash_create_after_rename_index_file", DBUG_SUICIDE(););
5156 
5157 recoverable_err:
5158   if ((fd = mysql_file_open(key_file_binlog_index, index_file_name,
5159                             O_RDWR | O_CREAT, MYF(MY_WME))) < 0 ||
5160       mysql_file_sync(fd, MYF(MY_WME)) ||
5161       init_io_cache_ext(&index_file, fd, IO_SIZE, READ_CACHE,
5162                         mysql_file_seek(fd, 0L, MY_SEEK_END, MYF(0)), false,
5163                         MYF(MY_WME | MY_WAIT_IF_FULL),
5164                         key_file_binlog_index_cache)) {
5165     LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_OPEN_INDEX_FILE_AFTER_REBUILDING,
5166            index_file_name);
5167     goto fatal_err;
5168   }
5169 
5170   if (need_lock_index) mysql_mutex_unlock(&LOCK_index);
5171   return error;
5172 
5173 fatal_err:
5174   /*
5175     This situation is very very rare to happen (unless there is some serious
5176     memory related issues like OOM) and should be treated as fatal error.
5177     Hence it is better to bring down the server without respecting
5178     'binlog_error_action' value here.
5179   */
5180   exec_binlog_error_action_abort(
5181       "MySQL server failed to update the "
5182       "binlog.index file's content properly. "
5183       "It might not be in sync with available "
5184       "binlogs and the binlog.index file state is in "
5185       "unrecoverable state. Aborting the server.");
5186   /*
5187     Server is aborted in the above function.
5188     This is dead code to make compiler happy.
5189    */
5190   return error;
5191 }
5192 
5193 /**
5194   Append log file name to index file.
5195 
5196   - To make crash safe, we copy all the content of index file
5197   to crash safe index file firstly and then append the log
5198   file name to the crash safe index file. Finally move the
5199   crash safe index file to index file.
5200 
5201   @retval
5202     0   ok
5203   @retval
5204     -1   error
5205 */
add_log_to_index(uchar * log_name,size_t log_name_len,bool need_lock_index)5206 int MYSQL_BIN_LOG::add_log_to_index(uchar *log_name, size_t log_name_len,
5207                                     bool need_lock_index) {
5208   DBUG_TRACE;
5209 
5210   if (open_crash_safe_index_file()) {
5211     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_OPEN_TMP_INDEX,
5212            "MYSQL_BIN_LOG::add_log_to_index");
5213     goto err;
5214   }
5215 
5216   if (copy_file(&index_file, &crash_safe_index_file, 0)) {
5217     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_COPY_INDEX_TO_TMP,
5218            "MYSQL_BIN_LOG::add_log_to_index");
5219     goto err;
5220   }
5221 
5222   if (my_b_write(&crash_safe_index_file, log_name, log_name_len) ||
5223       my_b_write(&crash_safe_index_file, pointer_cast<const uchar *>("\n"),
5224                  1) ||
5225       flush_io_cache(&crash_safe_index_file) ||
5226       mysql_file_sync(crash_safe_index_file.file, MYF(MY_WME))) {
5227     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_APPEND_LOG_TO_TMP_INDEX, log_name);
5228     goto err;
5229   }
5230 
5231   if (close_crash_safe_index_file()) {
5232     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_CLOSE_TMP_INDEX,
5233            "MYSQL_BIN_LOG::add_log_to_index");
5234     goto err;
5235   }
5236 
5237   if (move_crash_safe_index_file_to_index_file(need_lock_index)) {
5238     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_MOVE_TMP_TO_INDEX,
5239            "MYSQL_BIN_LOG::add_log_to_index");
5240     goto err;
5241   }
5242 
5243   return 0;
5244 
5245 err:
5246   return -1;
5247 }
5248 
get_current_log(LOG_INFO * linfo,bool need_lock_log)5249 int MYSQL_BIN_LOG::get_current_log(LOG_INFO *linfo,
5250                                    bool need_lock_log /*true*/) {
5251   if (need_lock_log) mysql_mutex_lock(&LOCK_log);
5252   int ret = raw_get_current_log(linfo);
5253   if (need_lock_log) mysql_mutex_unlock(&LOCK_log);
5254   return ret;
5255 }
5256 
raw_get_current_log(LOG_INFO * linfo)5257 int MYSQL_BIN_LOG::raw_get_current_log(LOG_INFO *linfo) {
5258   strmake(linfo->log_file_name, log_file_name,
5259           sizeof(linfo->log_file_name) - 1);
5260   linfo->pos = m_binlog_file->position();
5261   linfo->encrypted_header_size = m_binlog_file->get_encrypted_header_size();
5262   return 0;
5263 }
5264 
check_write_error(const THD * thd)5265 bool MYSQL_BIN_LOG::check_write_error(const THD *thd) {
5266   DBUG_TRACE;
5267 
5268   bool checked = false;
5269 
5270   if (!thd->is_error()) return checked;
5271 
5272   switch (thd->get_stmt_da()->mysql_errno()) {
5273     case ER_TRANS_CACHE_FULL:
5274     case ER_STMT_CACHE_FULL:
5275     case ER_ERROR_ON_WRITE:
5276     case ER_BINLOG_LOGGING_IMPOSSIBLE:
5277       checked = true;
5278       break;
5279   }
5280   DBUG_PRINT("return", ("checked: %s", YESNO(checked)));
5281   return checked;
5282 }
5283 
report_cache_write_error(THD * thd,bool is_transactional)5284 void MYSQL_BIN_LOG::report_cache_write_error(THD *thd, bool is_transactional) {
5285   DBUG_TRACE;
5286 
5287   write_error = true;
5288 
5289   if (check_write_error(thd)) return;
5290 
5291   if (my_errno() == EFBIG) {
5292     if (is_transactional) {
5293       my_error(ER_TRANS_CACHE_FULL, MYF(MY_WME));
5294     } else {
5295       my_error(ER_STMT_CACHE_FULL, MYF(MY_WME));
5296     }
5297   } else {
5298     char errbuf[MYSYS_STRERROR_SIZE];
5299     my_error(ER_ERROR_ON_WRITE, MYF(MY_WME), name, errno,
5300              my_strerror(errbuf, sizeof(errbuf), errno));
5301   }
5302 }
5303 
compare_log_name(const char * log_1,const char * log_2)5304 static int compare_log_name(const char *log_1, const char *log_2) {
5305   const char *log_1_basename = log_1 + dirname_length(log_1);
5306   const char *log_2_basename = log_2 + dirname_length(log_2);
5307 
5308   return strcmp(log_1_basename, log_2_basename);
5309 }
5310 
/**
  Find the position in the log-index-file for the given log name.

  The index file is scanned from its beginning, one line at a time. Each
  line is normalized and compared with the normalized log_name using
  compare_log_name().

  @param[out] linfo The found log file name will be stored here, along
  with the byte offset of its entry (index_file_start_offset) and the byte
  offset of the next log file name in the index file (index_file_offset).
  linfo->entry_index is incremented once for every entry skipped before the
  match; it is not reset by this function.
  @param log_name Filename to find in the index file, or NULL if we
  want to read the first entry.
  @param need_lock_index If true, this function acquires LOCK_index;
  otherwise the lock should already be held by the caller.

  @note
    On systems without the truncate function the file will end with one or
    more empty lines.  These will be ignored when reading the file.

  @retval
    0                   ok
  @retval
    LOG_INFO_EOF        End of log-index-file found, or a name could not be
                        normalized
  @retval
    LOG_INFO_IO         Got IO error while reading file, or the index file
                        is not initialized
*/

int MYSQL_BIN_LOG::find_log_pos(LOG_INFO *linfo, const char *log_name,
                                bool need_lock_index) {
  int error = 0;
  /* Normalized names are written straight into the caller's LOG_INFO. */
  char *full_fname = linfo->log_file_name;
  char full_log_name[FN_REFLEN], fname[FN_REFLEN];
  DBUG_TRACE;
  full_log_name[0] = full_fname[0] = 0;

  /*
    Mutex needed because we need to make sure the file pointer does not
    move from under our feet
  */
  if (need_lock_index)
    mysql_mutex_lock(&LOCK_index);
  else
    mysql_mutex_assert_owner(&LOCK_index);

  if (!my_b_inited(&index_file)) {
    error = LOG_INFO_IO;
    goto end;
  }

  // extend relative paths for log_name to be searched
  if (log_name) {
    if (normalize_binlog_name(full_log_name, log_name, is_relay_log)) {
      error = LOG_INFO_EOF;
      goto end;
    }
  }

  DBUG_PRINT("enter", ("log_name: %s, full_log_name: %s",
                       log_name ? log_name : "NULL", full_log_name));

  /* As the file is flushed, we can't get an error here */
  my_b_seek(&index_file, (my_off_t)0);

  for (;;) {
    size_t length;
    /* Offset of the entry about to be read; reported if it matches. */
    my_off_t offset = my_b_tell(&index_file);

    DBUG_EXECUTE_IF("simulate_find_log_pos_error", error = LOG_INFO_EOF;
                    break;);
    /* If we get 0 or 1 characters, this is the end of the file */
    if ((length = my_b_gets(&index_file, fname, FN_REFLEN)) <= 1) {
      /* Did not find the given entry; Return not found or error */
      error = !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
      break;
    }

    // extend relative paths and match against full path
    if (normalize_binlog_name(full_fname, fname, is_relay_log)) {
      error = LOG_INFO_EOF;
      break;
    }
    // if the log entry matches, null string matching anything
    if (!log_name || !compare_log_name(full_fname, full_log_name)) {
      DBUG_PRINT("info", ("Found log file entry"));
      linfo->index_file_start_offset = offset;
      linfo->index_file_offset = my_b_tell(&index_file);
      break;
    }
    linfo->entry_index++;
  }

end:
  if (need_lock_index) mysql_mutex_unlock(&LOCK_index);
  return error;
}
5401 
5402 /**
5403   Find the position in the log-index-file for the given log name.
5404 
5405   @param[out] linfo The filename will be stored here, along with the
5406   byte offset of the next filename in the index file.
5407 
5408   @param need_lock_index If true, LOCK_index will be acquired;
5409   otherwise it should already be held by the caller.
5410 
5411   @note
5412     - Before calling this function, one has to call find_log_pos()
5413     to set up 'linfo'
5414     - Mutex needed because we need to make sure the file pointer does not move
5415     from under our feet
5416 
5417   @retval 0 ok
5418   @retval LOG_INFO_EOF End of log-index-file found
5419   @retval LOG_INFO_IO Got IO error while reading file
5420 */
find_next_log(LOG_INFO * linfo,bool need_lock_index)5421 int MYSQL_BIN_LOG::find_next_log(LOG_INFO *linfo, bool need_lock_index) {
5422   int error = 0;
5423   size_t length;
5424   char fname[FN_REFLEN];
5425   char *full_fname = linfo->log_file_name;
5426 
5427   if (need_lock_index)
5428     mysql_mutex_lock(&LOCK_index);
5429   else
5430     mysql_mutex_assert_owner(&LOCK_index);
5431 
5432   if (!my_b_inited(&index_file)) {
5433     error = LOG_INFO_IO;
5434     goto err;
5435   }
5436   /* As the file is flushed, we can't get an error here */
5437   my_b_seek(&index_file, linfo->index_file_offset);
5438 
5439   linfo->index_file_start_offset = linfo->index_file_offset;
5440   if ((length = my_b_gets(&index_file, fname, FN_REFLEN)) <= 1) {
5441     error = !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
5442     goto err;
5443   }
5444 
5445   if (fname[0] != 0) {
5446     if (normalize_binlog_name(full_fname, fname, is_relay_log)) {
5447       error = LOG_INFO_EOF;
5448       goto err;
5449     }
5450     length = strlen(full_fname);
5451   }
5452 
5453   linfo->index_file_offset = my_b_tell(&index_file);
5454 
5455 err:
5456   if (need_lock_index) mysql_mutex_unlock(&LOCK_index);
5457   return error;
5458 }
5459 
5460 /**
5461   Find the relay log name following the given name from relay log index file.
5462 
5463   @param[in,out] log_name  The name is full path name.
5464 
5465   @return return 0 if it finds next relay log. Otherwise return the error code.
5466 */
find_next_relay_log(char log_name[FN_REFLEN+1])5467 int MYSQL_BIN_LOG::find_next_relay_log(char log_name[FN_REFLEN + 1]) {
5468   LOG_INFO info;
5469   int error;
5470   char relative_path_name[FN_REFLEN + 1];
5471 
5472   if (fn_format(relative_path_name, log_name + dirname_length(log_name),
5473                 mysql_data_home, "", 0) == NullS)
5474     return 1;
5475 
5476   mysql_mutex_lock(&LOCK_index);
5477 
5478   error = find_log_pos(&info, relative_path_name, false);
5479   if (error == 0) {
5480     error = find_next_log(&info, false);
5481     if (error == 0) strcpy(log_name, info.log_file_name);
5482   }
5483 
5484   mysql_mutex_unlock(&LOCK_index);
5485   return error;
5486 }
5487 
get_log_index(bool need_lock_index)5488 std::pair<int, std::list<std::string>> MYSQL_BIN_LOG::get_log_index(
5489     bool need_lock_index) {
5490   DBUG_TRACE;
5491   LOG_INFO log_info;
5492 
5493   if (need_lock_index)
5494     mysql_mutex_lock(&LOCK_index);
5495   else
5496     mysql_mutex_assert_owner(&LOCK_index);
5497 
5498   std::list<std::string> filename_list;
5499   int error = 0;
5500   for (error =
5501            this->find_log_pos(&log_info, nullptr, false /*need_lock_index*/);
5502        error == 0;
5503        error = this->find_next_log(&log_info, false /*need_lock_index*/)) {
5504     filename_list.push_back(std::string(log_info.log_file_name));
5505   }
5506 
5507   if (need_lock_index) mysql_mutex_unlock(&LOCK_index);
5508 
5509   return std::make_pair(error, filename_list);
5510 }
5511 
/**
  Removes files, as part of a RESET MASTER or RESET SLAVE statement,
  by deleting all logs referred to in the index file and the index
  file. Then, it creates a new index file and a new log file.

  The new index file will only contain the new log file.

  After the storage engine logs have been flushed, LOCK_log, LOCK_index and
  the applicable sid lock (in write mode) are acquired and held until the
  function returns.

  @param thd Thread
  @param delete_only If true, do not create a new index file and
  a new log file.

  @note
    If not called from slave thread, write start event to new log

  @retval false  ok
  @retval true   error
*/
bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool delete_only) {
  LOG_INFO linfo;
  bool error = false;
  int err;
  const char *save_name = nullptr;
  Checkable_rwlock *sid_lock = nullptr;
  DBUG_TRACE;

  /*
    Flush logs for storage engines, so that the last transaction
    is persisted inside storage engines.
    The log-reset flag on the THD is set only for the duration of the flush.
  */
  DBUG_ASSERT(!thd->is_log_reset());
  thd->set_log_reset();
  if (ha_flush_logs()) {
    thd->clear_log_reset();
    return true;
  }
  thd->clear_log_reset();

  ha_reset_logs(thd);

  /*
    We need to get both locks to be sure that no one is trying to
    write to the index log file.
  */
  mysql_mutex_lock(&LOCK_log);
  mysql_mutex_lock(&LOCK_index);

  /* Relay logs use the sid lock of their own sid map, binlogs the global. */
  if (is_relay_log)
    sid_lock = previous_gtid_set_relaylog->get_sid_map()->get_sid_lock();
  else
    sid_lock = global_sid_lock;
  sid_lock->wrlock();

  /* Save variables so that we can reopen the log */
  save_name = name;
  name = nullptr;  // Protect against free
  close(LOG_CLOSE_TO_BE_OPENED, false /*need_lock_log=false*/,
        false /*need_lock_index=false*/);

  /*
    First delete all old log files and then update the index file.
    As we delete the log files first and do not use any sort of logging,
    a crash may lead to an inconsistent state where the index has
    references to non-existent files.

    We need to invert the steps and use the purge_index_file methods
    in order to make the operation safe.
  */

  if ((err = find_log_pos(&linfo, NullS, false /*need_lock_index=false*/)) !=
      0) {
    uint errcode = purge_log_get_error_code(err);
    LogErr(ERROR_LEVEL, ER_BINLOG_CANT_LOCATE_OLD_BINLOG_OR_RELAY_LOG_FILES);
    my_error(errcode, MYF(0));
    error = true;
    goto err;
  }

  /* Delete every file listed in the index, one entry per iteration. */
  for (;;) {
    if ((error = my_delete_allow_opened(linfo.log_file_name, MYF(0))) != 0) {
      if (my_errno() == ENOENT) {
        /* A file that is already gone only produces a warning. */
        push_warning_printf(
            current_thd, Sql_condition::SL_WARNING, ER_LOG_PURGE_NO_FILE,
            ER_THD(current_thd, ER_LOG_PURGE_NO_FILE), linfo.log_file_name);
        LogErr(INFORMATION_LEVEL, ER_BINLOG_CANT_DELETE_FILE,
               linfo.log_file_name);
        set_my_errno(0);
        error = false;
      } else {
        /* Any other deletion failure aborts the reset. */
        push_warning_printf(current_thd, Sql_condition::SL_WARNING,
                            ER_BINLOG_PURGE_FATAL_ERR,
                            "a problem with deleting %s; "
                            "consider examining correspondence "
                            "of your binlog index file "
                            "to the actual binlog files",
                            linfo.log_file_name);
        error = true;
        goto err;
      }
    }
    if (find_next_log(&linfo, false /*need_lock_index=false*/)) break;
  }

  /* Start logging with a new file */
  close(LOG_CLOSE_INDEX | LOG_CLOSE_TO_BE_OPENED, false /*need_lock_log=false*/,
        false /*need_lock_index=false*/);
  if ((error = my_delete_allow_opened(index_file_name,
                                      MYF(0))))  // Reset (open will update)
  {
    /* Same policy as for the log files: ENOENT warns, anything else aborts. */
    if (my_errno() == ENOENT) {
      push_warning_printf(
          current_thd, Sql_condition::SL_WARNING, ER_LOG_PURGE_NO_FILE,
          ER_THD(current_thd, ER_LOG_PURGE_NO_FILE), index_file_name);
      LogErr(INFORMATION_LEVEL, ER_BINLOG_CANT_DELETE_FILE, index_file_name);
      set_my_errno(0);
      error = false;
    } else {
      push_warning_printf(current_thd, Sql_condition::SL_WARNING,
                          ER_BINLOG_PURGE_FATAL_ERR,
                          "a problem with deleting %s; "
                          "consider examining correspondence "
                          "of your binlog index file "
                          "to the actual binlog files",
                          index_file_name);
      error = true;
      goto err;
    }
  }
  DBUG_EXECUTE_IF("wait_for_kill_gtid_state_clear", {
    const char action[] = "now WAIT_FOR kill_gtid_state_clear";
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(action)));
  };);

  /*
    For relay logs we clear the gtid state associated per channel(i.e rli)
    in the purge_relay_logs()
  */
  if (!is_relay_log) {
    if (gtid_state->clear(thd)) {
      error = true;
    }
    /*
      Don't clear global_sid_map because gtid_state->clear() above didn't
      touch the owned_gtids GTID set.
      Because of the short-circuit, init() is skipped if clear() failed.
    */
    error = error || gtid_state->init();
  }

  if (!delete_only) {
    /*
      NOTE(review): when open_index_file() returns non-zero, `error` is left
      unchanged here - confirm that this is intended.
    */
    if (!open_index_file(index_file_name, nullptr,
                         false /*need_lock_index=false*/))
      error = open_binlog(save_name, nullptr, max_size, false,
                          false /*need_lock_index=false*/,
                          false /*need_sid_lock=false*/, nullptr,
                          thd->lex->next_binlog_file_nr) ||
              error;
  }
  /* String has been duplicated, free old file-name */
  if (name != nullptr) {
    my_free(const_cast<char *>(save_name));
    save_name = nullptr;
  }

err:
  /* If no new name was installed, put the saved one back. */
  if (name == nullptr)
    name = const_cast<char *>(save_name);  // restore old file-name
  sid_lock->unlock();
  mysql_mutex_unlock(&LOCK_index);
  mysql_mutex_unlock(&LOCK_log);
  return error;
}
5684 
5685 /**
5686   Set the name of crash safe index file.
5687 
5688   @retval
5689     0   ok
5690   @retval
5691     1   error
5692 */
set_crash_safe_index_file_name(const char * base_file_name)5693 int MYSQL_BIN_LOG::set_crash_safe_index_file_name(const char *base_file_name) {
5694   int error = 0;
5695   DBUG_TRACE;
5696   if (fn_format(crash_safe_index_file_name, base_file_name, mysql_data_home,
5697                 ".index_crash_safe",
5698                 MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH | MY_REPLACE_EXT)) ==
5699       nullptr) {
5700     error = 1;
5701     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_SET_TMP_INDEX_NAME);
5702   }
5703   return error;
5704 }
5705 
5706 /**
5707   Open a (new) crash safe index file.
5708 
5709   @note
5710     The crash safe index file is a special file
5711     used for guaranteeing index file crash safe.
5712   @retval
5713     0   ok
5714   @retval
5715     1   error
5716 */
open_crash_safe_index_file()5717 int MYSQL_BIN_LOG::open_crash_safe_index_file() {
5718   int error = 0;
5719   File file = -1;
5720 
5721   DBUG_TRACE;
5722 
5723   if (!my_b_inited(&crash_safe_index_file)) {
5724     myf flags = MY_WME | MY_NABP | MY_WAIT_IF_FULL;
5725     if (is_relay_log) flags = flags | MY_REPORT_WAITING_IF_FULL;
5726 
5727     if ((file = my_open(crash_safe_index_file_name, O_RDWR | O_CREAT,
5728                         MYF(MY_WME))) < 0 ||
5729         init_io_cache(&crash_safe_index_file, file, IO_SIZE, WRITE_CACHE, 0,
5730                       false, flags)) {
5731       error = 1;
5732       LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_OPEN_TEMPORARY_INDEX_FILE);
5733     }
5734   }
5735   return error;
5736 }
5737 
5738 /**
5739   Close the crash safe index file.
5740 
5741   @note
5742     The crash safe file is just closed, is not deleted.
5743     Because it is moved to index file later on.
5744   @retval
5745     0   ok
5746   @retval
5747     1   error
5748 */
close_crash_safe_index_file()5749 int MYSQL_BIN_LOG::close_crash_safe_index_file() {
5750   int error = 0;
5751 
5752   DBUG_TRACE;
5753 
5754   if (my_b_inited(&crash_safe_index_file)) {
5755     end_io_cache(&crash_safe_index_file);
5756     error = my_close(crash_safe_index_file.file, MYF(0));
5757   }
5758   crash_safe_index_file = IO_CACHE();
5759 
5760   return error;
5761 }
5762 
5763 /**
5764   Remove logs from index file.
5765 
5766   - To make crash safe, we copy the content of index file
5767   from index_file_start_offset recored in log_info to
5768   crash safe index file firstly and then move the crash
5769   safe index file to index file.
5770 
5771   @param log_info               Store here the found log file name and
5772                                 position to the NEXT log file name in
5773                                 the index file.
5774 
5775   @param need_update_threads    If we want to update the log coordinates
5776                                 of all threads. False for relay logs,
5777                                 true otherwise.
5778 
5779   @retval
5780     0    ok
5781   @retval
5782     LOG_INFO_IO    Got IO error while reading/writing file
5783 */
remove_logs_from_index(LOG_INFO * log_info,bool need_update_threads)5784 int MYSQL_BIN_LOG::remove_logs_from_index(LOG_INFO *log_info,
5785                                           bool need_update_threads) {
5786   if (open_crash_safe_index_file()) {
5787     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_OPEN_TMP_INDEX,
5788            "MYSQL_BIN_LOG::remove_logs_from_index");
5789     goto err;
5790   }
5791 
5792   if (copy_file(&index_file, &crash_safe_index_file,
5793                 log_info->index_file_start_offset)) {
5794     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_COPY_INDEX_TO_TMP,
5795            "MYSQL_BIN_LOG::remove_logs_from_index");
5796     goto err;
5797   }
5798 
5799   if (close_crash_safe_index_file()) {
5800     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_CLOSE_TMP_INDEX,
5801            "MYSQL_BIN_LOG::remove_logs_from_index");
5802     goto err;
5803   }
5804   DBUG_EXECUTE_IF("fault_injection_copy_part_file", DBUG_SUICIDE(););
5805 
5806   if (move_crash_safe_index_file_to_index_file(
5807           false /*need_lock_index=false*/)) {
5808     LogErr(ERROR_LEVEL, ER_BINLOG_CANT_MOVE_TMP_TO_INDEX,
5809            "MYSQL_BIN_LOG::remove_logs_from_index");
5810     goto err;
5811   }
5812 
5813   // now update offsets in index file for running threads
5814   if (need_update_threads)
5815     adjust_linfo_offsets(log_info->index_file_start_offset);
5816   return 0;
5817 
5818 err:
5819   return LOG_INFO_IO;
5820 }
5821 
/**
  Remove all logs before the given log from disk and from the index file.

  The files to delete are first registered in the purge index file, then the
  index file is rewritten, and only after that are the registered files
  deleted from disk.

  @param to_log              Delete all log files listed before this file.
  @param included            If true, to_log is deleted too.
  @param need_lock_index     Set to true if LOCK_index shall be acquired
                             here, false if the caller already owns it.
  @param need_update_threads If we want to update the log coordinates of
                             all threads. False for relay logs, true otherwise.
  @param decrease_log_space  If not null, decrement this variable by
                             the amount of log space freed
  @param auto_purge          True if this is an automatic purge.

  @note
    If any of the logs before the deleted one is in use,
    only purge logs up to this one.

  @retval 0			ok
  @retval LOG_INFO_EOF		to_log not found
  @retval LOG_INFO_EMFILE       too many files opened
  @retval LOG_INFO_FATAL        if any other than ENOENT error from
                                mysql_file_stat() or mysql_file_delete()
*/

int MYSQL_BIN_LOG::purge_logs(const char *to_log, bool included,
                              bool need_lock_index, bool need_update_threads,
                              ulonglong *decrease_log_space, bool auto_purge) {
  int error = 0, no_of_log_files_to_purge = 0, no_of_log_files_purged = 0;
  int no_of_threads_locking_log = 0;
  bool exit_loop = false;
  LOG_INFO log_info;
  THD *thd = current_thd;
  DBUG_TRACE;
  DBUG_PRINT("info", ("to_log= %s", to_log));

  if (need_lock_index)
    mysql_mutex_lock(&LOCK_index);
  else
    mysql_mutex_assert_owner(&LOCK_index);
  /* First make sure that to_log is listed in the index at all. */
  if ((error =
           find_log_pos(&log_info, to_log, false /*need_lock_index=false*/))) {
    LogErr(ERROR_LEVEL, ER_BINLOG_PURGE_LOGS_CALLED_WITH_FILE_NOT_IN_INDEX,
           to_log);
    goto err;
  }

  /* Only used in the "log in use" warning below. */
  no_of_log_files_to_purge = log_info.entry_index;

  if ((error = open_purge_index_file(true))) {
    LogErr(ERROR_LEVEL, ER_BINLOG_PURGE_LOGS_CANT_SYNC_INDEX_FILE);
    goto err;
  }

  /*
    File name exists in index file; delete until we find this file
    or a file that is used.
  */
  if ((error = find_log_pos(&log_info, NullS, false /*need_lock_index=false*/)))
    goto err;

  /*
    Keep going while the current entry differs from to_log. Once to_log is
    reached, the loop body runs one more time only if `included` is set;
    the assignment to exit_loop then forces the break at the end of that
    iteration.
  */
  while ((compare_log_name(to_log, log_info.log_file_name) ||
          (exit_loop = included))) {
    if (is_active(log_info.log_file_name)) {
      if (!auto_purge)
        push_warning_printf(
            thd, Sql_condition::SL_WARNING, ER_WARN_PURGE_LOG_IS_ACTIVE,
            ER_THD(thd, ER_WARN_PURGE_LOG_IS_ACTIVE), log_info.log_file_name);
      break;
    }

    if ((no_of_threads_locking_log = log_in_use(log_info.log_file_name))) {
      if (!auto_purge)
        push_warning_printf(thd, Sql_condition::SL_WARNING,
                            ER_WARN_PURGE_LOG_IN_USE,
                            ER_THD(thd, ER_WARN_PURGE_LOG_IN_USE),
                            log_info.log_file_name, no_of_threads_locking_log,
                            no_of_log_files_purged, no_of_log_files_to_purge);
      break;
    }
    no_of_log_files_purged++;

    /* Nothing is deleted yet; the name is only recorded for later. */
    if ((error = register_purge_index_entry(log_info.log_file_name))) {
      LogErr(ERROR_LEVEL, ER_BINLOG_PURGE_LOGS_CANT_COPY_TO_REGISTER_FILE,
             log_info.log_file_name);
      goto err;
    }

    if (find_next_log(&log_info, false /*need_lock_index=false*/) || exit_loop)
      break;
  }

  DBUG_EXECUTE_IF("crash_purge_before_update_index", DBUG_SUICIDE(););

  if ((error = sync_purge_index_file())) {
    LogErr(ERROR_LEVEL, ER_BINLOG_PURGE_LOGS_CANT_FLUSH_REGISTER_FILE);
    goto err;
  }

  /* We know how many files to delete. Update index file. */
  if ((error = remove_logs_from_index(&log_info, need_update_threads))) {
    LogErr(ERROR_LEVEL, ER_BINLOG_PURGE_LOGS_CANT_UPDATE_INDEX_FILE);
    goto err;
  }

  // Update gtid_state->lost_gtids
  if (!is_relay_log) {
    global_sid_lock->wrlock();
    error = init_gtid_sets(
        nullptr, const_cast<Gtid_set *>(gtid_state->get_lost_gtids()),
        opt_master_verify_checksum, false /*false=don't need lock*/,
        nullptr /*trx_parser*/, nullptr /*partial_trx*/);
    global_sid_lock->unlock();
    if (error) goto err;
  }

  DBUG_EXECUTE_IF("crash_purge_critical_after_update_index", DBUG_SUICIDE(););

  /* Reached on success as well: the code below is the common cleanup. */
err:

  int error_index = 0, close_error_index = 0;
  /*
    Read each entry from purge_index_file and delete the file. This is
    skipped if an error occurred above or the purge index file is not open.
  */
  if (!error && is_inited_purge_index_file() &&
      (error_index = purge_index_entry(thd, decrease_log_space,
                                       false /*need_lock_index=false*/)))
    LogErr(ERROR_LEVEL, ER_BINLOG_PURGE_LOGS_FAILED_TO_PURGE_LOG);

  close_error_index = close_purge_index_file();

  DBUG_EXECUTE_IF("crash_purge_non_critical_after_update_index",
                  DBUG_SUICIDE(););

  if (need_lock_index) mysql_mutex_unlock(&LOCK_index);

  /*
    Error codes from purge logs take precedence.
    Then error codes from purging the index entry.
    Finally, error codes from closing the purge index file.
  */
  error = error ? error : (error_index ? error_index : close_error_index);

  return error;
}
5964 
set_purge_index_file_name(const char * base_file_name)5965 int MYSQL_BIN_LOG::set_purge_index_file_name(const char *base_file_name) {
5966   int error = 0;
5967   DBUG_TRACE;
5968   if (fn_format(
5969           purge_index_file_name, base_file_name, mysql_data_home, ".~rec~",
5970           MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH | MY_REPLACE_EXT)) == nullptr) {
5971     error = 1;
5972     LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_SET_PURGE_INDEX_FILE_NAME);
5973   }
5974   return error;
5975 }
5976 
open_purge_index_file(bool destroy)5977 int MYSQL_BIN_LOG::open_purge_index_file(bool destroy) {
5978   int error = 0;
5979   File file = -1;
5980 
5981   DBUG_TRACE;
5982 
5983   if (destroy) close_purge_index_file();
5984 
5985   if (!my_b_inited(&purge_index_file)) {
5986     myf flags = MY_WME | MY_NABP | MY_WAIT_IF_FULL;
5987     if (is_relay_log) flags = flags | MY_REPORT_WAITING_IF_FULL;
5988 
5989     if ((file = my_open(purge_index_file_name, O_RDWR | O_CREAT, MYF(MY_WME))) <
5990             0 ||
5991         init_io_cache(&purge_index_file, file, IO_SIZE,
5992                       (destroy ? WRITE_CACHE : READ_CACHE), 0, false, flags)) {
5993       error = 1;
5994       LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_OPEN_REGISTER_FILE);
5995     }
5996   }
5997   return error;
5998 }
5999 
close_purge_index_file()6000 int MYSQL_BIN_LOG::close_purge_index_file() {
6001   int error = 0;
6002 
6003   DBUG_TRACE;
6004 
6005   if (my_b_inited(&purge_index_file)) {
6006     end_io_cache(&purge_index_file);
6007     error = my_close(purge_index_file.file, MYF(0));
6008   }
6009   my_delete(purge_index_file_name, MYF(0));
6010   new (&purge_index_file) IO_CACHE();
6011 
6012   return error;
6013 }
6014 
is_inited_purge_index_file()6015 bool MYSQL_BIN_LOG::is_inited_purge_index_file() {
6016   DBUG_TRACE;
6017   return my_b_inited(&purge_index_file);
6018 }
6019 
sync_purge_index_file()6020 int MYSQL_BIN_LOG::sync_purge_index_file() {
6021   int error = 0;
6022   DBUG_TRACE;
6023 
6024   if ((error = flush_io_cache(&purge_index_file)) ||
6025       (error = my_sync(purge_index_file.file, MYF(MY_WME))))
6026     return error;
6027 
6028   return error;
6029 }
6030 
register_purge_index_entry(const char * entry)6031 int MYSQL_BIN_LOG::register_purge_index_entry(const char *entry) {
6032   int error = 0;
6033   DBUG_TRACE;
6034 
6035   if ((error = my_b_write(&purge_index_file, (const uchar *)entry,
6036                           strlen(entry))) ||
6037       (error = my_b_write(&purge_index_file, (const uchar *)"\n", 1)))
6038     return error;
6039 
6040   return error;
6041 }
6042 
register_create_index_entry(const char * entry)6043 int MYSQL_BIN_LOG::register_create_index_entry(const char *entry) {
6044   DBUG_TRACE;
6045   return register_purge_index_entry(entry);
6046 }
6047 
/*
  Read the purge index file from its beginning and delete from disk every
  file named in it that is no longer listed in the index file.

  thd:                used for pushing warnings; may be null, in which case
                      some conditions are only written to the error log.
  decrease_log_space: if not null, decremented by the size of each file
                      that was deleted.
  need_lock_index:    passed on to find_log_pos().

  Returns 0 on success. On failure returns LOG_INFO_FATAL, LOG_INFO_EMFILE,
  the purge index file cache error, or the error from reinit_io_cache() or
  find_log_pos(). Files that do not exist (ENOENT) only produce a warning.
*/
int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *decrease_log_space,
                                     bool need_lock_index) {
  MY_STAT s;
  int error = 0;
  LOG_INFO log_info;
  LOG_INFO check_log_info;

  DBUG_TRACE;

  DBUG_ASSERT(my_b_inited(&purge_index_file));

  /* Switch the cache to read mode, positioned at offset 0. */
  if ((error =
           reinit_io_cache(&purge_index_file, READ_CACHE, 0, false, false))) {
    LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_REINIT_REGISTER_FILE);
    goto err;
  }

  for (;;) {
    size_t length;

    if ((length = my_b_gets(&purge_index_file, log_info.log_file_name,
                            FN_REFLEN)) <= 1) {
      if (purge_index_file.error) {
        error = purge_index_file.error;
        LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_READ_REGISTER_FILE, error);
        goto err;
      }

      /* Reached EOF */
      break;
    }

    /* Get rid of the trailing '\n' */
    log_info.log_file_name[length - 1] = 0;

    if (!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s, MYF(0))) {
      if (my_errno() == ENOENT) {
        /*
          It's not fatal if we can't stat a log file that does not exist;
          If we could not stat, we won't delete.
        */
        if (thd) {
          push_warning_printf(
              thd, Sql_condition::SL_WARNING, ER_LOG_PURGE_NO_FILE,
              ER_THD(thd, ER_LOG_PURGE_NO_FILE), log_info.log_file_name);
        }
        LogErr(INFORMATION_LEVEL, ER_CANT_STAT_FILE, log_info.log_file_name);
        set_my_errno(0);
      } else {
        /*
          Other than ENOENT are fatal
        */
        if (thd) {
          push_warning_printf(thd, Sql_condition::SL_WARNING,
                              ER_BINLOG_PURGE_FATAL_ERR,
                              "a problem with getting info on being purged %s; "
                              "consider examining correspondence "
                              "of your binlog index file "
                              "to the actual binlog files",
                              log_info.log_file_name);
        } else {
          LogErr(INFORMATION_LEVEL,
                 ER_BINLOG_CANT_DELETE_LOG_FILE_DOES_INDEX_MATCH_FILES,
                 log_info.log_file_name);
        }
        error = LOG_INFO_FATAL;
        goto err;
      }
    } else {
      /*
        The file exists. It is deleted only if find_log_pos() reports
        LOG_INFO_EOF for it; if the lookup succeeds (returns 0) the file is
        left alone.
      */
      if ((error = find_log_pos(&check_log_info, log_info.log_file_name,
                                need_lock_index))) {
        if (error != LOG_INFO_EOF) {
          if (thd) {
            push_warning_printf(thd, Sql_condition::SL_WARNING,
                                ER_BINLOG_PURGE_FATAL_ERR,
                                "a problem with deleting %s and "
                                "reading the binlog index file",
                                log_info.log_file_name);
          } else {
            LogErr(INFORMATION_LEVEL,
                   ER_BINLOG_CANT_DELETE_FILE_AND_READ_BINLOG_INDEX,
                   log_info.log_file_name);
          }
          goto err;
        }

        /* LOG_INFO_EOF is the expected outcome here, not an error. */
        error = 0;
        if (!need_lock_index) {
          /*
            This is to avoid triggering an error in NDB.

            @todo: This is weird, what does NDB errors have to do with
            need_lock_index? Explain better or refactor /Sven
          */
          ha_binlog_index_purge_file(current_thd, log_info.log_file_name);
        }

        DBUG_PRINT("info", ("purging %s", log_info.log_file_name));
        if (!mysql_file_delete(key_file_binlog, log_info.log_file_name,
                               MYF(0))) {
          DBUG_EXECUTE_IF("wait_in_purge_index_entry", {
            const char action[] =
                "now SIGNAL in_purge_index_entry WAIT_FOR go_ahead_sql";
            DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(action)));
            DBUG_SET("-d,wait_in_purge_index_entry");
          };);

          /* Account for the space freed, using the size from the stat. */
          if (decrease_log_space) *decrease_log_space -= s.st_size;
        } else {
          if (my_errno() == ENOENT) {
            /* The file vanished between the stat and the delete. */
            if (thd) {
              push_warning_printf(
                  thd, Sql_condition::SL_WARNING, ER_LOG_PURGE_NO_FILE,
                  ER_THD(thd, ER_LOG_PURGE_NO_FILE), log_info.log_file_name);
            }
            LogErr(INFORMATION_LEVEL, ER_BINLOG_CANT_DELETE_FILE,
                   log_info.log_file_name);
            set_my_errno(0);
          } else {
            if (thd) {
              push_warning_printf(thd, Sql_condition::SL_WARNING,
                                  ER_BINLOG_PURGE_FATAL_ERR,
                                  "a problem with deleting %s; "
                                  "consider examining correspondence "
                                  "of your binlog index file "
                                  "to the actual binlog files",
                                  log_info.log_file_name);
            } else {
              LogErr(INFORMATION_LEVEL,
                     ER_BINLOG_CANT_DELETE_LOG_FILE_DOES_INDEX_MATCH_FILES,
                     log_info.log_file_name);
            }
            /* EMFILE gets its own return code; everything else is fatal. */
            if (my_errno() == EMFILE) {
              DBUG_PRINT("info", ("my_errno: %d, set ret = LOG_INFO_EMFILE",
                                  my_errno()));
              error = LOG_INFO_EMFILE;
              goto err;
            }
            error = LOG_INFO_FATAL;
            goto err;
          }
        }
      }
    }
  }

err:
  return error;
}
6197 
/**
  Remove all logs before the given file date from disk and from the
  index file.

  LOCK_index is held for the whole call. The index is scanned from its
  first entry to find the last file that may be purged; that file is then
  passed to purge_logs() with included=true.

  @param purge_time	Delete all log files before given date.
  @param auto_purge     True if this is an automatic purge.

  @note
    If any of the logs before the deleted one is in use,
    only purge logs up to this one.

  @retval
    0				ok
  @retval
    LOG_INFO_PURGE_NO_ROTATE	Binary file that can't be rotated
    LOG_INFO_FATAL              if any other than ENOENT error from
                                mysql_file_stat() or mysql_file_delete()
*/

int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time, bool auto_purge) {
  int error;
  int no_of_threads_locking_log = 0, no_of_log_files_purged = 0;
  bool log_is_active = false, log_is_in_use = false;
  char to_log[FN_REFLEN], copy_log_in_use[FN_REFLEN];
  LOG_INFO log_info;
  MY_STAT stat_area;
  THD *thd = current_thd;

  DBUG_TRACE;

  mysql_mutex_lock(&LOCK_index);
  /* An empty to_log means "nothing to purge". */
  to_log[0] = 0;

  if ((error = find_log_pos(&log_info, NullS, false /*need_lock_index=false*/)))
    goto err;

  /* Walk the index from the first entry, stopping at the active log. */
  while (!(log_is_active = is_active(log_info.log_file_name))) {
    if (!mysql_file_stat(m_key_file_log, log_info.log_file_name, &stat_area,
                         MYF(0))) {
      if (my_errno() == ENOENT) {
        /*
          It's not fatal if we can't stat a log file that does not exist.
        */
        set_my_errno(0);
      } else {
        /*
          Other than ENOENT are fatal
        */
        if (thd) {
          push_warning_printf(thd, Sql_condition::SL_WARNING,
                              ER_BINLOG_PURGE_FATAL_ERR,
                              "a problem with getting info on being purged %s; "
                              "consider examining correspondence "
                              "of your binlog index file "
                              "to the actual binlog files",
                              log_info.log_file_name);
        } else {
          LogErr(INFORMATION_LEVEL, ER_BINLOG_FAILED_TO_DELETE_LOG_FILE,
                 log_info.log_file_name);
        }
        error = LOG_INFO_FATAL;
        goto err;
      }
    }
    /* check if the binary log file is older than the purge_time
       if yes check if it is in use, if not in use then add
       it in the list of binary log files to be purged.
    */
    else if (stat_area.st_mtime < purge_time) {
      if ((no_of_threads_locking_log = log_in_use(log_info.log_file_name))) {
        /* The in-use file is remembered for the warning only when manual. */
        if (!auto_purge) {
          log_is_in_use = true;
          strcpy(copy_log_in_use, log_info.log_file_name);
        }
        break;
      }
      /*
        NOTE(review): the bound is the size of log_info.log_file_name, not
        of to_log (FN_REFLEN) - confirm that both have the same size.
      */
      strmake(to_log, log_info.log_file_name,
              sizeof(log_info.log_file_name) - 1);
      no_of_log_files_purged++;
    } else
      break;
    if (find_next_log(&log_info, false /*need_lock_index=false*/)) break;
  }

  if (log_is_active) {
    if (!auto_purge)
      push_warning_printf(
          thd, Sql_condition::SL_WARNING, ER_WARN_PURGE_LOG_IS_ACTIVE,
          ER_THD(thd, ER_WARN_PURGE_LOG_IS_ACTIVE), log_info.log_file_name);
  }

  if (log_is_in_use) {
    /*
      Continue from the in-use file to count how many files would have
      qualified for the purge; the count is only reported in the warning.
      The scan ends at the member log_file_name, at the first file that is
      not older than purge_time, or at the end of the index.
    */
    int no_of_log_files_to_purge = no_of_log_files_purged + 1;
    while (strcmp(log_file_name, log_info.log_file_name)) {
      if (mysql_file_stat(m_key_file_log, log_info.log_file_name, &stat_area,
                          MYF(0))) {
        if (stat_area.st_mtime < purge_time)
          no_of_log_files_to_purge++;
        else
          break;
      }
      if (find_next_log(&log_info, false /*need_lock_index=false*/)) {
        no_of_log_files_to_purge++;
        break;
      }
    }

    push_warning_printf(thd, Sql_condition::SL_WARNING,
                        ER_WARN_PURGE_LOG_IN_USE,
                        ER_THD(thd, ER_WARN_PURGE_LOG_IN_USE), copy_log_in_use,
                        no_of_threads_locking_log, no_of_log_files_purged,
                        no_of_log_files_to_purge);
  }

  /* LOCK_index is already held, hence need_lock_index=false. */
  error = (to_log[0] ? purge_logs(to_log, true, false /*need_lock_index=false*/,
                                  true /*need_update_threads=true*/,
                                  (ulonglong *)nullptr, auto_purge)
                     : 0);

err:
  mysql_mutex_unlock(&LOCK_index);
  return error;
}
6321 
6322 /**
6323   Create a new log file name.
6324 
6325   @param[out] buf       Buffer allocated with at least FN_REFLEN bytes where
6326                         new name is stored.
6327   @param      log_ident Identity of the binary/relay log.
6328 
6329   @note
6330     If file name will be longer then FN_REFLEN it will be truncated
6331 */
6332 
make_log_name(char * buf,const char * log_ident)6333 void MYSQL_BIN_LOG::make_log_name(char *buf, const char *log_ident) {
6334   size_t dir_len = dirname_length(log_file_name);
6335   if (dir_len >= FN_REFLEN) dir_len = FN_REFLEN - 1;
6336   my_stpnmov(buf, log_file_name, dir_len);
6337   strmake(buf + dir_len, log_ident, FN_REFLEN - dir_len - 1);
6338 }
6339 
6340 /**
6341   Check if we are writing/reading to the given log file.
6342 */
6343 
is_active(const char * log_file_name_arg)6344 bool MYSQL_BIN_LOG::is_active(const char *log_file_name_arg) {
6345   return !compare_log_name(log_file_name, log_file_name_arg);
6346 }
6347 
inc_prep_xids(THD * thd)6348 void MYSQL_BIN_LOG::inc_prep_xids(THD *thd) {
6349   DBUG_TRACE;
6350 #ifndef DBUG_OFF
6351   int result = ++m_atomic_prep_xids;
6352   DBUG_PRINT("debug", ("m_atomic_prep_xids: %d", result));
6353 #else
6354   m_atomic_prep_xids++;
6355 #endif
6356   thd->get_transaction()->m_flags.xid_written = true;
6357 }
6358 
dec_prep_xids(THD * thd)6359 void MYSQL_BIN_LOG::dec_prep_xids(THD *thd) {
6360   DBUG_TRACE;
6361   int32 result = --m_atomic_prep_xids;
6362   DBUG_PRINT("debug", ("m_atomic_prep_xids: %d", result));
6363   thd->get_transaction()->m_flags.xid_written = false;
6364   if (result == 0) {
6365     mysql_mutex_lock(&LOCK_xids);
6366     mysql_cond_signal(&m_prep_xids_cond);
6367     mysql_mutex_unlock(&LOCK_xids);
6368   }
6369 }
6370 
6371 /*
6372   Wrappers around new_file_impl to avoid using argument
6373   to control locking. The argument 1) less readable 2) breaks
6374   incapsulation 3) allows external access to the class without
6375   a lock (which is not possible with private new_file_without_locking
6376   method).
6377 
6378   @retval
6379     nonzero - error
6380 
6381 */
6382 
new_file(Format_description_log_event * extra_description_event)6383 int MYSQL_BIN_LOG::new_file(
6384     Format_description_log_event *extra_description_event) {
6385   return new_file_impl(true /*need_lock_log=true*/, extra_description_event);
6386 }
6387 
6388 /*
6389   @retval
6390     nonzero - error
6391 */
new_file_without_locking(Format_description_log_event * extra_description_event)6392 int MYSQL_BIN_LOG::new_file_without_locking(
6393     Format_description_log_event *extra_description_event) {
6394   return new_file_impl(false /*need_lock_log=false*/, extra_description_event);
6395 }
6396 
/**
  Start writing to a new log file or reopen the old file.

  Steps, in order: take LOCK_log (if requested), wait under LOCK_xids until
  get_prep_xids() reports no prepared XIDs, take LOCK_index, flush the
  storage engine logs, (binary log only) save the GTIDs of the last binlog
  into the table, generate the new file name, append a Rotate_log_event to
  the current file and flush it, close the current log and index, then
  reopen the index and open the new log file.

  On a failure that sets close_on_error, either
  exec_binlog_error_action_abort() is called (binlog_error_action ==
  ABORT_SERVER) or the error is logged; in both cases close() is then
  invoked on whatever was left open.

  @param need_lock_log If true, this function acquires LOCK_log;
  otherwise the caller should already have acquired it.

  @param extra_description_event The master's FDE to be written by the I/O
  thread while creating a new relay log file. This should be NULL for
  binary log files.

  @retval 0 success (also returned, without doing anything, when the log
            is not open)
  @retval nonzero - error

  @note The new file name is stored last in the index file
*/
int MYSQL_BIN_LOG::new_file_impl(
    bool need_lock_log, Format_description_log_event *extra_description_event) {
  int error = 0;
  // Set by the failure paths that require the log to be closed at "end".
  bool close_on_error = false;
  char new_name[FN_REFLEN], *new_name_ptr = nullptr, *old_name, *file_to_open;
  const size_t ERR_CLOSE_MSG_LEN = 1024;
  // Text describing the failure; only used to build the abort message.
  char close_on_error_msg[ERR_CLOSE_MSG_LEN];
  memset(close_on_error_msg, 0, sizeof close_on_error_msg);

  DBUG_TRACE;
  // Checked before any lock is taken; error is still 0 here.
  if (!is_open()) {
    DBUG_PRINT("info", ("log is closed"));
    return error;
  }

  if (need_lock_log)
    mysql_mutex_lock(&LOCK_log);
  else
    mysql_mutex_assert_owner(&LOCK_log);
  DBUG_EXECUTE_IF("semi_sync_3-way_deadlock",
                  DEBUG_SYNC(current_thd, "before_rotate_binlog"););
  mysql_mutex_lock(&LOCK_xids);
  /*
    We need to ensure that the number of prepared XIDs are 0.

    If m_atomic_prep_xids is not zero:
    - We wait for storage engine commit, hence decrease m_atomic_prep_xids
    - We keep the LOCK_log to block new transactions from being
      written to the binary log.
   */
  while (get_prep_xids() > 0) {
    mysql_cond_wait(&m_prep_xids_cond, &LOCK_xids);
  }
  mysql_mutex_unlock(&LOCK_xids);

  // From here on every exit goes through "end", which releases LOCK_index.
  mysql_mutex_lock(&LOCK_index);

  mysql_mutex_assert_owner(&LOCK_log);
  mysql_mutex_assert_owner(&LOCK_index);

  /*
    Flush the storage engine logs unless the "expire_logs_always" debug
    keyword is set. close_on_error is still false here, so a failure is
    returned to the caller without closing the log.
  */
  if (DBUG_EVALUATE_IF("expire_logs_always", 0, 1) &&
      (error = ha_flush_logs())) {
    goto end;
  }

  if (!is_relay_log) {
    /* Save set of GTIDs of the last binlog into table on binlog rotation */
    if ((error = gtid_state->save_gtids_of_last_binlog_into_table())) {
      if (error == ER_RPL_GTID_TABLE_CANNOT_OPEN) {
        /*
          The table could not be opened: only give up on the current file
          when it has already reached max_size (or the debug keyword
          simulates that); otherwise keep using it.
        */
        close_on_error =
            m_binlog_file->get_real_file_size() >=
                static_cast<my_off_t>(max_size) ||
            DBUG_EVALUATE_IF("simulate_max_binlog_size", true, false);

        if (!close_on_error) {
          LogErr(ERROR_LEVEL, ER_BINLOG_UNABLE_TO_ROTATE_GTID_TABLE_READONLY,
                 "Current binlog file was flushed to disk and will be kept in "
                 "use.");
        } else {
          snprintf(close_on_error_msg, sizeof close_on_error_msg,
                   ER_THD(current_thd, ER_RPL_GTID_TABLE_CANNOT_OPEN), "mysql",
                   "gtid_executed");

          if (binlog_error_action != ABORT_SERVER)
            LogErr(WARNING_LEVEL,
                   ER_BINLOG_UNABLE_TO_ROTATE_GTID_TABLE_READONLY,
                   "Binary logging going to be disabled.");
        }

        // Switch the debug keywords off again once they have been used.
        DBUG_EXECUTE_IF("gtid_executed_readonly",
                        { DBUG_SET("-d,gtid_executed_readonly"); });
        DBUG_EXECUTE_IF("simulate_max_binlog_size",
                        { DBUG_SET("-d,simulate_max_binlog_size"); });
      } else {
        // Any other error from saving the GTIDs always closes the log.
        close_on_error = true;
        snprintf(close_on_error_msg, sizeof close_on_error_msg, "%s",
                 ER_THD(current_thd, ER_OOM_SAVE_GTIDS));
      }
      goto end;
    }
  }

  /*
    If user hasn't specified an extension, generate a new log name
    We have to do this here and not in open as we want to store the
    new file name in the current binary log file.
  */
  new_name_ptr = new_name;
  if ((error = generate_new_name(new_name, name))) {
    // Use the old name if generation of new name fails.
    strcpy(new_name, name);
    close_on_error = true;
    snprintf(close_on_error_msg, sizeof close_on_error_msg,
             ER_THD(current_thd, ER_NO_UNIQUE_LOGFILE), name);
    // Drop the last character of the formatted message (presumably trailing
    // punctuation or a newline in the message template - TODO confirm).
    if (strlen(close_on_error_msg)) {
      close_on_error_msg[strlen(close_on_error_msg) - 1] = '\0';
    }
    goto end;
  }

  /*
    Make sure that the log_file is initialized before writing
    Rotate_log_event into it.
  */
  if (m_binlog_file->is_open()) {
    /*
      We log the whole file name for log file as the user may decide
      to change base names at some point.
    */
    Rotate_log_event r(new_name + dirname_length(new_name), 0, LOG_EVENT_OFFSET,
                       is_relay_log ? Rotate_log_event::RELAY_LOG : 0);

    // The debug keyword forces error = 1 without attempting the write.
    if (DBUG_EVALUATE_IF("fault_injection_new_file_rotate_event", (error = 1),
                         false) ||
        (error = write_event_to_binlog(&r))) {
      char errbuf[MYSYS_STRERROR_SIZE];
      DBUG_EXECUTE_IF("fault_injection_new_file_rotate_event", errno = 2;);
      close_on_error = true;
      snprintf(close_on_error_msg, sizeof close_on_error_msg,
               ER_THD(current_thd, ER_ERROR_ON_WRITE), name, errno,
               my_strerror(errbuf, sizeof(errbuf), errno));
      my_printf_error(ER_ERROR_ON_WRITE, ER_THD(current_thd, ER_ERROR_ON_WRITE),
                      MYF(ME_FATALERROR), name, errno,
                      my_strerror(errbuf, sizeof(errbuf), errno));
      goto end;
    }

    if ((error = m_binlog_file->flush())) {
      close_on_error = true;
      snprintf(close_on_error_msg, sizeof close_on_error_msg, "%s",
               "Either disk is full or file system is read only");
      goto end;
    }
  }

  DEBUG_SYNC(current_thd, "after_rotate_event_appended");

  /*
    Keep the current name buffer alive across close(): it is handed to
    open_binlog() below and released with my_free() afterwards.
  */
  old_name = name;
  name = nullptr;  // Don't free name
  close(LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX, false /*need_lock_log=false*/,
        false /*need_lock_index=false*/);

  // Apply a checksum algorithm change that was requested for this rotation.
  if (checksum_alg_reset != binary_log::BINLOG_CHECKSUM_ALG_UNDEF) {
    DBUG_ASSERT(!is_relay_log);
    DBUG_ASSERT(binlog_checksum_options != checksum_alg_reset);
    binlog_checksum_options = checksum_alg_reset;
  }
  /*
    Note that at this point, atomic_log_state != LOG_CLOSED
    (important for is_open()).
  */

  DEBUG_SYNC(current_thd, "binlog_rotate_between_close_and_open");
  /*
    new_file() is only used for rotation (in FLUSH LOGS or because size >
    max_binlog_size or max_relay_log_size).
    If this is a binary log, the Format_description_log_event at the beginning
    of the new file should have created=0 (to distinguish with the
    Format_description_log_event written at server startup, which should
    trigger temp tables deletion on slaves).
  */

  /* reopen index binlog file, BUG#34582 */
  // file_to_open tracks which file the error message below should name.
  file_to_open = index_file_name;
  error = open_index_file(index_file_name, nullptr,
                          false /*need_lock_index=false*/);
  if (!error) {
    /* reopen the binary log file. */
    file_to_open = new_name_ptr;
    error = open_binlog(old_name, new_name_ptr, max_size,
                        true /*null_created_arg=true*/,
                        false /*need_lock_index=false*/,
                        true /*need_sid_lock=true*/, extra_description_event);
  }

  /* handle reopening errors */
  if (error) {
    char errbuf[MYSYS_STRERROR_SIZE];
    my_printf_error(ER_CANT_OPEN_FILE, ER_THD(current_thd, ER_CANT_OPEN_FILE),
                    MYF(ME_FATALERROR), file_to_open, error,
                    my_strerror(errbuf, sizeof(errbuf), error));
    close_on_error = true;
    snprintf(close_on_error_msg, sizeof close_on_error_msg,
             ER_THD(current_thd, ER_CANT_OPEN_FILE), file_to_open, error,
             my_strerror(errbuf, sizeof(errbuf), error));
  }
  my_free(old_name);

end:

  if (error && close_on_error /* rotate, flush or reopen failed */) {
    /*
      Close whatever was left opened.

      We are keeping the behavior as it exists today, ie,
      we disable logging and move on (see: BUG#51014).

      TODO: as part of WL#1790 consider other approaches:
       - kill mysql (safety);
       - try multiple locations for opening a log file;
       - switch server to protected/readonly mode
       - ...
    */
    if (binlog_error_action == ABORT_SERVER) {
      // Room for close_on_error_msg plus the fixed suffix appended below.
      char abort_msg[ERR_CLOSE_MSG_LEN + 48];
      memset(abort_msg, 0, sizeof abort_msg);
      snprintf(abort_msg, sizeof abort_msg,
               "%s, while rotating the binlog. "
               "Aborting the server",
               close_on_error_msg);
      exec_binlog_error_action_abort(abort_msg);
    } else
      LogErr(ERROR_LEVEL, ER_BINLOG_CANT_OPEN_FOR_LOGGING,
             new_name_ptr != nullptr ? new_name_ptr : "new file", errno);

    close(LOG_CLOSE_INDEX, false /*need_lock_log=false*/,
          false /*need_lock_index=false*/);
  }

  // Release in reverse order; LOCK_log only if it was acquired here.
  mysql_mutex_unlock(&LOCK_index);
  if (need_lock_log) mysql_mutex_unlock(&LOCK_log);

  DEBUG_SYNC(current_thd, "after_disable_binlog");
  return error;
}
6637 
/**
  Called after an event has been written to the relay log by the IO
  thread.  This flushes and possibly syncs the file (according to the
  sync options), rotates the file if it has grown over the limit or a
  rotation was requested, and finally updates the binlog end position
  (update_binlog_end_pos()) and harvests the bytes written.

  @note The caller must hold LOCK_log before invoking this function.

  @param mi Master_info for the IO thread.

  @retval false success
  @retval true error (flush/sync failed, or the rotation reported an error)
*/
bool MYSQL_BIN_LOG::after_write_to_relay_log(Master_info *mi) {
  DBUG_TRACE;
  DBUG_PRINT("info", ("max_size: %lu", max_size));

  // Check pre-conditions
  mysql_mutex_assert_owner(&LOCK_log);
  DBUG_ASSERT(is_relay_log);

  /*
    We allow the relay log rotation by relay log size
    only if the trx parser is not inside a transaction.
  */
  bool can_rotate = mi->transaction_parser.is_not_inside_transaction();

#ifndef DBUG_OFF
  // Debug builds only: trace that a size-based rotation is being postponed.
  if (m_binlog_file->get_real_file_size() >
          DBUG_EVALUATE_IF("rotate_slave_debug_group", 500, max_size) &&
      !can_rotate) {
    DBUG_PRINT("info", ("Postponing the rotation by size waiting for "
                        "the end of the current transaction."));
  }
#endif

  // Flush and sync
  bool error = flush_and_sync(false);
  if (error) {
    mi->report(ERROR_LEVEL, ER_SLAVE_RELAY_LOG_WRITE_FAILURE,
               ER_THD(current_thd, ER_SLAVE_RELAY_LOG_WRITE_FAILURE),
               "failed to flush event to relay log file");
    truncate_relaylog_file(mi, atomic_binlog_end_pos);
  } else {
    if (can_rotate) {
      // mi->data_lock covers the GTID and monitoring updates below; it is
      // released before the rotation check.
      mysql_mutex_lock(&mi->data_lock);
      /*
        If the last event of the transaction has been flushed, we can add
        the GTID (if it is not empty) to the logged set, or else it will
        not be available in the Previous GTIDs of the next relay log file
        if we are going to rotate the relay log.
      */
      const Gtid *last_gtid_queued = mi->get_queueing_trx_gtid();
      if (!last_gtid_queued->is_empty()) {
        mi->rli->get_sid_lock()->rdlock();
        DBUG_SIGNAL_WAIT_FOR(current_thd, "updating_received_transaction_set",
                             "reached_updating_received_transaction_set",
                             "continue_updating_received_transaction_set");
        mi->rli->add_logged_gtid(last_gtid_queued->sidno,
                                 last_gtid_queued->gno);
        mi->rli->get_sid_lock()->unlock();
      }

      if (mi->is_queueing_trx()) {
        mi->finished_queueing();

        // Only "last" is consumed below; "processing" is filled in because
        // copy_info_to() takes both destinations.
        Trx_monitoring_info processing;
        Trx_monitoring_info last;
        mi->get_gtid_monitoring_info()->copy_info_to(&processing, &last);

        // update the compression information
        binlog::global_context.monitoring_context()
            .transaction_compression()
            .update(binlog::monitoring::log_type::RELAY, last.compression_type,
                    last.gtid, last.end_time, last.compressed_bytes,
                    last.uncompressed_bytes,
                    mi->rli->get_gtid_set()->get_sid_map());
      }
      mysql_mutex_unlock(&mi->data_lock);

      /*
        If relay log is too big, rotate. But only if not in the middle of a
        transaction when GTIDs are enabled.

        Also rotate, if a deferred flush request has been placed.

        We now try to mimic the following master binlog behavior: "A transaction
        is written in one chunk to the binary log, so it is never split between
        several binary logs. Therefore, if you have big transactions, you might
        see binary log files larger than max_binlog_size."
      */
      if (m_binlog_file->get_real_file_size() >
              DBUG_EVALUATE_IF("rotate_slave_debug_group", 500, max_size) ||
          mi->is_rotate_requested()) {
        // LOCK_log is already held by the caller, hence the non-locking
        // variant. Rotate requests are cleared whether or not it succeeded.
        error = new_file_without_locking(mi->get_mi_description_event());
        mi->clear_rotate_requests();
      }
    }
  }

  // Executed on both the success and the error path.
  lock_binlog_end_pos();
  mi->rli->ign_master_log_name_end[0] = 0;
  update_binlog_end_pos(false /*need_lock*/);
  harvest_bytes_written(mi->rli, true /*need_log_space_lock=true*/);
  unlock_binlog_end_pos();

  return error;
}
6746 
write_event(Log_event * ev,Master_info * mi)6747 bool MYSQL_BIN_LOG::write_event(Log_event *ev, Master_info *mi) {
6748   DBUG_TRACE;
6749 
6750   DBUG_EXECUTE_IF("fail_to_write_ignored_event_to_relay_log", { return true; });
6751   // check preconditions
6752   DBUG_ASSERT(is_relay_log);
6753 
6754   mysql_mutex_assert_owner(&LOCK_log);
6755 
6756   // write data
6757   bool error = false;
6758   if (!binary_event_serialize(ev, m_binlog_file)) {
6759     bytes_written += ev->common_header->data_written;
6760     error = after_write_to_relay_log(mi);
6761   } else {
6762     mi->report(ERROR_LEVEL, ER_SLAVE_RELAY_LOG_WRITE_FAILURE,
6763                ER_THD(current_thd, ER_SLAVE_RELAY_LOG_WRITE_FAILURE),
6764                "failed to write event to the relay log file");
6765     truncate_relaylog_file(mi, atomic_binlog_end_pos);
6766     error = true;
6767   }
6768 
6769   return error;
6770 }
6771 
write_buffer(const char * buf,uint len,Master_info * mi)6772 bool MYSQL_BIN_LOG::write_buffer(const char *buf, uint len, Master_info *mi) {
6773   DBUG_TRACE;
6774 
6775   // check preconditions
6776   DBUG_ASSERT(is_relay_log);
6777   mysql_mutex_assert_owner(&LOCK_log);
6778 
6779   // write data
6780   bool error = false;
6781   if (m_binlog_file->write(pointer_cast<const uchar *>(buf), len) == 0) {
6782     bytes_written += len;
6783     error = after_write_to_relay_log(mi);
6784   } else {
6785     mi->report(ERROR_LEVEL, ER_SLAVE_RELAY_LOG_WRITE_FAILURE,
6786                ER_THD(current_thd, ER_SLAVE_RELAY_LOG_WRITE_FAILURE),
6787                "failed to write event to the relay log file");
6788     truncate_relaylog_file(mi, atomic_binlog_end_pos);
6789     error = true;
6790   }
6791 
6792   return error;
6793 }
6794 
flush()6795 bool MYSQL_BIN_LOG::flush() {
6796   return m_binlog_file->is_open() && m_binlog_file->flush();
6797 }
6798 
flush_and_sync(const bool force)6799 bool MYSQL_BIN_LOG::flush_and_sync(const bool force) {
6800   mysql_mutex_assert_owner(&LOCK_log);
6801 
6802   if (m_binlog_file->flush()) return true;
6803 
6804   std::pair<bool, bool> result = sync_binlog_file(force);
6805 
6806   return result.first;
6807 }
6808 
start_union_events(THD * thd,query_id_t query_id_param)6809 void MYSQL_BIN_LOG::start_union_events(THD *thd, query_id_t query_id_param) {
6810   DBUG_ASSERT(!thd->binlog_evt_union.do_union);
6811   thd->binlog_evt_union.do_union = true;
6812   thd->binlog_evt_union.unioned_events = false;
6813   thd->binlog_evt_union.unioned_events_trans = false;
6814   thd->binlog_evt_union.first_query_id = query_id_param;
6815 }
6816 
stop_union_events(THD * thd)6817 void MYSQL_BIN_LOG::stop_union_events(THD *thd) {
6818   DBUG_ASSERT(thd->binlog_evt_union.do_union);
6819   thd->binlog_evt_union.do_union = false;
6820 }
6821 
is_query_in_union(THD * thd,query_id_t query_id_param)6822 bool MYSQL_BIN_LOG::is_query_in_union(THD *thd, query_id_t query_id_param) {
6823   return (thd->binlog_evt_union.do_union &&
6824           query_id_param >= thd->binlog_evt_union.first_query_id);
6825 }
6826 
6827 /*
6828   Updates thd's position-of-next-event variables
6829   after a *real* write a file.
6830  */
update_thd_next_event_pos(THD * thd)6831 void MYSQL_BIN_LOG::update_thd_next_event_pos(THD *thd) {
6832   if (likely(thd != nullptr)) {
6833     thd->set_next_event_pos(log_file_name, m_binlog_file->position());
6834   }
6835 }
6836 
6837 /*
6838   Moves the last bunch of rows from the pending Rows event to a cache (either
6839   transactional cache if is_transaction is @c true, or the non-transactional
6840   cache otherwise. Sets a new pending event.
6841 
6842   @param thd               a pointer to the user thread.
6843   @param evt               a pointer to the row event.
6844   @param is_transactional  @c true indicates a transactional cache,
6845                            otherwise @c false a non-transactional.
6846 */
flush_and_set_pending_rows_event(THD * thd,Rows_log_event * event,bool is_transactional)6847 int MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
6848                                                     Rows_log_event *event,
6849                                                     bool is_transactional) {
6850   DBUG_TRACE;
6851   DBUG_ASSERT(mysql_bin_log.is_open());
6852   DBUG_PRINT("enter", ("event: %p", event));
6853 
6854   int error = 0;
6855   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(thd);
6856 
6857   DBUG_ASSERT(cache_mngr);
6858 
6859   binlog_cache_data *cache_data =
6860       cache_mngr->get_binlog_cache_data(is_transactional);
6861 
6862   DBUG_PRINT("info", ("cache_mngr->pending(): %p", cache_data->pending()));
6863 
6864   if (Rows_log_event *pending = cache_data->pending()) {
6865     /*
6866       Write pending event to the cache.
6867     */
6868     if (cache_data->write_event(pending)) {
6869       report_cache_write_error(thd, is_transactional);
6870       if (check_write_error(thd) && cache_data &&
6871           stmt_cannot_safely_rollback(thd))
6872         cache_data->set_incident();
6873       delete pending;
6874       cache_data->set_pending(nullptr);
6875       return 1;
6876     }
6877 
6878     delete pending;
6879   }
6880 
6881   cache_data->set_pending(event);
6882 
6883   return error;
6884 }
6885 
/**
  Write an event to the binary log cache.

  If the session is collecting events for a union
  (binlog_evt_union.do_union), the event is only recorded as having happened
  and nothing is written. Otherwise any pending rows event is flushed first;
  then, if the log is open and the event is not filtered out, the context
  events (Intvar/Rand/User_var, only when the statement is not logged in row
  format) and the event itself are written to the cache selected by
  event_info->is_using_trans_cache().

  @param event_info Event to write; its thd member supplies the session.

  @retval false success, or the event was deliberately skipped
  @retval true  error; also returned when the log is not open, because
                error starts out as true and is only cleared after a
                successful write
*/

bool MYSQL_BIN_LOG::write_event(Log_event *event_info) {
  THD *thd = event_info->thd;
  bool error = true;
  DBUG_TRACE;

  // NOTE(review): thd is dereferenced here unconditionally, so the later
  // "thd &&" / "if (thd)" checks cannot be what protects against null.
  if (thd->binlog_evt_union.do_union) {
    /*
      In Stored function; Remember that function call caused an update.
      We will log the function call to the binary log on function exit
    */
    thd->binlog_evt_union.unioned_events = true;
    thd->binlog_evt_union.unioned_events_trans |=
        event_info->is_using_trans_cache();
    return false;
  }

  /*
    We only end the statement if we are in a top-level statement.  If
    we are inside a stored function, we do not end the statement since
    this will close all tables on the slave. But there can be a special case
    where we are inside a stored function/trigger and a SAVEPOINT is being
    set in side the stored function/trigger. This SAVEPOINT execution will
    force the pending event to be flushed without an STMT_END_F flag. This
    will result in a case where following DMLs will be considered as part of
    same statement and result in data loss on slave. Hence in this case we
    force the end_stmt to be true.
  */
  bool const end_stmt =
      (thd->in_sub_stmt && thd->lex->sql_command == SQLCOM_SAVEPOINT)
          ? true
          : (thd->locked_tables_mode && thd->lex->requires_prelocking());
  if (thd->binlog_flush_pending_rows_event(end_stmt,
                                           event_info->is_using_trans_cache()))
    return error;

  /*
     In most cases this is only called if 'is_open()' is true; in fact this is
     mostly called if is_open() *was* true a few instructions before, but it
     could have changed since.
  */
  if (likely(is_open())) {
    /*
      In the future we need to add to the following if tests like
      "do the involved tables match (to be implemented)
      binlog_[wild_]{do|ignore}_table?" (WL#1049)"
    */
    const char *local_db = event_info->get_db();
    // Skip (and report success) when binary logging is off for the session,
    // or when the database filter rejects the event; SAVEPOINT and ROLLBACK
    // TO SAVEPOINT statements and "no filter" events bypass the filter.
    if ((thd && !(thd->variables.option_bits & OPTION_BIN_LOG)) ||
        (thd->lex->sql_command != SQLCOM_ROLLBACK_TO_SAVEPOINT &&
         thd->lex->sql_command != SQLCOM_SAVEPOINT &&
         (!event_info->is_no_filter_event() &&
          !binlog_filter->db_ok(local_db))))
      return false;

    DBUG_ASSERT(event_info->is_using_trans_cache() ||
                event_info->is_using_stmt_cache());

    if (binlog_start_trans_and_stmt(thd, event_info)) return error;

    // These three are declared before the first "goto err" because the err
    // label below uses them.
    bool is_trans_cache = event_info->is_using_trans_cache();
    binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
    binlog_cache_data *cache_data =
        cache_mngr->get_binlog_cache_data(is_trans_cache);

    DBUG_PRINT("info", ("event type: %d", event_info->get_type_code()));

    /*
       No check for auto events flag here - this write method should
       never be called if auto-events are enabled.

       Write first log events which describe the 'run environment'
       of the SQL command. If row-based binlogging, Insert_id, Rand
       and other kind of "setting context" events are not needed.
    */
    if (thd) {
      if (!thd->is_current_stmt_binlog_format_row()) {
        if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt) {
          Intvar_log_event e(
              thd, (uchar)binary_log::Intvar_event::LAST_INSERT_ID_EVENT,
              thd->first_successful_insert_id_in_prev_stmt_for_binlog,
              event_info->event_cache_type, event_info->event_logging_type);
          if (cache_data->write_event(&e)) goto err;
        }
        if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0) {
          DBUG_PRINT(
              "info",
              ("number of auto_inc intervals: %u",
               thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements()));
          Intvar_log_event e(
              thd, (uchar)binary_log::Intvar_event::INSERT_ID_EVENT,
              thd->auto_inc_intervals_in_cur_stmt_for_binlog.minimum(),
              event_info->event_cache_type, event_info->event_logging_type);
          if (cache_data->write_event(&e)) goto err;
        }
        if (thd->rand_used) {
          Rand_log_event e(thd, thd->rand_saved_seed1, thd->rand_saved_seed2,
                           event_info->event_cache_type,
                           event_info->event_logging_type);
          if (cache_data->write_event(&e)) goto err;
        }
        if (!thd->user_var_events.empty()) {
          // One User_var_log_event per recorded user variable.
          for (size_t i = 0; i < thd->user_var_events.size(); i++) {
            Binlog_user_var_event *user_var_event = thd->user_var_events[i];

            /* setting flags for user var log event */
            uchar flags = User_var_log_event::UNDEF_F;
            if (user_var_event->unsigned_flag)
              flags |= User_var_log_event::UNSIGNED_F;

            User_var_log_event e(
                thd, user_var_event->user_var_event->entry_name.ptr(),
                user_var_event->user_var_event->entry_name.length(),
                user_var_event->value, user_var_event->length,
                user_var_event->type, user_var_event->charset_number, flags,
                event_info->event_cache_type, event_info->event_logging_type);
            if (cache_data->write_event(&e)) goto err;
          }
        }
      }
    }

    /*
      Write the event.
    */
    if (cache_data->write_event(event_info)) goto err;

    // Debug-only fault injection: behave as if the write had failed.
    if (DBUG_EVALUATE_IF("injecting_fault_writing", 1, 0)) goto err;

    /*
      After writing the event, if the trx-cache was used and any unsafe
      change was written into it, the cache is marked as cannot safely
      roll back.
    */
    if (is_trans_cache && stmt_cannot_safely_rollback(thd))
      cache_mngr->trx_cache.set_cannot_rollback();

    error = false;

  err:
    // Reached on success as well; the body only runs when error is still
    // true, i.e. one of the writes above failed.
    if (error) {
      report_cache_write_error(thd, is_trans_cache);
      if (check_write_error(thd) && cache_data &&
          stmt_cannot_safely_rollback(thd))
        cache_data->set_incident();
    }
  }

  return error;
}
7039 
7040 /**
7041   The method executes rotation when LOCK_log is already acquired
7042   by the caller.
7043 
7044   @param force_rotate  caller can request the log rotation
7045   @param check_purge   is set to true if rotation took place
7046 
7047   @note
7048     If rotation fails, for instance the server was unable
7049     to create a new log file, we still try to write an
7050     incident event to the current log.
7051 
7052   @note The caller must hold LOCK_log when invoking this function.
7053 
7054   @retval
7055     nonzero - error in rotating routine.
7056 */
rotate(bool force_rotate,bool * check_purge)7057 int MYSQL_BIN_LOG::rotate(bool force_rotate, bool *check_purge) {
7058   int error = 0;
7059   DBUG_TRACE;
7060 
7061   DBUG_ASSERT(!is_relay_log);
7062   mysql_mutex_assert_owner(&LOCK_log);
7063 
7064   *check_purge = false;
7065 
7066   if (DBUG_EVALUATE_IF("force_rotate", 1, 0) || force_rotate ||
7067       (m_binlog_file->get_real_file_size() >= (my_off_t)max_size) ||
7068       DBUG_EVALUATE_IF("simulate_max_binlog_size", true, false)) {
7069     error = new_file_without_locking(nullptr);
7070     *check_purge = true;
7071   }
7072   return error;
7073 }
7074 
7075 /**
7076   The method executes logs purging routine.
7077 */
purge()7078 void MYSQL_BIN_LOG::purge() {
7079   if (expire_logs_days || binlog_expire_logs_seconds) {
7080     DEBUG_SYNC(current_thd, "at_purge_logs_before_date");
7081     time_t purge_time = 0;
7082 
7083     if (binlog_expire_logs_seconds) {
7084       purge_time = my_time(0) - binlog_expire_logs_seconds;
7085     } else
7086       purge_time = my_time(0) - expire_logs_days * 24 * 60 * 60;
7087 
7088     DBUG_EXECUTE_IF("expire_logs_always", { purge_time = my_time(0); });
7089     if (purge_time >= 0) {
7090       Is_instance_backup_locked_result is_instance_locked =
7091           is_instance_backup_locked(current_thd);
7092 
7093       if (is_instance_locked == Is_instance_backup_locked_result::OOM) {
7094         exec_binlog_error_action_abort(
7095             "Out of memory happened while checking if "
7096             "instance was locked for backup");
7097       }
7098       if (is_instance_locked == Is_instance_backup_locked_result::NOT_LOCKED) {
7099         /*
7100           Flush logs for storage engines, so that the last transaction
7101           is persisted inside storage engines.
7102         */
7103         ha_flush_logs();
7104         purge_logs_before_date(purge_time, true);
7105       }
7106     }
7107   }
7108 }
7109 
7110 /**
7111   Execute a FLUSH LOGS statement.
7112 
7113   The method is a shortcut of @c rotate() and @c purge().
7114   LOCK_log is acquired prior to rotate and is released after it.
7115 
7116   @param thd           Current session.
7117   @param force_rotate  caller can request the log rotation
7118 
7119   @retval
7120     nonzero - error in rotating routine.
7121 */
rotate_and_purge(THD * thd,bool force_rotate)7122 int MYSQL_BIN_LOG::rotate_and_purge(THD *thd, bool force_rotate) {
7123   int error = 0;
7124   DBUG_TRACE;
7125   bool check_purge = false;
7126 
7127   /*
7128     FLUSH BINARY LOGS command should ignore 'read-only' and 'super_read_only'
7129     options so that it can update 'mysql.gtid_executed' replication repository
7130     table.
7131   */
7132   thd->set_skip_readonly_check();
7133   /*
7134     Wait for handlerton to insert any pending information into the binlog.
7135     For e.g. ha_ndbcluster which updates the binlog asynchronously this is
7136     needed so that the user see its own commands in the binlog.
7137   */
7138   ha_binlog_wait(thd);
7139 
7140   DBUG_ASSERT(!is_relay_log);
7141   mysql_mutex_lock(&LOCK_log);
7142   error = rotate(force_rotate, &check_purge);
7143   /*
7144     NOTE: Run purge_logs wo/ holding LOCK_log because it does not need
7145           the mutex. Otherwise causes various deadlocks.
7146   */
7147   mysql_mutex_unlock(&LOCK_log);
7148 
7149   if (!error && check_purge) purge();
7150 
7151   return error;
7152 }
7153 
next_file_id()7154 uint MYSQL_BIN_LOG::next_file_id() {
7155   uint res;
7156   mysql_mutex_lock(&LOCK_log);
7157   res = file_id++;
7158   mysql_mutex_unlock(&LOCK_log);
7159   return res;
7160 }
7161 
get_gtid_executed(Sid_map * sid_map,Gtid_set * gtid_set)7162 int MYSQL_BIN_LOG::get_gtid_executed(Sid_map *sid_map, Gtid_set *gtid_set) {
7163   DBUG_TRACE;
7164   int error = 0;
7165 
7166   mysql_mutex_lock(&mysql_bin_log.LOCK_commit);
7167   global_sid_lock->wrlock();
7168 
7169   enum_return_status return_status = global_sid_map->copy(sid_map);
7170   if (return_status != RETURN_STATUS_OK) {
7171     error = 1;
7172     goto end;
7173   }
7174 
7175   return_status = gtid_set->add_gtid_set(gtid_state->get_executed_gtids());
7176   if (return_status != RETURN_STATUS_OK) error = 1;
7177 
7178 end:
7179   global_sid_lock->unlock();
7180   mysql_mutex_unlock(&mysql_bin_log.LOCK_commit);
7181 
7182   return error;
7183 }
7184 
7185 /**
7186   Write the contents of the given IO_CACHE to the binary log.
7187 
7188   The cache will be reset as a READ_CACHE to be able to read the
7189   contents from it.
7190 
7191   The data will be post-processed: see class Binlog_event_writer for
7192   details.
7193 
7194   @param cache Events will be read from this IO_CACHE.
7195   @param writer Events will be written to this Binlog_event_writer.
7196 
7197   @retval true IO error.
7198   @retval false Success.
7199 
7200   @see MYSQL_BIN_LOG::write_cache
7201 */
do_write_cache(Binlog_cache_storage * cache,Binlog_event_writer * writer)7202 bool MYSQL_BIN_LOG::do_write_cache(Binlog_cache_storage *cache,
7203                                    Binlog_event_writer *writer) {
7204   DBUG_TRACE;
7205 
7206   DBUG_EXECUTE_IF("simulate_do_write_cache_failure", {
7207     /*
7208        see binlog_cache_data::write_event() that reacts on
7209        @c simulate_disk_full_at_flush_pending.
7210     */
7211     DBUG_SET("-d,simulate_do_write_cache_failure");
7212     return true;
7213   });
7214 
7215 #ifndef DBUG_OFF
7216   uint64 expected_total_len = cache->length();
7217   DBUG_PRINT("info", ("bytes in cache= %" PRIu64, expected_total_len));
7218 #endif
7219 
7220   bool error = false;
7221   if (cache->copy_to(writer, &error)) {
7222     if (error) report_binlog_write_error();
7223     return true;
7224   }
7225   return false;
7226 }
7227 
7228 /**
7229   Writes an incident event to stmt_cache.
7230 
7231   @param ev Incident event to be written
7232   @param thd Thread variable
7233   @param need_lock_log If true, will acquire LOCK_log; otherwise the
7234   caller should already have acquired LOCK_log.
7235   @param err_msg Error message written to log file for the incident.
7236   @param do_flush_and_sync If true, will call flush_and_sync(), rotate() and
7237   purge().
7238 
7239   @retval false error
7240   @retval true success
7241 */
write_incident(Incident_log_event * ev,THD * thd,bool need_lock_log,const char * err_msg,bool do_flush_and_sync)7242 bool MYSQL_BIN_LOG::write_incident(Incident_log_event *ev, THD *thd,
7243                                    bool need_lock_log, const char *err_msg,
7244                                    bool do_flush_and_sync) {
7245   uint error = 0;
7246   DBUG_TRACE;
7247   DBUG_ASSERT(err_msg);
7248 
7249   if (!is_open()) return error;
7250 
7251   binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
7252 
7253   /*
7254     thd->cache_mngr may be uninitialized when first transaction resulted in an
7255     incident. If there is no cache manager exists for the session, then we
7256     create one, so that a GTID is generated and is written prior to flushing
7257     the stmt_cache.
7258   */
7259   if (cache_mngr == NULL ||
7260       DBUG_EVALUATE_IF("simulate_cache_creation_failure", 1, 0)) {
7261     if (thd->binlog_setup_trx_data() ||
7262         DBUG_EVALUATE_IF("simulate_cache_creation_failure", 1, 0)) {
7263       auto gtid_mode = global_gtid_mode.get();
7264       if (gtid_mode == Gtid_mode::ON || gtid_mode == Gtid_mode::ON_PERMISSIVE) {
7265         std::ostringstream message;
7266 
7267         message << "Could not create IO cache while writing an incident event "
7268                    "to the binary log. Since GTID_MODE = "
7269                 << gtid_mode
7270                 << ", server is unable to proceed with logging. Query: '";
7271         /**
7272           The reason for the error may be that the query was
7273           huge. Better cut it to not run into resource problems.
7274         */
7275         message.write(thd->query().str, MYSQL_ERRMSG_SIZE);
7276         message << "'.";
7277 
7278         handle_binlog_flush_or_sync_error(thd, true, message.str().c_str());
7279         return true;
7280       }
7281     } else
7282       cache_mngr = thd_get_cache_mngr(thd);
7283   }
7284 
7285 #ifndef DBUG_OFF
7286   if (DBUG_EVALUATE_IF("simulate_write_incident_event_into_binlog_directly", 1,
7287                        0) &&
7288       !cache_mngr->stmt_cache.is_binlog_empty()) {
7289     /* The stmt_cache contains corruption data, so we can reset it. */
7290     cache_mngr->stmt_cache.reset();
7291   }
7292 #endif
7293 
7294   /*
7295     If there is no binlog cache then we write incidents directly
7296     into the binlog. If caller needs GTIDs it has to setup the
7297     binlog cache (for the injector thread).
7298   */
7299   if (cache_mngr == nullptr ||
7300       DBUG_EVALUATE_IF("simulate_write_incident_event_into_binlog_directly", 1,
7301                        0)) {
7302     if (need_lock_log)
7303       mysql_mutex_lock(&LOCK_log);
7304     else
7305       mysql_mutex_assert_owner(&LOCK_log);
7306     /* Write an incident event into binlog directly. */
7307     error = write_event_to_binlog(ev);
7308     /*
7309       Write an error to log. So that user might have a chance
7310       to be alerted and explore incident details.
7311     */
7312     if (!error)
7313       LogErr(ERROR_LEVEL, ER_BINLOG_LOGGING_INCIDENT_TO_STOP_SLAVES, err_msg);
7314   } else  // (cache_mngr != NULL)
7315   {
7316     if (!cache_mngr->stmt_cache.is_binlog_empty()) {
7317       /* The stmt_cache contains corruption data, so we can reset it. */
7318       cache_mngr->stmt_cache.reset();
7319     }
7320     if (!cache_mngr->trx_cache.is_binlog_empty()) {
7321       /* The trx_cache contains corruption data, so we can reset it. */
7322       cache_mngr->trx_cache.reset();
7323     }
7324     /*
7325       Write the incident event into stmt_cache, so that a GTID is generated and
7326       written for it prior to flushing the stmt_cache.
7327     */
7328     binlog_cache_data *cache_data = cache_mngr->get_binlog_cache_data(false);
7329     if ((error = cache_data->write_event(ev))) {
7330       LogErr(ERROR_LEVEL, ER_BINLOG_EVENT_WRITE_TO_STMT_CACHE_FAILED);
7331       cache_mngr->stmt_cache.reset();
7332       return error;
7333     }
7334 
7335     if (need_lock_log)
7336       mysql_mutex_lock(&LOCK_log);
7337     else
7338       mysql_mutex_assert_owner(&LOCK_log);
7339   }
7340 
7341   if (do_flush_and_sync) {
7342     if (!error && !(error = flush_and_sync())) {
7343       bool check_purge = false;
7344       update_binlog_end_pos();
7345       is_rotating_caused_by_incident = true;
7346       error = rotate(true, &check_purge);
7347       is_rotating_caused_by_incident = false;
7348       if (!error && check_purge) purge();
7349     }
7350   }
7351 
7352   if (need_lock_log) mysql_mutex_unlock(&LOCK_log);
7353 
7354   /*
7355     Write an error to log. So that user might have a chance
7356     to be alerted and explore incident details.
7357   */
7358   if (!error && cache_mngr != nullptr)
7359     LogErr(ERROR_LEVEL, ER_BINLOG_LOGGING_INCIDENT_TO_STOP_SLAVES, err_msg);
7360 
7361   return error;
7362 }
7363 
write_dml_directly(THD * thd,const char * stmt,size_t stmt_len,enum_sql_command sql_command)7364 bool MYSQL_BIN_LOG::write_dml_directly(THD *thd, const char *stmt,
7365                                        size_t stmt_len,
7366                                        enum_sql_command sql_command) {
7367   bool ret = false;
7368   /* backup the original command */
7369   enum_sql_command save_sql_command = thd->lex->sql_command;
7370   thd->lex->sql_command = sql_command;
7371 
7372   if (thd->binlog_query(THD::STMT_QUERY_TYPE, stmt, stmt_len, false, false,
7373                         false, 0) ||
7374       commit(thd, false) != TC_LOG::RESULT_SUCCESS) {
7375     ret = true;
7376   }
7377 
7378   thd->lex->sql_command = save_sql_command;
7379   return ret;
7380 }
7381 
7382 /**
7383   Creates an incident event and writes it to the binary log.
7384 
7385   @param thd  Thread variable
7386   @param need_lock_log If the binary lock should be locked or not
7387   @param err_msg Error message written to log file for the incident.
7388   @param do_flush_and_sync If true, will call flush_and_sync(), rotate() and
7389   purge().
7390 
7391   @retval
7392     0    error
7393   @retval
7394     1    success
7395 */
write_incident(THD * thd,bool need_lock_log,const char * err_msg,bool do_flush_and_sync)7396 bool MYSQL_BIN_LOG::write_incident(THD *thd, bool need_lock_log,
7397                                    const char *err_msg,
7398                                    bool do_flush_and_sync) {
7399   DBUG_TRACE;
7400 
7401   if (!is_open()) return false;
7402 
7403   LEX_CSTRING write_error_msg = {err_msg, strlen(err_msg)};
7404   binary_log::Incident_event::enum_incident incident =
7405       binary_log::Incident_event::INCIDENT_LOST_EVENTS;
7406   Incident_log_event ev(thd, incident, write_error_msg);
7407 
7408   return write_incident(&ev, thd, need_lock_log, err_msg, do_flush_and_sync);
7409 }
7410 
7411 /*
7412   Write the event into current binlog directly without going though a session
7413   binlog cache. It will update the event's log_pos and set checksum accordingly.
7414   binary_event_serialize can be called directly if log_pos should not be
7415   updated.
7416 */
write_event_to_binlog(Log_event * ev)7417 inline bool MYSQL_BIN_LOG::write_event_to_binlog(Log_event *ev) {
7418   ev->common_footer->checksum_alg =
7419       is_relay_log
7420           ? relay_log_checksum_alg
7421           : static_cast<enum_binlog_checksum_alg>(binlog_checksum_options);
7422   DBUG_ASSERT(ev->common_footer->checksum_alg !=
7423               binary_log::BINLOG_CHECKSUM_ALG_UNDEF);
7424 
7425   /*
7426     Stores current position into log_pos, it is used to calculate correcty
7427     end_log_pos by adding data_written in Log_event::write_header().
7428   */
7429   ev->common_header->log_pos = m_binlog_file->position();
7430 
7431   if (binary_event_serialize(ev, m_binlog_file)) return true;
7432 
7433   add_bytes_written(ev->common_header->data_written);
7434   return false;
7435 }
7436 
7437 /* Write the event into current binlog and flush and sync */
write_event_to_binlog_and_sync(Log_event * ev)7438 bool MYSQL_BIN_LOG::write_event_to_binlog_and_sync(Log_event *ev) {
7439   if (write_event_to_binlog(ev) || m_binlog_file->flush() ||
7440       m_binlog_file->sync())
7441     return true;
7442 
7443   update_binlog_end_pos();
7444   return false;
7445 }
7446 
7447 /**
7448   Write the contents of the statement or transaction cache to the binary log.
7449 
7450   Comparison with do_write_cache:
7451 
7452   - do_write_cache is a lower-level function that only performs the
7453     actual write.
7454 
7455   - write_cache is a higher-level function that calls do_write_cache
7456     and additionally performs some maintenance tasks, including:
7457     - report any errors that occurred
7458     - write incident event if needed
7459     - update gtid_state
7460     - update thd.binlog_next_event_pos
7461 
7462   @param thd Thread variable
7463 
7464   @param cache_data Events will be read from the IO_CACHE of this
7465   cache_data object.
7466 
7467   @param writer Events will be written to this Binlog_event_writer.
7468 
7469   @retval true IO error.
7470   @retval false Success.
7471 
7472   @note We only come here if there is something in the cache.
7473   @note Whatever is in the cache is always a complete transaction.
7474   @note 'cache' needs to be reinitialized after this functions returns.
7475 */
write_cache(THD * thd,binlog_cache_data * cache_data,Binlog_event_writer * writer)7476 bool MYSQL_BIN_LOG::write_cache(THD *thd, binlog_cache_data *cache_data,
7477                                 Binlog_event_writer *writer) {
7478   DBUG_TRACE;
7479 
7480   Binlog_cache_storage *cache = cache_data->get_cache();
7481   bool incident = cache_data->has_incident();
7482 
7483   mysql_mutex_assert_owner(&LOCK_log);
7484 
7485   DBUG_ASSERT(is_open());
7486   if (likely(is_open()))  // Should always be true
7487   {
7488     /*
7489       We only bother to write to the binary log if there is anything
7490       to write.
7491 
7492       @todo Is this check redundant? Probably this is only called if
7493       there is anything in the cache (see @note in comment above this
7494       function). Check if we can replace this by an assertion. /Sven
7495     */
7496     if (!cache->is_empty()) {
7497       DBUG_EXECUTE_IF("crash_before_writing_xid", {
7498         if (do_write_cache(cache, writer))
7499           DBUG_PRINT("info", ("error writing binlog cache: %d", write_error));
7500         flush_and_sync(true);
7501         DBUG_PRINT("info", ("crashing before writing xid"));
7502         DBUG_SUICIDE();
7503       });
7504       if (do_write_cache(cache, writer)) goto err;
7505 
7506       const char *err_msg =
7507           "Non-transactional changes did not get into "
7508           "the binlog.";
7509       if (incident &&
7510           write_incident(thd, false /*need_lock_log=false*/, err_msg,
7511                          false /*do_flush_and_sync==false*/)) {
7512         report_binlog_write_error();
7513         goto err;
7514       }
7515       DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
7516     }
7517     update_thd_next_event_pos(thd);
7518   }
7519 
7520   return false;
7521 
7522 err:
7523   thd->commit_error = THD::CE_FLUSH_ERROR;
7524 
7525   return true;
7526 }
7527 
report_binlog_write_error()7528 void MYSQL_BIN_LOG::report_binlog_write_error() {
7529   char errbuf[MYSYS_STRERROR_SIZE];
7530 
7531   write_error = true;
7532   LogErr(ERROR_LEVEL, ER_FAILED_TO_WRITE_TO_FILE, name, errno,
7533          my_strerror(errbuf, sizeof(errbuf), errno));
7534 }
7535 
7536 /**
7537   Wait until we get a signal that the binary log has been updated.
7538   Applies to master only.
7539 
7540   NOTES
7541   @param[in] timeout    a pointer to a timespec;
7542                         NULL means to wait w/o timeout.
7543   @retval    0          if got signalled on update
7544   @retval    non-0      if wait timeout elapsed
7545   @note
7546     LOCK_binlog_end_pos must be taken before calling this function.
7547     LOCK_binlog_end_pos is being released while the thread is waiting.
7548     LOCK_binlog_end_pos is released by the caller.
7549 */
7550 
wait_for_update(const struct timespec * timeout)7551 int MYSQL_BIN_LOG::wait_for_update(const struct timespec *timeout) {
7552   int ret = 0;
7553   DBUG_TRACE;
7554 
7555   if (!timeout)
7556     mysql_cond_wait(&update_cond, &LOCK_binlog_end_pos);
7557   else
7558     ret = mysql_cond_timedwait(&update_cond, &LOCK_binlog_end_pos,
7559                                const_cast<struct timespec *>(timeout));
7560   return ret;
7561 }
7562 
7563 /**
7564   Close the log file.
7565 
7566   @param exiting     Bitmask for one or more of the following bits:
7567           - LOG_CLOSE_INDEX : if we should close the index file
7568           - LOG_CLOSE_TO_BE_OPENED : if we intend to call open
7569                                      at once after close.
7570           - LOG_CLOSE_STOP_EVENT : write a 'stop' event to the log
7571 
7572   @param need_lock_log If true, this function acquires LOCK_log;
7573   otherwise the caller should already have acquired it.
7574 
7575   @param need_lock_index If true, this function acquires LOCK_index;
7576   otherwise the caller should already have acquired it.
7577 
7578   @note
7579     One can do an open on the object at once after doing a close.
7580     The internal structures are not freed until cleanup() is called
7581 */
7582 
close(uint exiting,bool need_lock_log,bool need_lock_index)7583 void MYSQL_BIN_LOG::close(
7584     uint exiting, bool need_lock_log,
7585     bool need_lock_index) {  // One can't set log_type here!
7586   DBUG_TRACE;
7587   DBUG_PRINT("enter", ("exiting: %d", (int)exiting));
7588   if (need_lock_log)
7589     mysql_mutex_lock(&LOCK_log);
7590   else
7591     mysql_mutex_assert_owner(&LOCK_log);
7592 
7593   if (atomic_log_state == LOG_OPENED) {
7594     if ((exiting & LOG_CLOSE_STOP_EVENT) != 0) {
7595       /**
7596         TODO(WL#7546): Change the implementation to Stop_event after write() is
7597         moved into libbinlogevents
7598       */
7599       Stop_log_event s;
7600       // the checksumming rule for relay-log case is similar to Rotate
7601       s.common_footer->checksum_alg =
7602           is_relay_log
7603               ? relay_log_checksum_alg
7604               : static_cast<enum_binlog_checksum_alg>(binlog_checksum_options);
7605       DBUG_ASSERT(!is_relay_log || relay_log_checksum_alg !=
7606                                        binary_log::BINLOG_CHECKSUM_ALG_UNDEF);
7607       if (!write_event_to_binlog(&s) && !m_binlog_file->flush())
7608         update_binlog_end_pos();
7609     }
7610 
7611     /* The following update should not be done in relay log files */
7612     if (!is_relay_log) {
7613       my_off_t offset = BIN_LOG_HEADER_SIZE + FLAGS_OFFSET;
7614       uchar flags = 0;  // clearing LOG_EVENT_BINLOG_IN_USE_F
7615       (void)m_binlog_file->update(&flags, 1, offset);
7616     }
7617 
7618     if (m_binlog_file->flush_and_sync() && !write_error) {
7619       report_binlog_write_error();
7620     }
7621 
7622     /*
7623       LOCK_sync to guarantee that no thread is calling m_binlog_file
7624       to sync data to disk when another thread is closing m_binlog_file.
7625     */
7626     if (!is_relay_log) mysql_mutex_lock(&LOCK_sync);
7627     m_binlog_file->close();
7628     if (!is_relay_log) mysql_mutex_unlock(&LOCK_sync);
7629 
7630     atomic_log_state =
7631         (exiting & LOG_CLOSE_TO_BE_OPENED) ? LOG_TO_BE_OPENED : LOG_CLOSED;
7632     my_free(name);
7633     name = nullptr;
7634   }
7635 
7636   /*
7637     The following test is needed even if is_open() is not set, as we may have
7638     called a not complete close earlier and the index file is still open.
7639   */
7640 
7641   if (need_lock_index)
7642     mysql_mutex_lock(&LOCK_index);
7643   else
7644     mysql_mutex_assert_owner(&LOCK_index);
7645 
7646   if ((exiting & LOG_CLOSE_INDEX) && my_b_inited(&index_file)) {
7647     end_io_cache(&index_file);
7648     if (mysql_file_close(index_file.file, MYF(0)) < 0 && !write_error) {
7649       report_binlog_write_error();
7650     }
7651   }
7652 
7653   if (need_lock_index) mysql_mutex_unlock(&LOCK_index);
7654 
7655   atomic_log_state =
7656       (exiting & LOG_CLOSE_TO_BE_OPENED) ? LOG_TO_BE_OPENED : LOG_CLOSED;
7657   my_free(name);
7658   name = nullptr;
7659 
7660   if (need_lock_log) mysql_mutex_unlock(&LOCK_log);
7661 }
7662 
harvest_bytes_written(Relay_log_info * rli,bool need_log_space_lock)7663 void MYSQL_BIN_LOG::harvest_bytes_written(Relay_log_info *rli,
7664                                           bool need_log_space_lock) {
7665 #ifndef DBUG_OFF
7666   char buf1[22], buf2[22];
7667 #endif
7668 
7669   DBUG_TRACE;
7670   if (need_log_space_lock)
7671     mysql_mutex_lock(&rli->log_space_lock);
7672   else
7673     mysql_mutex_assert_owner(&rli->log_space_lock);
7674   rli->log_space_total += bytes_written;
7675   DBUG_PRINT("info",
7676              ("relay_log_space: %s  bytes_written: %s",
7677               llstr(rli->log_space_total, buf1), llstr(bytes_written, buf2)));
7678   bytes_written = 0;
7679   if (need_log_space_lock) mysql_mutex_unlock(&rli->log_space_lock);
7680 }
7681 
set_max_size(ulong max_size_arg)7682 void MYSQL_BIN_LOG::set_max_size(ulong max_size_arg) {
7683   /*
7684     We need to take locks, otherwise this may happen:
7685     new_file() is called, calls open(old_max_size), then before open() starts,
7686     set_max_size() sets max_size to max_size_arg, then open() starts and
7687     uses the old_max_size argument, so max_size_arg has been overwritten and
7688     it's like if the SET command was never run.
7689   */
7690   DBUG_TRACE;
7691   mysql_mutex_lock(&LOCK_log);
7692   if (is_open()) max_size = max_size_arg;
7693   mysql_mutex_unlock(&LOCK_log);
7694 }
7695 
7696 /****** transaction coordinator log for 2pc - binlog() based solution ******/
7697 
7698 /**
7699   @todo
7700   keep in-memory list of prepared transactions
7701   (add to list in log(), remove on unlog())
7702   and copy it to the new binlog if rotated
7703   but let's check the behaviour of tc_log_page_waits first!
7704 */
7705 
open_binlog(const char * opt_name)7706 int MYSQL_BIN_LOG::open_binlog(const char *opt_name) {
7707   LOG_INFO log_info;
7708   int error = 1;
7709 
7710   /*
7711     This function is used for 2pc transaction coordination.  Hence, it
7712     is never used for relay logs.
7713   */
7714   DBUG_ASSERT(!is_relay_log);
7715   DBUG_ASSERT(total_ha_2pc > 1 || (1 == total_ha_2pc && opt_bin_log));
7716   DBUG_ASSERT(opt_name && opt_name[0]);
7717 
7718   if (!my_b_inited(&index_file)) {
7719     /* There was a failure to open the index file, can't open the binlog */
7720     cleanup();
7721     return 1;
7722   }
7723 
7724   if (using_heuristic_recover()) {
7725     /* generate a new binlog to mask a corrupted one */
7726     mysql_mutex_lock(&LOCK_log);
7727     open_binlog(opt_name, nullptr, max_binlog_size, false,
7728                 true /*need_lock_index=true*/, true /*need_sid_lock=true*/,
7729                 nullptr);
7730     mysql_mutex_unlock(&LOCK_log);
7731     cleanup();
7732     return 1;
7733   }
7734 
7735   if ((error = find_log_pos(&log_info, NullS, true /*need_lock_index=true*/))) {
7736     if (error != LOG_INFO_EOF)
7737       LogErr(ERROR_LEVEL, ER_BINLOG_CANT_FIND_LOG_IN_INDEX, error);
7738     else
7739       error = 0;
7740     goto err;
7741   }
7742 
7743   {
7744     Log_event *ev = nullptr;
7745     char log_name[FN_REFLEN];
7746     my_off_t valid_pos = 0;
7747     my_off_t binlog_size = 0;
7748 
7749     do {
7750       strmake(log_name, log_info.log_file_name, sizeof(log_name) - 1);
7751     } while (
7752         !(error = find_next_log(&log_info, true /*need_lock_index=true*/)));
7753 
7754     if (error != LOG_INFO_EOF) {
7755       LogErr(ERROR_LEVEL, ER_BINLOG_CANT_FIND_LOG_IN_INDEX, error);
7756       goto err;
7757     }
7758 
7759     Binlog_file_reader binlog_file_reader(opt_master_verify_checksum);
7760     if (binlog_file_reader.open(log_name)) {
7761       LogErr(ERROR_LEVEL, ER_BINLOG_FILE_OPEN_FAILED,
7762              binlog_file_reader.get_error_str());
7763       goto err;
7764     }
7765 
7766     /*
7767       If the binary log was not properly closed it means that the server
7768       may have crashed. In that case, we need to call
7769       MYSQL_BIN_LOG::binlog_recover
7770       to:
7771 
7772         a) collect logged XIDs;
7773         b) complete the 2PC of the pending XIDs;
7774         c) collect the last valid position.
7775 
7776       Therefore, we do need to iterate over the binary log, even if
7777       total_ha_2pc == 1, to find the last valid group of events written.
7778       Later we will take this value and truncate the log if need be.
7779     */
7780     if ((ev = binlog_file_reader.read_event_object()) &&
7781         ev->get_type_code() == binary_log::FORMAT_DESCRIPTION_EVENT &&
7782         (ev->common_header->flags & LOG_EVENT_BINLOG_IN_USE_F ||
7783          DBUG_EVALUATE_IF("eval_force_bin_log_recovery", true, false))) {
7784       LogErr(INFORMATION_LEVEL, ER_BINLOG_RECOVERING_AFTER_CRASH_USING,
7785              opt_name);
7786       valid_pos = binlog_file_reader.position();
7787       error = binlog_recover(&binlog_file_reader, &valid_pos);
7788       binlog_size = binlog_file_reader.ifile()->length();
7789     } else
7790       error = 0;
7791 
7792     delete ev;
7793 
7794     if (error) goto err;
7795 
7796     /* Trim the crashed binlog file to last valid transaction
7797       or event (non-transaction) base on valid_pos. */
7798     if (valid_pos > 0) {
7799       std::unique_ptr<Binlog_ofile> ofile(
7800           Binlog_ofile::open_existing(key_file_binlog, log_name, MYF(MY_WME)));
7801 
7802       if (!ofile) {
7803         LogErr(ERROR_LEVEL, ER_BINLOG_CANT_OPEN_CRASHED_BINLOG);
7804         return -1;
7805       }
7806 
7807       /* Change binlog file size to valid_pos */
7808       if (valid_pos < binlog_size) {
7809         if (ofile->truncate(valid_pos)) {
7810           LogErr(ERROR_LEVEL, ER_BINLOG_CANT_TRIM_CRASHED_BINLOG);
7811           return -1;
7812         }
7813         LogErr(INFORMATION_LEVEL, ER_BINLOG_CRASHED_BINLOG_TRIMMED, log_name,
7814                binlog_size, valid_pos, valid_pos);
7815       }
7816 
7817       /* Clear LOG_EVENT_BINLOG_IN_USE_F */
7818       uchar flags = 0;
7819       if (ofile->update(&flags, 1, BIN_LOG_HEADER_SIZE + FLAGS_OFFSET)) {
7820         LogErr(ERROR_LEVEL,
7821                ER_BINLOG_CANT_CLEAR_IN_USE_FLAG_FOR_CRASHED_BINLOG);
7822         return -1;
7823       }
7824     }  // end if (valid_pos > 0)
7825   }
7826 
7827 err:
7828   return error;
7829 }
7830 
7831 /**
7832  Truncate the active relay log file in the specified position.
7833 
7834   @param mi Master_info of the channel going to truncate the relay log file.
7835   @param truncate_pos The position to truncate the active relay log file.
7836   @return False on success and true on failure.
7837 */
truncate_relaylog_file(Master_info * mi,my_off_t truncate_pos)7838 bool MYSQL_BIN_LOG::truncate_relaylog_file(Master_info *mi,
7839                                            my_off_t truncate_pos) {
7840   DBUG_TRACE;
7841   DBUG_ASSERT(is_relay_log);
7842   mysql_mutex_assert_owner(&LOCK_log);
7843   Relay_log_info *rli = mi->rli;
7844   bool error = false;
7845 
7846   /*
7847     If the relay log was closed by an error (binlog_error_action=IGNORE_ERROR)
7848     this truncate function should produce no result as the relay log is already
7849     in really bad shape.
7850   */
7851   if (!is_open()) {
7852     return false;
7853   }
7854 
7855   my_off_t relaylog_file_size = m_binlog_file->position();
7856 
7857   if (truncate_pos > 0 && truncate_pos < relaylog_file_size) {
7858     if (m_binlog_file->truncate(truncate_pos)) {
7859       mi->report(ERROR_LEVEL, ER_SLAVE_RELAY_LOG_WRITE_FAILURE,
7860                  ER_THD(current_thd, ER_SLAVE_RELAY_LOG_WRITE_FAILURE),
7861                  "failed to truncate relay log file");
7862       error = true;
7863     } else {
7864       LogErr(INFORMATION_LEVEL, ER_SLAVE_RELAY_LOG_TRUNCATE_INFO, log_file_name,
7865              relaylog_file_size, truncate_pos);
7866 
7867       // Re-init the SQL thread IO_CACHE
7868       DBUG_ASSERT(strcmp(rli->get_event_relay_log_name(), log_file_name) ||
7869                   rli->get_event_relay_log_pos() <= truncate_pos);
7870       rli->notify_relay_log_truncated();
7871     }
7872   }
7873   return error;
7874 }
7875 
7876 /** This is called on shutdown, after ha_panic. */
close()7877 void MYSQL_BIN_LOG::close() {}
7878 
7879 /*
7880   Prepare the transaction in the transaction coordinator.
7881 
7882   This function will prepare the transaction in the storage engines
7883   (by calling @c ha_prepare_low) what will write a prepare record
7884   to the log buffers.
7885 
7886   @retval 0    success
7887   @retval 1    error
7888 */
prepare(THD * thd,bool all)7889 int MYSQL_BIN_LOG::prepare(THD *thd, bool all) {
7890   DBUG_TRACE;
7891 
7892   DBUG_ASSERT(opt_bin_log);
7893   /*
7894     The applier thread explicitly overrides the value of sql_log_bin
7895     with the value of log_slave_updates.
7896   */
7897   DBUG_ASSERT(thd->slave_thread ? opt_log_slave_updates
7898                                 : thd->variables.sql_log_bin);
7899 
7900   /*
7901     Set HA_IGNORE_DURABILITY to not flush the prepared record of the
7902     transaction to the log of storage engine (for example, InnoDB
7903     redo log) during the prepare phase. So that we can flush prepared
7904     records of transactions to the log of storage engine in a group
7905     right before flushing them to binary log during binlog group
7906     commit flush stage. Reset to HA_REGULAR_DURABILITY at the
7907     beginning of parsing next command.
7908   */
7909   thd->durability_property = HA_IGNORE_DURABILITY;
7910 
7911   int error = ha_prepare_low(thd, all);
7912 
7913   return error;
7914 }
7915 
7916 /**
7917   Commit the transaction in the transaction coordinator.
7918 
  This function will commit the session's transaction in the binary log
  and in the storage engines (by calling @c ha_commit_low). If the
  transaction was successfully logged (or not successfully unlogged)
  but the commit in the engines did not succeed, there is a risk of
  inconsistency between the engines and the binary log.
7924 
7925   For binary log group commit, the commit is separated into three
7926   parts:
7927 
7928   1. First part consists of filling the necessary caches and
7929      finalizing them (if they need to be finalized). After this,
7930      nothing is added to any of the caches.
7931 
7932   2. Second part execute an ordered flush and commit. This will be
7933      done using the group commit functionality in ordered_commit.
7934 
7935   3. Third part checks any errors resulting from the ordered commit
7936      and handles them appropriately.
7937 
7938   @retval RESULT_SUCCESS   success
7939   @retval RESULT_ABORTED   error, transaction was neither logged nor committed
7940   @retval RESULT_INCONSISTENT  error, transaction was logged but not committed
7941 */
TC_LOG::enum_result MYSQL_BIN_LOG::commit(THD *thd, bool all) {
  DBUG_TRACE;
  DBUG_PRINT("info",
             ("query='%s'", thd == current_thd ? thd->query().str : nullptr));
  binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
  Transaction_ctx *trn_ctx = thd->get_transaction();
  my_xid xid = trn_ctx->xid_state()->get_xid()->get_my_xid();
  /* Set once the statement cache has been finalized below. */
  bool stmt_stuff_logged = false;
  /* Set once the transaction cache has been finalized below. */
  bool trx_stuff_logged = false;
  /*
    For a loggable XA PREPARE the engine commit is skipped: the flag guards
    the direct ha_commit_low() calls in this function and is forwarded to
    ordered_commit().
  */
  bool skip_commit = is_loggable_xa_prepare(thd);
  /* Becomes true when the transaction cache reports has_xid(). */
  bool is_atomic_ddl = false;

  DBUG_PRINT("enter", ("thd: 0x%llx, all: %s, xid: %llu, cache_mngr: 0x%llx",
                       (ulonglong)thd, YESNO(all), (ulonglong)xid,
                       (ulonglong)cache_mngr));

  /*
    No cache manager means nothing to log, but we still have to commit
    the transaction.
   */
  if (cache_mngr == nullptr) {
    if (!skip_commit && ha_commit_low(thd, all)) return RESULT_ABORTED;
    return RESULT_SUCCESS;
  }

  Transaction_ctx::enum_trx_scope trx_scope =
      all ? Transaction_ctx::SESSION : Transaction_ctx::STMT;

  DBUG_PRINT("debug", ("in_transaction: %s, no_2pc: %s, rw_ha_count: %d",
                       YESNO(thd->in_multi_stmt_transaction_mode()),
                       YESNO(trn_ctx->no_2pc(trx_scope)),
                       trn_ctx->rw_ha_count(trx_scope)));
  DBUG_PRINT("debug",
             ("all.cannot_safely_rollback(): %s, trx_cache_empty: %s",
              YESNO(trn_ctx->cannot_safely_rollback(Transaction_ctx::SESSION)),
              YESNO(cache_mngr->trx_cache.is_binlog_empty())));
  DBUG_PRINT("debug",
             ("stmt.cannot_safely_rollback(): %s, stmt_cache_empty: %s",
              YESNO(trn_ctx->cannot_safely_rollback(Transaction_ctx::STMT)),
              YESNO(cache_mngr->stmt_cache.is_binlog_empty())));

  /*
    If there are no handlertons registered, there is nothing to
    commit. Note that DDLs are written earlier in this case (inside
    binlog_query).

    TODO: This can be a problem in those cases that there are no
    handlertons registered. DDLs are one example, but the other case
    is MyISAM. In this case, we could register a dummy handlerton to
    trigger the commit.

    Any statement that requires logging will call binlog_query before
    trans_commit_stmt, so an alternative is to use the condition
    "binlog_query called or stmt.ha_list != 0".
   */
  if (!all && !trn_ctx->is_active(trx_scope) &&
      cache_mngr->stmt_cache.is_binlog_empty())
    return RESULT_SUCCESS;

  if (thd->lex->sql_command == SQLCOM_XA_COMMIT) {
    /* The Commit phase of the XA two phase logging. */

#ifndef DBUG_OFF
    bool one_phase = get_xa_opt(thd) == XA_ONE_PHASE;
    DBUG_ASSERT(all || (thd->slave_thread && one_phase));
    DBUG_ASSERT(!skip_commit || one_phase);
#endif

    XID_STATE *xs = thd->get_transaction()->xid_state();
    /*
      A non-zero result from do_binlog_xa_commit_rollback() (or the debug
      failure injection) aborts the commit before anything is finalized.
    */
    if (DBUG_EVALUATE_IF(
            "simulate_xa_commit_log_failure", true,
            do_binlog_xa_commit_rollback(thd, xs->get_xid(), true)))
      return RESULT_ABORTED;
  }

  if (!cache_mngr->stmt_cache.is_binlog_empty()) {
    /*
      Commit parent identification of non-transactional query has
      been deferred until now, except for the mixed transaction case.
    */
    trn_ctx->store_commit_parent(
        m_dependency_tracker.get_max_committed_timestamp());
    if (cache_mngr->stmt_cache.finalize(thd)) return RESULT_ABORTED;
    stmt_stuff_logged = true;
  }

  /*
    We commit the transaction if:
     - We are not in a transaction and committing a statement, or
     - We are in a transaction and a full transaction is committed.
    Otherwise, we accumulate the changes.

    Note: trx_stuff_logged is still false when this condition is evaluated,
    since it is only assigned at the end of the block it guards.
  */
  if (!cache_mngr->trx_cache.is_binlog_empty() && ending_trans(thd, all) &&
      !trx_stuff_logged) {
    const bool real_trans =
        (all || !trn_ctx->is_active(Transaction_ctx::SESSION));

    bool one_phase = get_xa_opt(thd) == XA_ONE_PHASE;
    bool is_loggable_xa = is_loggable_xa_prepare(thd);
    XID_STATE *xs = thd->get_transaction()->xid_state();

    /*
      Log and finalize transaction cache regarding XA PREPARE/XA COMMIT ONE
      PHASE if one of the following statements is true:
      - If it is a loggable XA transaction in prepare state;
      - If it is a transaction being committed with 'XA COMMIT ONE PHASE',
      statement and is not an empty transaction when GTID_NEXT is set to a
      manual GTID.

      For other XA COMMIT ONE PHASE statements that already have been finalized
      or are finalizing empty transactions when GTID_NEXT is set to a manual
      GTID, just let the execution flow get into the final 'else' branch and log
      a final 'COMMIT;' statement.
    */
    if (is_loggable_xa ||  // XA transaction in prepare state
        (thd->lex->sql_command == SQLCOM_XA_COMMIT &&  // Is a 'XA COMMIT
         one_phase &&                                  // ONE PHASE'
         xs != nullptr &&                              // and it has not yet
         !xs->is_binlogged() &&                        // been logged
         (thd->owned_gtid.sidno <= 0 ||  // and GTID_NEXT is NOT set to a
                                         // manual GTID
          !xs->has_state(XID_STATE::XA_NOTR))))  // and the transaction is NOT
                                                 // empty and NOT finalized in
                                                 // 'trans_xa_commit'
    {
      /* The prepare phase of XA transaction two phase logging. */
      int err = 0;

      DBUG_ASSERT(thd->lex->sql_command != SQLCOM_XA_COMMIT || one_phase);

      XA_prepare_log_event end_evt(thd, xs->get_xid(), one_phase);

      DBUG_ASSERT(!is_loggable_xa || skip_commit);

      err = cache_mngr->trx_cache.finalize(thd, &end_evt, xs);
      if (err) return RESULT_ABORTED;
      /* Debug-only failure injection; evaluates to false otherwise. */
      if (is_loggable_xa)
        if (DBUG_EVALUATE_IF("simulate_xa_prepare_failure_in_cache_finalize",
                             true, false))
          return RESULT_ABORTED;
    }
    /*
      If is atomic DDL, finalize cache for DDL and no further logging is needed.
    */
    else if ((is_atomic_ddl = cache_mngr->trx_cache.has_xid())) {
      if (cache_mngr->trx_cache.finalize(thd, nullptr)) return RESULT_ABORTED;
    }
    /*
      We are committing a 2PC transaction if it is a "real" transaction
      and has an XID assigned (because some handlerton registered). A
      transaction is "real" if either 'all' is true or
      'trn_ctx->is_active(Transaction_ctx::SESSION)' is not true.

      Note: This is kind of strange since registering the binlog
      handlerton will then make the transaction 2PC, which is not really
      true. This occurs for example if a MyISAM statement is executed
      with row-based replication on.
    */
    else if (real_trans && xid && trn_ctx->rw_ha_count(trx_scope) > 1 &&
             !trn_ctx->no_2pc(trx_scope)) {
      Xid_log_event end_evt(thd, xid);
      if (cache_mngr->trx_cache.finalize(thd, &end_evt)) return RESULT_ABORTED;
    }
    /*
      No further action needed and no special case applies, log a final
      'COMMIT' statement and finalize the transaction cache.

      Empty transactions finalized with 'XA COMMIT ONE PHASE' will be covered
      by this branch.
     */
    else {
      Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"), true, false, true,
                              0, true);
      if (cache_mngr->trx_cache.finalize(thd, &end_evt)) return RESULT_ABORTED;
    }
    trx_stuff_logged = true;
  }

  /*
    This is part of the stmt rollback.
  */
  if (!all) cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);

  /*
    Now all the events are written to the caches, so we will commit
    the transaction in the engines. This is done using the group
    commit logic in ordered_commit, which will return when the
    transaction is committed.

    If the commit in the engines fail, we still have something logged
    to the binary log so we have to report this as a "bad" failure
    (failed to commit, but logged something).
  */
  if (stmt_stuff_logged || trx_stuff_logged) {
    /*
      Run the before_commit hook. On failure the transaction is rolled back
      via ha_rollback_low(), gtid_state is updated for the rollback, the
      caches are reset and ER_RUN_HOOK_ERROR is raised.
    */
    if (RUN_HOOK(
            transaction, before_commit,
            (thd, all, thd_get_cache_mngr(thd)->get_trx_cache(),
             thd_get_cache_mngr(thd)->get_stmt_cache(),
             max<my_off_t>(max_binlog_cache_size, max_binlog_stmt_cache_size),
             is_atomic_ddl)) ||
        DBUG_EVALUATE_IF("simulate_failure_in_before_commit_hook", true,
                         false)) {
      ha_rollback_low(thd, all);
      gtid_state->update_on_rollback(thd);
      thd_get_cache_mngr(thd)->reset();
      // Reset the thread OK status before changing the outcome.
      if (thd->get_stmt_da()->is_ok())
        thd->get_stmt_da()->reset_diagnostics_area();
      my_error(ER_RUN_HOOK_ERROR, MYF(0), "before_commit");
      return RESULT_ABORTED;
    }
    /*
      Check whether the transaction should commit or abort given the
      plugin feedback.
    */
    if (thd->get_transaction()
            ->get_rpl_transaction_ctx()
            ->is_transaction_rollback() ||
        (DBUG_EVALUATE_IF("simulate_transaction_rollback_request", true,
                          false))) {
      ha_rollback_low(thd, all);
      gtid_state->update_on_rollback(thd);
      thd_get_cache_mngr(thd)->reset();
      // As above: clear an OK status before reporting the error.
      if (thd->get_stmt_da()->is_ok())
        thd->get_stmt_da()->reset_diagnostics_area();
      my_error(ER_TRANSACTION_ROLLBACK_DURING_COMMIT, MYF(0));
      return RESULT_ABORTED;
    }

    /*
      The caches are finalized at this point, so a failure reported by
      ordered_commit() is returned as RESULT_INCONSISTENT.
    */
    if (ordered_commit(thd, all, skip_commit)) return RESULT_INCONSISTENT;

    DBUG_EXECUTE_IF("ensure_binlog_cache_is_reset", {
      /* Assert that binlog cache is reset at commit time. */
      DBUG_ASSERT(binlog_cache_is_reset);
      binlog_cache_is_reset = false;
    };);

    /*
      Mark the flag m_is_binlogged to true only after we are done
      with checking all the error cases.
    */
    if (is_loggable_xa_prepare(thd)) {
      thd->get_transaction()->xid_state()->set_binlogged();
      /*
        Inform hook listeners that a XA PREPARE did commit, that
        is, did log a transaction to the binary log.
      */
      (void)RUN_HOOK(transaction, after_commit, (thd, all));
    }
  } else if (!skip_commit) {
    /*
      Nothing was finalized in either cache: commit directly in the engines
      without going through ordered_commit().
    */
    if (ha_commit_low(thd, all)) return RESULT_INCONSISTENT;
  }

  return RESULT_SUCCESS;
}
8197 
8198 /**
8199    Flush caches for session.
8200 
8201    @note @c set_trans_pos is called with a pointer to the file name
8202    that the binary log currently use and a rotation will change the
8203    contents of the variable.
8204 
8205    The position is used when calling the after_flush, after_commit,
8206    and after_rollback hooks, but these have been placed so that they
8207    occur before a rotation is executed.
8208 
8209    It is the responsibility of any plugin that use this position to
8210    copy it if they need it after the hook has returned.
8211 
8212    The current "global" transaction_counter is stepped and its new value
8213    is assigned to the transaction.
8214  */
flush_thread_caches(THD * thd)8215 std::pair<int, my_off_t> MYSQL_BIN_LOG::flush_thread_caches(THD *thd) {
8216   binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
8217   my_off_t bytes = 0;
8218   bool wrote_xid = false;
8219   int error = cache_mngr->flush(thd, &bytes, &wrote_xid);
8220   if (!error && bytes > 0) {
8221     /*
8222       Note that set_trans_pos does not copy the file name. See
8223       this function documentation for more info.
8224     */
8225     thd->set_trans_pos(log_file_name, m_binlog_file->position());
8226     if (wrote_xid) inc_prep_xids(thd);
8227   }
8228   DBUG_PRINT("debug", ("bytes: %llu", bytes));
8229   return std::make_pair(error, bytes);
8230 }
8231 
init_thd_variables(THD * thd,bool all,bool skip_commit)8232 void MYSQL_BIN_LOG::init_thd_variables(THD *thd, bool all, bool skip_commit) {
8233   /*
8234     These values are used while committing a transaction, so clear
8235     everything.
8236 
8237     Notes:
8238 
8239     - It would be good if we could keep transaction coordinator
8240       log-specific data out of the THD structure, but that is not the
8241       case right now.
8242 
8243     - Everything in the transaction structure is reset when calling
8244       ha_commit_low since that calls Transaction_ctx::cleanup.
8245   */
8246   thd->tx_commit_pending = true;
8247   thd->commit_error = THD::CE_NONE;
8248   thd->next_to_commit = nullptr;
8249   thd->durability_property = HA_IGNORE_DURABILITY;
8250   thd->get_transaction()->m_flags.real_commit = all;
8251   thd->get_transaction()->m_flags.xid_written = false;
8252   thd->get_transaction()->m_flags.commit_low = !skip_commit;
8253   thd->get_transaction()->m_flags.run_hooks = !skip_commit;
8254 #ifndef DBUG_OFF
8255   /*
8256      The group commit Leader may have to wait for follower whose transaction
8257      is not ready to be preempted. Initially the status is pessimistic.
8258      Preemption guarding logics is necessary only when !DBUG_OFF is set.
8259      It won't be required for the dbug-off case as long as the follower won't
8260      execute any thread-specific write access code in this method, which is
8261      the case as of current.
8262   */
8263   thd->get_transaction()->m_flags.ready_preempt = 0;
8264 #endif
8265 }
8266 
fetch_and_process_flush_stage_queue(const bool check_and_skip_flush_logs)8267 THD *MYSQL_BIN_LOG::fetch_and_process_flush_stage_queue(
8268     const bool check_and_skip_flush_logs) {
8269   /*
8270     Fetch the entire flush queue and empty it, so that the next batch
8271     has a leader. We must do this before invoking ha_flush_logs(...)
8272     for guaranteeing to flush prepared records of transactions before
8273     flushing them to binary log, which is required by crash recovery.
8274   */
8275   Commit_stage_manager::get_instance().lock_queue(
8276       Commit_stage_manager::BINLOG_FLUSH_STAGE);
8277 
8278   THD *first_seen =
8279       Commit_stage_manager::get_instance().fetch_queue_skip_acquire_lock(
8280           Commit_stage_manager::BINLOG_FLUSH_STAGE);
8281   DBUG_ASSERT(first_seen != nullptr);
8282 
8283   THD *commit_order_thd =
8284       Commit_stage_manager::get_instance().fetch_queue_skip_acquire_lock(
8285           Commit_stage_manager::COMMIT_ORDER_FLUSH_STAGE);
8286 
8287   Commit_stage_manager::get_instance().unlock_queue(
8288       Commit_stage_manager::BINLOG_FLUSH_STAGE);
8289 
8290   if (!check_and_skip_flush_logs ||
8291       (check_and_skip_flush_logs && commit_order_thd != nullptr)) {
8292     /*
8293       We flush prepared records of transactions to the log of storage
8294       engine (for example, InnoDB redo log) in a group right before
8295       flushing them to binary log.
8296     */
8297     ha_flush_logs(true);
8298   }
8299 
8300   /*
8301     The transactions are flushed to the disk and so threads
8302     executing slave preserve commit order can be unblocked.
8303   */
8304   Commit_stage_manager::get_instance()
8305       .process_final_stage_for_ordered_commit_group(commit_order_thd);
8306   return first_seen;
8307 }
8308 
process_flush_stage_queue(my_off_t * total_bytes_var,bool * rotate_var,THD ** out_queue_var)8309 int MYSQL_BIN_LOG::process_flush_stage_queue(my_off_t *total_bytes_var,
8310                                              bool *rotate_var,
8311                                              THD **out_queue_var) {
8312   DBUG_TRACE;
8313 #ifndef DBUG_OFF
8314   // number of flushes per group.
8315   int no_flushes = 0;
8316 #endif
8317   DBUG_ASSERT(total_bytes_var && rotate_var && out_queue_var);
8318   my_off_t total_bytes = 0;
8319   int flush_error = 1;
8320   mysql_mutex_assert_owner(&LOCK_log);
8321 
8322   THD *first_seen = fetch_and_process_flush_stage_queue();
8323   DBUG_EXECUTE_IF("crash_after_flush_engine_log", DBUG_SUICIDE(););
8324   assign_automatic_gtids_to_flush_group(first_seen);
8325   /* Flush thread caches to binary log. */
8326   for (THD *head = first_seen; head; head = head->next_to_commit) {
8327     std::pair<int, my_off_t> result = flush_thread_caches(head);
8328     total_bytes += result.second;
8329     if (flush_error == 1) flush_error = result.first;
8330 #ifndef DBUG_OFF
8331     no_flushes++;
8332 #endif
8333   }
8334 
8335   *out_queue_var = first_seen;
8336   *total_bytes_var = total_bytes;
8337   if (total_bytes > 0 &&
8338       (m_binlog_file->get_real_file_size() >= (my_off_t)max_size ||
8339        DBUG_EVALUATE_IF("simulate_max_binlog_size", true, false)))
8340     *rotate_var = true;
8341 #ifndef DBUG_OFF
8342   DBUG_PRINT("info", ("no_flushes:= %d", no_flushes));
8343   no_flushes = 0;
8344 #endif
8345   return flush_error;
8346 }
8347 
8348 /**
8349   Commit a sequence of sessions.
8350 
  This function commits an entire queue of sessions starting with the
8352   session in @c first. If there were an error in the flushing part of
8353   the ordered commit, the error code is passed in and all the threads
8354   are marked accordingly (but not committed).
8355 
8356   It will also add the GTIDs of the transactions to gtid_executed.
8357 
8358   @see MYSQL_BIN_LOG::ordered_commit
8359 
8360   @param thd The "master" thread
8361   @param first First thread in the queue of threads to commit
8362  */
8363 
void MYSQL_BIN_LOG::process_commit_stage_queue(THD *thd, THD *first) {
  /* The caller must already hold LOCK_commit. */
  mysql_mutex_assert_owner(&LOCK_commit);
#ifndef DBUG_OFF
  thd->get_transaction()->m_flags.ready_preempt =
      true;  // formality by the leader
#endif
  /* First pass: walk the queue and commit each session in the engines. */
  for (THD *head = first; head; head = head->next_to_commit) {
    DBUG_PRINT("debug", ("Thread ID: %u, commit_error: %d, commit_pending: %s",
                         head->thread_id(), head->commit_error,
                         YESNO(head->tx_commit_pending)));
    DBUG_EXECUTE_IF(
        "block_leader_after_delete",
        if (thd != head) { DBUG_SET("+d,after_delete_wait"); };);
    /*
      If flushing failed, set commit_error for the session, skip the
      transaction and proceed with the next transaction instead. This
      will mark all threads as failed, since the flush failed.

      If flush succeeded, attach to the session and commit it in the
      engines.
    */
#ifndef DBUG_OFF
    Commit_stage_manager::get_instance().clear_preempt_status(head);
#endif
    /*
      Sessions with an assigned sequence number advance the dependency
      tracker's max-committed value, under LOCK_slave_trans_dep_tracker.
    */
    if (head->get_transaction()->sequence_number != SEQ_UNINIT) {
      mysql_mutex_lock(&LOCK_slave_trans_dep_tracker);
      m_dependency_tracker.update_max_committed(head);
      mysql_mutex_unlock(&LOCK_slave_trans_dep_tracker);
    }
    /*
      Flush/Sync error should be ignored and continue
      to commit phase. And thd->commit_error cannot be
      COMMIT_ERROR at this moment.
    */
    DBUG_ASSERT(head->commit_error != THD::CE_COMMIT_ERROR);
    /* Scoped to this iteration: destroyed at the end of the loop body. */
    Thd_backup_and_restore switch_thd(thd, head);
    bool all = head->get_transaction()->m_flags.real_commit;
    /* commit_low is cleared by init_thd_variables() when skip_commit is set. */
    if (head->get_transaction()->m_flags.commit_low) {
      /* head is parked to have exited append() */
      DBUG_ASSERT(head->get_transaction()->m_flags.ready_preempt);
      /*
        storage engine commit
       */
      if (ha_commit_low(head, all, false))
        head->commit_error = THD::CE_COMMIT_ERROR;
    }
    DBUG_PRINT("debug", ("commit_error: %d, commit_pending: %s",
                         head->commit_error, YESNO(head->tx_commit_pending)));
  }

  /*
    Handle the GTID of the threads.
    gtid_executed table is kept updated even though transactions fail to be
    logged. That's required by slave auto positioning.
  */
  gtid_state->update_commit_group(first);

  /* Second pass: release the prepared XID counted for each session. */
  for (THD *head = first; head; head = head->next_to_commit) {
    /*
      Decrement the prepared XID counter after storage engine commit.
      We also need decrement the prepared XID when encountering a
      flush error or session attach error for avoiding 3-way deadlock
      among user thread, rotate thread and dump thread.
    */
    if (head->get_transaction()->m_flags.xid_written) dec_prep_xids(head);
  }
}
8431 
8432 /**
8433   Process after commit for a sequence of sessions.
8434 
8435   @param thd The "master" thread
8436   @param first First thread in the queue of threads to commit
8437  */
8438 
process_after_commit_stage_queue(THD * thd,THD * first)8439 void MYSQL_BIN_LOG::process_after_commit_stage_queue(THD *thd, THD *first) {
8440   for (THD *head = first; head; head = head->next_to_commit) {
8441     if (head->get_transaction()->m_flags.run_hooks &&
8442         head->commit_error != THD::CE_COMMIT_ERROR) {
8443       /*
8444         TODO: This hook here should probably move outside/below this
8445               if and be the only after_commit invocation left in the
8446               code.
8447       */
8448       Thd_backup_and_restore switch_thd(thd, head);
8449       bool all = head->get_transaction()->m_flags.real_commit;
8450       (void)RUN_HOOK(transaction, after_commit, (head, all));
8451       /*
8452         When after_commit finished for the transaction, clear the run_hooks
8453         flag. This allow other parts of the system to check if after_commit was
8454         called.
8455       */
8456       head->get_transaction()->m_flags.run_hooks = false;
8457     }
8458   }
8459 }
8460 
#ifndef DBUG_OFF
/**
  Names for the stages.

  Only compiled in debug builds. change_stage() indexes this table with a
  Commit_stage_manager::StageID to label its DBUG trace output.
  NOTE(review): the entries presumably follow the order of the first values
  of that enum; verify against its declaration when adding stages.
*/
static const char *g_stage_name[] = {
    "FLUSH",
    "SYNC",
    "COMMIT",
};
#endif
8469 
change_stage(THD * thd MY_ATTRIBUTE ((unused)),Commit_stage_manager::StageID stage,THD * queue,mysql_mutex_t * leave_mutex,mysql_mutex_t * enter_mutex)8470 bool MYSQL_BIN_LOG::change_stage(THD *thd MY_ATTRIBUTE((unused)),
8471                                  Commit_stage_manager::StageID stage,
8472                                  THD *queue, mysql_mutex_t *leave_mutex,
8473                                  mysql_mutex_t *enter_mutex) {
8474   DBUG_TRACE;
8475   DBUG_PRINT("enter", ("thd: 0x%llx, stage: %s, queue: 0x%llx", (ulonglong)thd,
8476                        g_stage_name[stage], (ulonglong)queue));
8477   DBUG_ASSERT(0 <= stage && stage < Commit_stage_manager::STAGE_COUNTER);
8478   DBUG_ASSERT(enter_mutex);
8479   DBUG_ASSERT(queue);
8480   /*
8481     enroll_for will release the leave_mutex once the sessions are
8482     queued.
8483   */
8484   if (!Commit_stage_manager::get_instance().enroll_for(
8485           stage, queue, leave_mutex, enter_mutex)) {
8486     DBUG_ASSERT(!thd_get_cache_mngr(thd)->dbug_any_finalized());
8487     return true;
8488   }
8489 
8490   return false;
8491 }
8492 
8493 /**
8494   Flush the I/O cache to file.
8495 
  Flush the binary log to the binlog file if any bytes were written
8497   and signal that the binary log file has been updated if the flush
8498   succeeds.
8499 */
8500 
flush_cache_to_file(my_off_t * end_pos_var)8501 int MYSQL_BIN_LOG::flush_cache_to_file(my_off_t *end_pos_var) {
8502   if (m_binlog_file->flush()) {
8503     THD *thd = current_thd;
8504     thd->commit_error = THD::CE_FLUSH_ERROR;
8505     return ER_ERROR_ON_WRITE;
8506   }
8507   *end_pos_var = m_binlog_file->position();
8508   return 0;
8509 }
8510 
8511 /**
8512   Call fsync() to sync the file to disk.
8513 */
sync_binlog_file(bool force)8514 std::pair<bool, bool> MYSQL_BIN_LOG::sync_binlog_file(bool force) {
8515   bool synced = false;
8516   unsigned int sync_period = get_sync_period();
8517   if (force || (sync_period && ++sync_counter >= sync_period)) {
8518     sync_counter = 0;
8519 
8520     /*
8521       There is a chance that binlog file could be closed by 'RESET MASTER' or
8522       or 'FLUSH LOGS' just after the leader releases LOCK_log and before it
8523       acquires LOCK_sync log. So it should check if m_binlog_file is opened.
8524     */
8525     if (DBUG_EVALUATE_IF("simulate_error_during_sync_binlog_file", 1,
8526                          m_binlog_file->is_open() && m_binlog_file->sync())) {
8527       THD *thd = current_thd;
8528       thd->commit_error = THD::CE_SYNC_ERROR;
8529       return std::make_pair(true, synced);
8530     }
8531     synced = true;
8532   }
8533   return std::make_pair(false, synced);
8534 }
8535 
8536 /**
8537    Helper function executed when leaving @c ordered_commit.
8538 
8539    This function contain the necessary code for fetching the error
8540    code, doing post-commit checks, and wrapping up the commit if
8541    necessary.
8542 
8543    It is typically called when enter_stage indicates that the thread
8544    should bail out, and also when the ultimate leader thread finishes
8545    executing @c ordered_commit.
8546 
8547    It is typically used in this manner:
8548    @code
8549    if (enter_stage(thd, Thread_queue::BINLOG_FLUSH_STAGE, thd, &LOCK_log))
8550      return finish_commit(thd);
8551    @endcode
8552 
8553    @return Error code if the session commit failed, or zero on
8554    success.
8555  */
int MYSQL_BIN_LOG::finish_commit(THD *thd) {
  DBUG_TRACE;
  DEBUG_SYNC(thd, "reached_finish_commit");
  /*
    In some unlikely situations, it can happen that binary
    log is closed before the thread flushes its cache.
    In that case, clear the caches before doing commit.
  */
  if (unlikely(!is_open())) {
    binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
    if (cache_mngr) cache_mngr->reset();
  }

  /*
    A session with an assigned sequence number advances the dependency
    tracker's max-committed value, under LOCK_slave_trans_dep_tracker.
  */
  if (thd->get_transaction()->sequence_number != SEQ_UNINIT) {
    mysql_mutex_lock(&LOCK_slave_trans_dep_tracker);
    m_dependency_tracker.update_max_committed(thd);
    mysql_mutex_unlock(&LOCK_slave_trans_dep_tracker);
  }
  /* commit_low is cleared by init_thd_variables() when skip_commit is set. */
  if (thd->get_transaction()->m_flags.commit_low) {
    const bool all = thd->get_transaction()->m_flags.real_commit;
    /*
      Now flush error and sync errors are ignored and we are continuing and
      committing. And at this time, commit_error cannot be COMMIT_ERROR.
    */
    DBUG_ASSERT(thd->commit_error != THD::CE_COMMIT_ERROR);
    /*
      storage engine commit
    */
    if (ha_commit_low(thd, all, false))
      thd->commit_error = THD::CE_COMMIT_ERROR;
    /*
      Decrement the prepared XID counter after storage engine commit
    */
    if (thd->get_transaction()->m_flags.xid_written) dec_prep_xids(thd);
    /*
      If commit succeeded, we call the after_commit hook

      TODO: This hook here should probably move outside/below this
            if and be the only after_commit invocation left in the
            code.
    */
    if ((thd->commit_error != THD::CE_COMMIT_ERROR) &&
        thd->get_transaction()->m_flags.run_hooks) {
      (void)RUN_HOOK(transaction, after_commit, (thd, all));
      thd->get_transaction()->m_flags.run_hooks = false;
    }
  } else if (thd->get_transaction()->m_flags.xid_written)
    /* No engine commit here, but the prepared XID count is still released. */
    dec_prep_xids(thd);

  /*
    If the ordered commit didn't update the GTIDs for this thd yet
    at process_commit_stage_queue (i.e. --binlog-order-commits=0)
    the thd still has the ownership of a GTID and we must handle it.
  */
  if (!thd->owned_gtid_is_empty()) {
    /*
      Gtid is added to gtid_state.executed_gtids and removed from owned_gtids
      on update_on_commit().
    */
    if (thd->commit_error == THD::CE_NONE) {
      gtid_state->update_on_commit(thd);
    } else
      gtid_state->update_on_rollback(thd);
  }

  DBUG_EXECUTE_IF("leaving_finish_commit", {
    const char act[] = "now SIGNAL signal_leaving_finish_commit";
    DBUG_ASSERT(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act)));
  };);

  /* Without an error, the after_commit hook flag must have been cleared. */
  DBUG_ASSERT(thd->commit_error || !thd->get_transaction()->m_flags.run_hooks);
  DBUG_ASSERT(!thd_get_cache_mngr(thd)->dbug_any_finalized());
  DBUG_PRINT("return", ("Thread ID: %u, commit_error: %d", thd->thread_id(),
                        thd->commit_error));
  /*
    flush or sync errors are handled by the leader of the group
    (using binlog_error_action). Hence treat only COMMIT_ERRORs as errors.
  */
  return thd->commit_error == THD::CE_COMMIT_ERROR;
}
8636 
8637 /**
8638    Auxiliary function used in ordered_commit.
8639 */
call_after_sync_hook(THD * queue_head)8640 static inline int call_after_sync_hook(THD *queue_head) {
8641   const char *log_file = nullptr;
8642   my_off_t pos = 0;
8643 
8644   if (NO_HOOK(binlog_storage)) return 0;
8645 
8646   DBUG_ASSERT(queue_head != nullptr);
8647   for (THD *thd = queue_head; thd != nullptr; thd = thd->next_to_commit)
8648     if (likely(thd->commit_error == THD::CE_NONE))
8649       thd->get_trans_fixed_pos(&log_file, &pos);
8650 
8651   if (DBUG_EVALUATE_IF("simulate_after_sync_hook_error", 1, 0) ||
8652       RUN_HOOK(binlog_storage, after_sync, (queue_head, log_file, pos))) {
8653     LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_RUN_AFTER_SYNC_HOOK);
8654     return ER_ERROR_ON_WRITE;
8655   }
8656   return 0;
8657 }
8658 
8659 /**
8660   Helper function to handle flush or sync stage errors.
8661   If binlog_error_action= ABORT_SERVER, server will be aborted
8662   after reporting the error to the client.
8663   If binlog_error_action= IGNORE_ERROR, binlog will be closed
  for the rest of the lifetime of the server. close() call is protected
8665   with LOCK_log to avoid any parallel operations on binary log.
8666 
8667   @param thd Thread object that faced flush/sync error
8668   @param need_lock_log
                       > Indicates true if LOCK_log is needed before closing
8670                          binlog (happens when we are handling sync error)
8671                        > Indicates false if LOCK_log is already acquired
8672                          by the thread (happens when we are handling flush
8673                          error)
8674   @param message Message stating the reason of the failure
8675 */
handle_binlog_flush_or_sync_error(THD * thd,bool need_lock_log,const char * message)8676 void MYSQL_BIN_LOG::handle_binlog_flush_or_sync_error(THD *thd,
8677                                                       bool need_lock_log,
8678                                                       const char *message) {
8679   char errmsg[MYSQL_ERRMSG_SIZE] = {0};
8680   if (message == nullptr)
8681     sprintf(
8682         errmsg,
8683         "An error occurred during %s stage of the commit. "
8684         "'binlog_error_action' is set to '%s'.",
8685         thd->commit_error == THD::CE_FLUSH_ERROR ? "flush" : "sync",
8686         binlog_error_action == ABORT_SERVER ? "ABORT_SERVER" : "IGNORE_ERROR");
8687   else
8688     strncpy(errmsg, message, MYSQL_ERRMSG_SIZE - 1);
8689   if (binlog_error_action == ABORT_SERVER) {
8690     char err_buff[MYSQL_ERRMSG_SIZE + 27];
8691     sprintf(err_buff, "%s Hence aborting the server.", errmsg);
8692     exec_binlog_error_action_abort(err_buff);
8693   } else {
8694     DEBUG_SYNC(thd, "before_binlog_closed_due_to_error");
8695     if (need_lock_log)
8696       mysql_mutex_lock(&LOCK_log);
8697     else
8698       mysql_mutex_assert_owner(&LOCK_log);
8699     /*
8700       It can happen that other group leader encountered
8701       error and already closed the binary log. So print
8702       error only if it is in open state. But we should
8703       call close() always just in case if the previous
8704       close did not close index file.
8705     */
8706     if (is_open()) {
8707       LogErr(ERROR_LEVEL, ER_TURNING_LOGGING_OFF_FOR_THE_DURATION, errmsg);
8708     }
8709     close(LOG_CLOSE_INDEX | LOG_CLOSE_STOP_EVENT, false /*need_lock_log=false*/,
8710           true /*need_lock_index=true*/);
8711     /*
8712       If there is a write error (flush/sync stage) and if
8713       binlog_error_action=IGNORE_ERROR, clear the error
8714       and allow the commit to happen in storage engine.
8715     */
8716     if (check_write_error(thd) &&
8717         DBUG_EVALUATE_IF("simulate_cache_creation_failure", false, true))
8718       thd->clear_error();
8719 
8720     if (need_lock_log) mysql_mutex_unlock(&LOCK_log);
8721     DEBUG_SYNC(thd, "after_binlog_closed_due_to_error");
8722   }
8723 }
8724 
ordered_commit(THD * thd,bool all,bool skip_commit)8725 int MYSQL_BIN_LOG::ordered_commit(THD *thd, bool all, bool skip_commit) {
8726   DBUG_TRACE;
8727   int flush_error = 0, sync_error = 0;
8728   my_off_t total_bytes = 0;
8729   bool do_rotate = false;
8730 
8731   DBUG_EXECUTE_IF("crash_commit_before_log", DBUG_SUICIDE(););
8732   init_thd_variables(thd, all, skip_commit);
8733   DBUG_PRINT("enter", ("commit_pending: %s, commit_error: %d, thread_id: %u",
8734                        YESNO(thd->tx_commit_pending), thd->commit_error,
8735                        thd->thread_id()));
8736 
8737   DEBUG_SYNC(thd, "bgc_before_flush_stage");
8738 
8739   /*
8740     Stage #0: ensure slave threads commit order as they appear in the slave's
8741               relay log for transactions flushing to binary log.
8742 
8743     This will make thread wait until its turn to commit.
8744     Commit_order_manager maintains it own queue and its own order for the
8745     commit. So Stage#0 doesn't maintain separate StageID.
8746   */
8747   if (Commit_order_manager::wait_for_its_turn_before_flush_stage(thd) ||
8748       ending_trans(thd, all) ||
8749       Commit_order_manager::get_rollback_status(thd)) {
8750     if (Commit_order_manager::wait(thd)) {
8751       return thd->commit_error;
8752     }
8753   }
8754 
8755   /*
8756     Stage #1: flushing transactions to binary log
8757 
8758     While flushing, we allow new threads to enter and will process
8759     them in due time. Once the queue was empty, we cannot reap
8760     anything more since it is possible that a thread entered and
8761     appointed itself leader for the flush phase.
8762   */
8763 
8764   if (change_stage(thd, Commit_stage_manager::BINLOG_FLUSH_STAGE, thd, nullptr,
8765                    &LOCK_log)) {
8766     DBUG_PRINT("return", ("Thread ID: %u, commit_error: %d", thd->thread_id(),
8767                           thd->commit_error));
8768     return finish_commit(thd);
8769   }
8770 
8771   THD *wait_queue = nullptr, *final_queue = nullptr;
8772   mysql_mutex_t *leave_mutex_before_commit_stage = nullptr;
8773   my_off_t flush_end_pos = 0;
8774   bool update_binlog_end_pos_after_sync;
8775   if (unlikely(!is_open())) {
8776     final_queue = fetch_and_process_flush_stage_queue(true);
8777     leave_mutex_before_commit_stage = &LOCK_log;
8778     /*
8779       binary log is closed, flush stage and sync stage should be
8780       ignored. Binlog cache should be cleared, but instead of doing
8781       it here, do that work in 'finish_commit' function so that
8782       leader and followers thread caches will be cleared.
8783     */
8784     goto commit_stage;
8785   }
8786   DEBUG_SYNC(thd, "waiting_in_the_middle_of_flush_stage");
8787   flush_error =
8788       process_flush_stage_queue(&total_bytes, &do_rotate, &wait_queue);
8789 
8790   if (flush_error == 0 && total_bytes > 0)
8791     flush_error = flush_cache_to_file(&flush_end_pos);
8792   DBUG_EXECUTE_IF("crash_after_flush_binlog", DBUG_SUICIDE(););
8793 
8794   update_binlog_end_pos_after_sync = (get_sync_period() == 1);
8795 
8796   /*
8797     If the flush finished successfully, we can call the after_flush
8798     hook. Being invoked here, we have the guarantee that the hook is
8799     executed before the before/after_send_hooks on the dump thread
8800     preventing race conditions among these plug-ins.
8801   */
8802   if (flush_error == 0) {
8803     const char *file_name_ptr = log_file_name + dirname_length(log_file_name);
8804     DBUG_ASSERT(flush_end_pos != 0);
8805     if (RUN_HOOK(binlog_storage, after_flush,
8806                  (thd, file_name_ptr, flush_end_pos))) {
8807       LogErr(ERROR_LEVEL, ER_BINLOG_FAILED_TO_RUN_AFTER_FLUSH_HOOK);
8808       flush_error = ER_ERROR_ON_WRITE;
8809     }
8810 
8811     if (!update_binlog_end_pos_after_sync) update_binlog_end_pos();
8812 
8813     DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
8814   }
8815 
8816   if (flush_error) {
8817     /*
8818       Handle flush error (if any) after leader finishes it's flush stage.
8819     */
8820     handle_binlog_flush_or_sync_error(thd, false /* need_lock_log */, nullptr);
8821   }
8822 
8823   DEBUG_SYNC(thd, "bgc_after_flush_stage_before_sync_stage");
8824 
8825   /*
8826     Stage #2: Syncing binary log file to disk
8827   */
8828 
8829   if (change_stage(thd, Commit_stage_manager::SYNC_STAGE, wait_queue, &LOCK_log,
8830                    &LOCK_sync)) {
8831     DBUG_PRINT("return", ("Thread ID: %u, commit_error: %d", thd->thread_id(),
8832                           thd->commit_error));
8833     return finish_commit(thd);
8834   }
8835 
8836   /*
8837     Shall introduce a delay only if it is going to do sync
8838     in this ongoing SYNC stage. The "+1" used below in the
8839     if condition is to count the ongoing sync stage.
8840     When sync_binlog=0 (where we never do sync in BGC group),
8841     it is considered as a special case and delay will be executed
8842     for every group just like how it is done when sync_binlog= 1.
8843   */
8844   if (!flush_error && (sync_counter + 1 >= get_sync_period()))
8845     Commit_stage_manager::get_instance().wait_count_or_timeout(
8846         opt_binlog_group_commit_sync_no_delay_count,
8847         opt_binlog_group_commit_sync_delay, Commit_stage_manager::SYNC_STAGE);
8848 
8849   final_queue = Commit_stage_manager::get_instance().fetch_queue_acquire_lock(
8850       Commit_stage_manager::SYNC_STAGE);
8851 
8852   if (flush_error == 0 && total_bytes > 0) {
8853     DEBUG_SYNC(thd, "before_sync_binlog_file");
8854     std::pair<bool, bool> result = sync_binlog_file(false);
8855     sync_error = result.first;
8856   }
8857 
8858   if (update_binlog_end_pos_after_sync) {
8859     THD *tmp_thd = final_queue;
8860     const char *binlog_file = nullptr;
8861     my_off_t pos = 0;
8862     while (tmp_thd->next_to_commit != nullptr)
8863       tmp_thd = tmp_thd->next_to_commit;
8864     if (flush_error == 0 && sync_error == 0) {
8865       tmp_thd->get_trans_fixed_pos(&binlog_file, &pos);
8866       update_binlog_end_pos(binlog_file, pos);
8867     }
8868   }
8869 
8870   DEBUG_SYNC(thd, "bgc_after_sync_stage_before_commit_stage");
8871 
8872   leave_mutex_before_commit_stage = &LOCK_sync;
8873   /*
8874     Stage #3: Commit all transactions in order.
8875 
8876     This stage is skipped if we do not need to order the commits and
8877     each thread have to execute the handlerton commit instead.
8878 
8879     Howver, since we are keeping the lock from the previous stage, we
8880     need to unlock it if we skip the stage.
8881 
8882     We must also step commit_clock before the ha_commit_low() is called
8883     either in ordered fashion(by the leader of this stage) or by the tread
8884     themselves.
8885 
8886     We are delaying the handling of sync error until
8887     all locks are released but we should not enter into
8888     commit stage if binlog_error_action is ABORT_SERVER.
8889   */
8890 commit_stage:
8891   /* Clone needs binlog commit order. */
8892   if ((opt_binlog_order_commits || Clone_handler::need_commit_order()) &&
8893       (sync_error == 0 || binlog_error_action != ABORT_SERVER)) {
8894     if (change_stage(thd, Commit_stage_manager::COMMIT_STAGE, final_queue,
8895                      leave_mutex_before_commit_stage, &LOCK_commit)) {
8896       DBUG_PRINT("return", ("Thread ID: %u, commit_error: %d", thd->thread_id(),
8897                             thd->commit_error));
8898       return finish_commit(thd);
8899     }
8900     THD *commit_queue =
8901         Commit_stage_manager::get_instance().fetch_queue_acquire_lock(
8902             Commit_stage_manager::COMMIT_STAGE);
8903     DBUG_EXECUTE_IF("semi_sync_3-way_deadlock",
8904                     DEBUG_SYNC(thd, "before_process_commit_stage_queue"););
8905 
8906     if (flush_error == 0 && sync_error == 0)
8907       sync_error = call_after_sync_hook(commit_queue);
8908 
8909     /*
8910       process_commit_stage_queue will call update_on_commit or
8911       update_on_rollback for the GTID owned by each thd in the queue.
8912 
8913       This will be done this way to guarantee that GTIDs are added to
8914       gtid_executed in order, to avoid creating unnecessary temporary
8915       gaps and keep gtid_executed as a single interval at all times.
8916 
8917       If we allow each thread to call update_on_commit only when they
8918       are at finish_commit, the GTID order cannot be guaranteed and
8919       temporary gaps may appear in gtid_executed. When this happen,
8920       the server would have to add and remove intervals from the
8921       Gtid_set, and adding and removing intervals requires a mutex,
8922       which would reduce performance.
8923     */
8924     process_commit_stage_queue(thd, commit_queue);
8925     mysql_mutex_unlock(&LOCK_commit);
8926     /*
8927       Process after_commit after LOCK_commit is released for avoiding
8928       3-way deadlock among user thread, rotate thread and dump thread.
8929     */
8930     process_after_commit_stage_queue(thd, commit_queue);
8931     final_queue = commit_queue;
8932   } else {
8933     if (leave_mutex_before_commit_stage)
8934       mysql_mutex_unlock(leave_mutex_before_commit_stage);
8935     if (flush_error == 0 && sync_error == 0)
8936       sync_error = call_after_sync_hook(final_queue);
8937   }
8938 
8939   /*
8940     Handle sync error after we release all locks in order to avoid deadlocks
8941   */
8942   if (sync_error)
8943     handle_binlog_flush_or_sync_error(thd, true /* need_lock_log */, nullptr);
8944 
8945   DEBUG_SYNC(thd, "before_signal_done");
8946   /* Commit done so signal all waiting threads */
8947   Commit_stage_manager::get_instance().signal_done(final_queue);
8948   DBUG_EXECUTE_IF("block_leader_after_delete", {
8949     const char action[] = "now SIGNAL leader_proceed";
8950     DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(action)));
8951   };);
8952 
8953   /*
8954     Finish the commit before executing a rotate, or run the risk of a
8955     deadlock. We don't need the return value here since it is in
8956     thd->commit_error, which is returned below.
8957   */
8958   (void)finish_commit(thd);
8959   DEBUG_SYNC(thd, "bgc_after_commit_stage_before_rotation");
8960 
8961   /*
8962     If we need to rotate, we do it without commit error.
8963     Otherwise the thd->commit_error will be possibly reset.
8964    */
8965   if (DBUG_EVALUATE_IF("force_rotate", 1, 0) ||
8966       (do_rotate && thd->commit_error == THD::CE_NONE &&
8967        !is_rotating_caused_by_incident)) {
8968     /*
8969       Do not force the rotate as several consecutive groups may
8970       request unnecessary rotations.
8971 
8972       NOTE: Run purge_logs wo/ holding LOCK_log because it does not
8973       need the mutex. Otherwise causes various deadlocks.
8974     */
8975 
8976     DEBUG_SYNC(thd, "ready_to_do_rotation");
8977     bool check_purge = false;
8978     mysql_mutex_lock(&LOCK_log);
8979     /*
8980       If rotate fails then depends on binlog_error_action variable
8981       appropriate action will be taken inside rotate call.
8982     */
8983     int error = rotate(false, &check_purge);
8984     mysql_mutex_unlock(&LOCK_log);
8985 
8986     if (error)
8987       thd->commit_error = THD::CE_COMMIT_ERROR;
8988     else if (check_purge)
8989       purge();
8990   }
8991   /*
8992     flush or sync errors are handled above (using binlog_error_action).
8993     Hence treat only COMMIT_ERRORs as errors.
8994   */
8995   return thd->commit_error == THD::CE_COMMIT_ERROR;
8996 }
8997 
8998 /**
8999   MYSQLD server recovers from last crashed binlog.
9000 
9001   @param[in] binlog_file_reader Binlog_file_reader of the crashed binlog.
9002   @param[out] valid_pos The position of the last valid transaction or
9003                         event(non-transaction) of the crashed binlog.
9004                         valid_pos must be non-NULL.
9005 
9006   After a crash, storage engines may contain transactions that are
9007   prepared but not committed (in theory any engine, in practice
9008   InnoDB).  This function uses the binary log as the source of truth
9009   to determine which of these transactions should be committed and
9010   which should be rolled back.
9011 
9012   The function collects the XIDs of all transactions that are
9013   completely written to the binary log into a hash, and passes this
9014   hash to the storage engines through the ha_recover function in the
9015   handler interface.  This tells the storage engines to commit all
9016   prepared transactions that are in the set, and to roll back all
9017   prepared transactions that are not in the set.
9018 
9019   To compute the hash, this function iterates over the last binary log
9020   only (i.e. it assumes that 'log' is the last binary log).  It
9021   instantiates each event.  For XID-events (i.e. commit to InnoDB), it
9022   extracts the xid from the event and stores it in the hash.
9023 
9024   It is enough to iterate over only the last binary log because when
9025   the binary log is rotated we force engines to commit (and we fsync
9026   the old binary log).
9027 
9028   @retval false Success
9029   @retval true Out of memory, or storage engine returns error.
9030 */
binlog_recover(Binlog_file_reader * binlog_file_reader,my_off_t * valid_pos)9031 static bool binlog_recover(Binlog_file_reader *binlog_file_reader,
9032                            my_off_t *valid_pos) {
9033   bool res = false;
9034   binlog::tools::Iterator it(binlog_file_reader);
9035   it.set_copy_event_buffer();
9036 
9037   /*
9038     The flag is used for handling the case that a transaction
9039     is partially written to the binlog.
9040   */
9041   bool in_transaction = false;
9042   int memory_page_size = my_getpagesize();
9043   {
9044     MEM_ROOT mem_root(key_memory_binlog_recover_exec, memory_page_size);
9045     mem_root_unordered_set<my_xid> xids(&mem_root);
9046 
9047     /*
9048       now process events in the queue. Queue is dynamically changed
9049       everytime we process an event. This may be a bit suboptimal
9050       since it adds an indirection, but it helps to generalize the
9051       usage of the transaction payload event (which unfolds into
9052       several events into the queue when it is processed).
9053     */
9054     for (Log_event *ev = it.begin(); !res && (ev != it.end()); ev = it.next()) {
9055       switch (ev->get_type_code()) {
9056         // may be begin, middle or end of a transaction
9057         case binary_log::QUERY_EVENT: {
9058           // starts a transaction
9059           if (!strcmp(((Query_log_event *)ev)->query, "BEGIN"))
9060             in_transaction = true;
9061 
9062           // ends a transaction
9063           if (!strcmp(((Query_log_event *)ev)->query, "COMMIT")) {
9064             DBUG_ASSERT(in_transaction == true);
9065             in_transaction = false;
9066           }
9067           // starts and ends a transaction
9068           if (is_atomic_ddl_event(ev)) {
9069             DBUG_ASSERT(in_transaction == false);
9070             auto qev = dynamic_cast<Query_log_event *>(ev);
9071             DBUG_ASSERT(qev != nullptr);
9072             res = (qev == nullptr || !xids.insert(qev->ddl_xid).second);
9073           }
9074           break;
9075         }
9076         // ends a transaction
9077         case binary_log::XID_EVENT: {
9078           DBUG_ASSERT(in_transaction == true);
9079           in_transaction = false;
9080           Xid_log_event *xev = dynamic_cast<Xid_log_event *>(ev);
9081           DBUG_ASSERT(xev != nullptr);
9082           res = (xev == nullptr || !xids.insert(xev->xid).second);
9083           break;
9084         }
9085         default: {
9086           break;
9087         }
9088       }
9089 
9090       /*
9091         Recorded valid position for the crashed binlog file
9092         which did not contain incorrect events. The following
9093         positions increase the variable valid_pos:
9094 
9095         1 -
9096           ...
9097           <---> HERE IS VALID <--->
9098           GTID
9099           BEGIN
9100           ...
9101           COMMIT
9102           ...
9103 
9104         2 -
9105           ...
9106           <---> HERE IS VALID <--->
9107           GTID
9108           DDL/UTILITY
9109           ...
9110 
9111         In other words, the following positions do not increase
9112         the variable valid_pos:
9113 
9114         1 -
9115           GTID
9116           <---> HERE IS VALID <--->
9117           ...
9118 
9119         2 -
9120           GTID
9121           BEGIN
9122           <---> HERE IS VALID <--->
9123           ...
9124       */
9125       if (!in_transaction && !is_gtid_event(ev))
9126         *valid_pos = binlog_file_reader->position();
9127 
9128       delete ev;
9129       ev = nullptr;
9130       res = it.has_error();
9131     }
9132 
9133     /*
9134       Call ha_recover if and only if there is a registered engine that
9135       does 2PC, otherwise in DBUG builds calling ha_recover directly
9136       will result in an assert. (Production builds would be safe since
9137       ha_recover returns right away if total_ha_2pc <= opt_log_bin.)
9138      */
9139     res = res || (total_ha_2pc > 1 && ha_recover(&xids));
9140   }
9141 
9142   if (res) LogErr(ERROR_LEVEL, ER_BINLOG_CRASH_RECOVERY_FAILED);
9143   return res;
9144 }
9145 
report_missing_purged_gtids(const Gtid_set * slave_executed_gtid_set,const char ** errmsg)9146 void MYSQL_BIN_LOG::report_missing_purged_gtids(
9147     const Gtid_set *slave_executed_gtid_set, const char **errmsg) {
9148   DBUG_TRACE;
9149   THD *thd = current_thd;
9150   Gtid_set gtid_missing(gtid_state->get_lost_gtids()->get_sid_map());
9151   gtid_missing.add_gtid_set(gtid_state->get_lost_gtids());
9152   gtid_missing.remove_gtid_set(slave_executed_gtid_set);
9153 
9154   String tmp_uuid;
9155 
9156   /* Protects thd->user_vars. */
9157   mysql_mutex_lock(&current_thd->LOCK_thd_data);
9158   const auto it = current_thd->user_vars.find("slave_uuid");
9159   if (it != current_thd->user_vars.end() && it->second->length() > 0) {
9160     tmp_uuid.copy(it->second->ptr(), it->second->length(), NULL);
9161   }
9162   mysql_mutex_unlock(&current_thd->LOCK_thd_data);
9163 
9164   char *missing_gtids = NULL;
9165   char *slave_executed_gtids = NULL;
9166   gtid_missing.to_string(&missing_gtids, false);
9167   slave_executed_gtid_set->to_string(&slave_executed_gtids, false);
9168 
9169   /*
9170      Log the information about the missing purged GTIDs to the error log.
9171   */
9172   std::ostringstream log_info;
9173   log_info << "The missing transactions are '" << missing_gtids << "'";
9174 
9175   LogErr(WARNING_LEVEL, ER_FOUND_MISSING_GTIDS, tmp_uuid.ptr(),
9176          log_info.str().c_str());
9177 
9178   /*
9179      Send the information about the slave executed GTIDs and missing
9180      purged GTIDs to slave if the message is less than MYSQL_ERRMSG_SIZE.
9181   */
9182   std::ostringstream gtid_info;
9183   gtid_info << "The GTID set sent by the slave is '" << slave_executed_gtids
9184             << "', and the missing transactions are '" << missing_gtids << "'";
9185   *errmsg = ER_THD(thd, ER_MASTER_HAS_PURGED_REQUIRED_GTIDS);
9186 
9187   /* Don't consider the "%s" in the format string. Subtract 2 from the
9188      total length */
9189   int total_length = (strlen(*errmsg) - 2 + gtid_info.str().length());
9190 
9191   DBUG_EXECUTE_IF("simulate_long_missing_gtids",
9192                   { total_length = MYSQL_ERRMSG_SIZE + 1; });
9193 
9194   if (total_length > MYSQL_ERRMSG_SIZE)
9195     gtid_info.str(
9196         "The GTID sets and the missing purged transactions are too"
9197         " long to print in this message. For more information,"
9198         " please see the master's error log or the manual for"
9199         " GTID_SUBTRACT");
9200 
9201   /* Buffer for formatting the message about the missing GTIDs. */
9202   static char buff[MYSQL_ERRMSG_SIZE];
9203   snprintf(buff, MYSQL_ERRMSG_SIZE, *errmsg, gtid_info.str().c_str());
9204   *errmsg = const_cast<const char *>(buff);
9205 
9206   my_free(missing_gtids);
9207   my_free(slave_executed_gtids);
9208 }
9209 
report_missing_gtids(const Gtid_set * previous_gtid_set,const Gtid_set * slave_executed_gtid_set,const char ** errmsg)9210 void MYSQL_BIN_LOG::report_missing_gtids(
9211     const Gtid_set *previous_gtid_set, const Gtid_set *slave_executed_gtid_set,
9212     const char **errmsg) {
9213   DBUG_TRACE;
9214   THD *thd = current_thd;
9215   char *missing_gtids = NULL;
9216   char *slave_executed_gtids = NULL;
9217   Gtid_set gtid_missing(slave_executed_gtid_set->get_sid_map());
9218   gtid_missing.add_gtid_set(slave_executed_gtid_set);
9219   gtid_missing.remove_gtid_set(previous_gtid_set);
9220   gtid_missing.to_string(&missing_gtids, false);
9221   slave_executed_gtid_set->to_string(&slave_executed_gtids, false);
9222 
9223   String tmp_uuid;
9224 
9225   /* Protects thd->user_vars. */
9226   mysql_mutex_lock(&current_thd->LOCK_thd_data);
9227   const auto it = current_thd->user_vars.find("slave_uuid");
9228   if (it != current_thd->user_vars.end() && it->second->length() > 0) {
9229     tmp_uuid.copy(it->second->ptr(), it->second->length(), NULL);
9230   }
9231   mysql_mutex_unlock(&current_thd->LOCK_thd_data);
9232 
9233   /*
9234      Log the information about the missing purged GTIDs to the error log.
9235   */
9236   std::ostringstream log_info;
9237   log_info << "If the binary log files have been deleted from disk,"
9238               " check the consistency of 'GTID_PURGED' variable."
9239               " The missing transactions are '"
9240            << missing_gtids << "'";
9241   LogErr(WARNING_LEVEL, ER_FOUND_MISSING_GTIDS, tmp_uuid.ptr(),
9242          log_info.str().c_str());
9243   /*
9244      Send the information about the slave executed GTIDs and missing
9245      purged GTIDs to slave if the message is less than MYSQL_ERRMSG_SIZE.
9246   */
9247   std::ostringstream gtid_info;
9248   gtid_info << "The GTID set sent by the slave is '" << slave_executed_gtids
9249             << "', and the missing transactions are '" << missing_gtids << "'";
9250   *errmsg = ER_THD(thd, ER_MASTER_HAS_PURGED_REQUIRED_GTIDS);
9251 
9252   /* Don't consider the "%s" in the format string. Subtract 2 from the
9253      total length */
9254   if ((strlen(*errmsg) - 2 + gtid_info.str().length()) > MYSQL_ERRMSG_SIZE)
9255     gtid_info.str(
9256         "The GTID sets and the missing purged transactions are too"
9257         " long to print in this message. For more information,"
9258         " please see the master's error log or the manual for"
9259         " GTID_SUBTRACT");
9260   /* Buffer for formatting the message about the missing GTIDs. */
9261   static char buff[MYSQL_ERRMSG_SIZE];
9262   snprintf(buff, MYSQL_ERRMSG_SIZE, *errmsg, gtid_info.str().c_str());
9263   *errmsg = const_cast<const char *>(buff);
9264   my_free(missing_gtids);
9265   my_free(slave_executed_gtids);
9266 }
9267 
update_binlog_end_pos(bool need_lock)9268 void MYSQL_BIN_LOG::update_binlog_end_pos(bool need_lock) {
9269   if (need_lock)
9270     lock_binlog_end_pos();
9271   else
9272     mysql_mutex_assert_owner(&LOCK_binlog_end_pos);
9273   atomic_binlog_end_pos = m_binlog_file->position();
9274   signal_update();
9275   if (need_lock) unlock_binlog_end_pos();
9276 }
9277 
update_binlog_end_pos(const char * file,my_off_t pos)9278 inline void MYSQL_BIN_LOG::update_binlog_end_pos(const char *file,
9279                                                  my_off_t pos) {
9280   lock_binlog_end_pos();
9281   if (is_active(file) && (pos > atomic_binlog_end_pos))
9282     atomic_binlog_end_pos = pos;
9283   signal_update();
9284   unlock_binlog_end_pos();
9285 }
9286 
is_binlog_cache_empty(bool is_transactional) const9287 bool THD::is_binlog_cache_empty(bool is_transactional) const {
9288   DBUG_TRACE;
9289 
9290   // If opt_bin_log==0, it is not safe to call thd_get_cache_mngr
9291   // because binlog_hton has not been completely set up.
9292   DBUG_ASSERT(opt_bin_log);
9293   binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(this);
9294 
9295   // cache_mngr is NULL until we call thd->binlog_setup_trx_data, so
9296   // we assert that this has been done.
9297   DBUG_ASSERT(cache_mngr != nullptr);
9298 
9299   binlog_cache_data *cache_data =
9300       cache_mngr->get_binlog_cache_data(is_transactional);
9301   DBUG_ASSERT(cache_data != nullptr);
9302 
9303   return cache_data->is_binlog_empty();
9304 }
9305 
9306 /*
9307   These functions are placed in this file since they need access to
9308   binlog_hton, which has internal linkage.
9309 */
9310 
binlog_setup_trx_data()9311 int THD::binlog_setup_trx_data() {
9312   DBUG_TRACE;
9313   binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(this);
9314 
9315   if (cache_mngr) return 0;  // Already set up
9316 
9317   cache_mngr = (binlog_cache_mngr *)my_malloc(key_memory_binlog_cache_mngr,
9318                                               sizeof(binlog_cache_mngr),
9319                                               MYF(MY_ZEROFILL));
9320   if (!cache_mngr) {
9321     return 1;  // Didn't manage to set it up
9322   }
9323 
9324   cache_mngr = new (cache_mngr)
9325       binlog_cache_mngr(&binlog_stmt_cache_use, &binlog_stmt_cache_disk_use,
9326                         &binlog_cache_use, &binlog_cache_disk_use);
9327   if (cache_mngr->init()) {
9328     cache_mngr->~binlog_cache_mngr();
9329     my_free(cache_mngr);
9330     return 1;
9331   }
9332 
9333   DBUG_PRINT("debug", ("Set ha_data slot %d to 0x%llx", binlog_hton->slot,
9334                        (ulonglong)cache_mngr));
9335   thd_set_ha_data(this, binlog_hton, cache_mngr);
9336 
9337   return 0;
9338 }
9339 
9340 /**
9341 
9342 */
register_binlog_handler(THD * thd,bool trx)9343 void register_binlog_handler(THD *thd, bool trx) {
9344   DBUG_TRACE;
9345   /*
9346     If this is the first call to this function while processing a statement,
9347     the transactional cache does not have a savepoint defined. So, in what
9348     follows:
9349       . an implicit savepoint is defined;
9350       . callbacks are registered;
9351       . binary log is set as read/write.
9352 
9353     The savepoint allows for truncating the trx-cache transactional changes
9354     fail. Callbacks are necessary to flush caches upon committing or rolling
9355     back a statement or a transaction. However, notifications do not happen
9356     if the binary log is set as read/write.
9357   */
9358   binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
9359   if (cache_mngr->trx_cache.get_prev_position() == MY_OFF_T_UNDEF) {
9360     /*
9361       Set an implicit savepoint in order to be able to truncate a trx-cache.
9362     */
9363     my_off_t pos = 0;
9364     binlog_trans_log_savepos(thd, &pos);
9365     cache_mngr->trx_cache.set_prev_position(pos);
9366 
9367     /*
9368       Set callbacks in order to be able to call commmit or rollback.
9369     */
9370     if (trx) trans_register_ha(thd, true, binlog_hton, nullptr);
9371     trans_register_ha(thd, false, binlog_hton, nullptr);
9372 
9373     /*
9374       Set the binary log as read/write otherwise callbacks are not called.
9375     */
9376     thd->get_ha_data(binlog_hton->slot)->ha_info[0].set_trx_read_write();
9377   }
9378 }
9379 
9380 /**
9381   Function to start a statement and optionally a transaction for the
9382   binary log.
9383 
9384   This function does three things:
9385     - Starts a transaction if not in autocommit mode or if a BEGIN
9386       statement has been seen.
9387 
9388     - Start a statement transaction to allow us to truncate the cache.
9389 
9390     - Save the currrent binlog position so that we can roll back the
9391       statement by truncating the cache.
9392 
9393       We only update the saved position if the old one was undefined,
9394       the reason is that there are some cases (e.g., for CREATE-SELECT)
9395       where the position is saved twice (e.g., both in
9396       Query_result_create::prepare() and THD::binlog_write_table_map()), but
9397       we should use the first. This means that calls to this function
9398       can be used to start the statement before the first table map
9399       event, to include some extra events.
9400 
9401   Note however that IMMEDIATE_LOGGING implies that the statement is
9402   written without BEGIN/COMMIT.
9403 
9404   @param thd         Thread variable
9405   @param start_event The first event requested to be written into the
9406                      binary log
9407  */
binlog_start_trans_and_stmt(THD * thd,Log_event * start_event)9408 static int binlog_start_trans_and_stmt(THD *thd, Log_event *start_event) {
9409   DBUG_TRACE;
9410 
9411   /*
9412     Initialize the cache manager if this was not done yet.
9413   */
9414   if (thd->binlog_setup_trx_data()) return 1;
9415 
9416   /*
9417     Retrieve the appropriated cache.
9418   */
9419   bool is_transactional = start_event->is_using_trans_cache();
9420   binlog_cache_mngr *cache_mngr = thd_get_cache_mngr(thd);
9421   binlog_cache_data *cache_data =
9422       cache_mngr->get_binlog_cache_data(is_transactional);
9423 
9424   /*
9425     If the event is requesting immediatly logging, there is no need to go
9426     further down and set savepoint and register callbacks.
9427   */
9428   if (start_event->is_using_immediate_logging()) return 0;
9429 
9430   register_binlog_handler(thd, thd->in_multi_stmt_transaction_mode());
9431 
9432   /* Transactional DDL is logged traditionally without BEGIN. */
9433   if (is_atomic_ddl_event(start_event)) return 0;
9434 
9435   /*
9436     If the cache is empty log "BEGIN" at the beginning of every transaction.
9437     Here, a transaction is either a BEGIN..COMMIT/ROLLBACK block or a single
9438     statement in autocommit mode.
9439   */
9440   if (cache_data->is_binlog_empty()) {
9441     static const char begin[] = "BEGIN";
9442     const char *query = nullptr;
9443     char buf[XID::ser_buf_size];
9444     char xa_start[sizeof("XA START") + 1 + sizeof(buf)];
9445     XID_STATE *xs = thd->get_transaction()->xid_state();
9446     int qlen = sizeof(begin) - 1;
9447 
9448     if (is_transactional && xs->has_state(XID_STATE::XA_ACTIVE)) {
9449       /*
9450         XA-prepare logging case.
9451       */
9452       qlen = sprintf(xa_start, "XA START %s", xs->get_xid()->serialize(buf));
9453       query = xa_start;
9454     } else {
9455       /*
9456         Regular transaction case.
9457       */
9458       query = begin;
9459     }
9460 
9461     Query_log_event qinfo(thd, query, qlen, is_transactional, false, true, 0,
9462                           true);
9463     if (cache_data->write_event(&qinfo)) return 1;
9464   }
9465 
9466   return 0;
9467 }
9468 
9469 /**
9470   This function writes a table map to the binary log.
9471   Note that in order to keep the signature uniform with related methods,
9472   we use a redundant parameter to indicate whether a transactional table
9473   was changed or not.
9474   Sometimes it will write a Rows_query_log_event into binary log before
9475   the table map too.
9476 
9477   @param table             a pointer to the table.
9478   @param is_transactional  @c true indicates a transactional table,
9479                            otherwise @c false a non-transactional.
9480   @param binlog_rows_query @c true indicates a Rows_query log event
9481                            will be binlogged before table map,
9482                            otherwise @c false indicates it will not
9483                            be binlogged.
9484   @return
9485     nonzero if an error pops up when writing the table map event
9486     or the Rows_query log event.
9487 */
binlog_write_table_map(TABLE * table,bool is_transactional,bool binlog_rows_query)9488 int THD::binlog_write_table_map(TABLE *table, bool is_transactional,
9489                                 bool binlog_rows_query) {
9490   int error;
9491   DBUG_TRACE;
9492   DBUG_PRINT("enter", ("table: %p (%s: #%llu)", table, table->s->table_name.str,
9493                        table->s->table_map_id.id()));
9494 
9495   /* Pre-conditions */
9496   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
9497   DBUG_ASSERT(table->s->table_map_id.is_valid());
9498 
9499   Table_map_log_event the_event(this, table, table->s->table_map_id,
9500                                 is_transactional);
9501 
9502   binlog_start_trans_and_stmt(this, &the_event);
9503 
9504   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(this);
9505 
9506   binlog_cache_data *cache_data =
9507       cache_mngr->get_binlog_cache_data(is_transactional);
9508 
9509   if (binlog_rows_query && this->query().str) {
9510     /* Write the Rows_query_log_event into binlog before the table map */
9511     Rows_query_log_event rows_query_ev(this, this->query().str,
9512                                        this->query().length);
9513     if ((error = cache_data->write_event(&rows_query_ev))) return error;
9514   }
9515 
9516   if ((error = cache_data->write_event(&the_event))) return error;
9517 
9518   binlog_table_maps++;
9519   return 0;
9520 }
9521 
9522 /**
9523   This function retrieves a pending row event from a cache which is
9524   specified through the parameter @c is_transactional. Respectively, when it
9525   is @c true, the pending event is returned from the transactional cache.
9526   Otherwise from the non-transactional cache.
9527 
9528   @param is_transactional  @c true indicates a transactional cache,
9529                            otherwise @c false a non-transactional.
9530   @return
9531     The row event if any.
9532 */
binlog_get_pending_rows_event(bool is_transactional) const9533 Rows_log_event *THD::binlog_get_pending_rows_event(
9534     bool is_transactional) const {
9535   Rows_log_event *rows = nullptr;
9536   binlog_cache_mngr *const cache_mngr = thd_get_cache_mngr(this);
9537 
9538   /*
9539     This is less than ideal, but here's the story: If there is no cache_mngr,
9540     prepare_pending_rows_event() has never been called (since the cache_mngr
9541     is set up there). In that case, we just return NULL.
9542    */
9543   if (cache_mngr) {
9544     binlog_cache_data *cache_data =
9545         cache_mngr->get_binlog_cache_data(is_transactional);
9546 
9547     rows = cache_data->pending();
9548   }
9549   return (rows);
9550 }
9551 
9552 /**
9553    @param db_param    db name c-string to be inserted into alphabetically sorted
9554                 THD::binlog_accessed_db_names list.
9555 
9556                 Note, that space for both the data and the node
9557                 struct are allocated in THD::main_mem_root.
9558                 The list lasts for the top-level query time and is reset
9559                 in @c THD::cleanup_after_query().
9560 */
add_to_binlog_accessed_dbs(const char * db_param)9561 void THD::add_to_binlog_accessed_dbs(const char *db_param) {
9562   char *after_db;
9563   /*
9564     binlog_accessed_db_names list is to maintain the database
9565     names which are referenced in a given command.
9566     Prior to bug 17806014 fix, 'main_mem_root' memory root used
9567     to store this list. The 'main_mem_root' scope is till the end
9568     of the query. Hence it caused increasing memory consumption
9569     problem in big procedures like the ones mentioned below.
9570     Eg: CALL p1() where p1 is having 1,00,000 create and drop tables.
9571     'main_mem_root' is freed only at the end of the command CALL p1()'s
9572     execution. But binlog_accessed_db_names list scope is only till the
9573     individual statements specified the procedure(create/drop statements).
9574     Hence the memory allocated in 'main_mem_root' was left uncleared
9575     until the p1's completion, even though it is not required after
9576     completion of individual statements.
9577 
9578     Instead of using 'main_mem_root' whose scope is complete query execution,
9579     now the memroot is changed to use 'thd->mem_root' whose scope is until the
9580     individual statement in CALL p1(). 'thd->mem_root' is set to
9581     'execute_mem_root' in the context of procedure and it's scope is till the
9582     individual statement in CALL p1() and thd->memroot is equal to
9583     'main_mem_root' in the context of a normal 'top level query'.
9584 
9585     Eg: a) create table t1(i int); => If this function is called while
9586            processing this statement, thd->memroot is equal to &main_mem_root
9587            which will be freed immediately after executing this statement.
9588         b) CALL p1() -> p1 contains create table t1(i int); => If this function
9589            is called while processing create table statement which is inside
9590            a stored procedure, then thd->memroot is equal to 'execute_mem_root'
9591            which will be freed immediately after executing this statement.
9592     In both a and b case, thd->memroot will be freed immediately and will not
9593     increase memory consumption.
9594 
9595     A special case(stored functions/triggers):
9596     Consider the following example:
9597     create function f1(i int) returns int
9598     begin
9599       insert into db1.t1 values (1);
9600       insert into db2.t1 values (2);
9601     end;
9602     When we are processing SELECT f1(), the list should contain db1, db2 names.
9603     Since thd->mem_root contains 'execute_mem_root' in the context of
9604     stored function, the mem root will be freed after adding db1 in
9605     the list and when we are processing the second statement and when we try
9606     to add 'db2' in the db1's list, it will lead to crash as db1's memory
9607     is already freed. To handle this special case, if in_sub_stmt is set
9608     (which is true incase of stored functions/triggers), we use &main_mem_root,
9609     if not set we will use thd->memroot which changes it's value to
9610     'execute_mem_root' or '&main_mem_root' depends on the context.
9611    */
9612   MEM_ROOT *db_mem_root = in_sub_stmt ? &main_mem_root : mem_root;
9613 
9614   if (!binlog_accessed_db_names)
9615     binlog_accessed_db_names = new (db_mem_root) List<char>;
9616 
9617   if (binlog_accessed_db_names->elements > MAX_DBS_IN_EVENT_MTS) {
9618     push_warning_printf(
9619         this, Sql_condition::SL_WARNING, ER_MTS_UPDATED_DBS_GREATER_MAX,
9620         ER_THD(this, ER_MTS_UPDATED_DBS_GREATER_MAX), MAX_DBS_IN_EVENT_MTS);
9621     return;
9622   }
9623 
9624   after_db = strdup_root(db_mem_root, db_param);
9625 
9626   /*
9627      sorted insertion is implemented with first rearranging data
9628      (pointer to char*) of the links and final appending of the least
9629      ordered data to create a new link in the list.
9630   */
9631   if (binlog_accessed_db_names->elements != 0) {
9632     List_iterator<char> it(*get_binlog_accessed_db_names());
9633 
9634     while (it++) {
9635       char *swap = nullptr;
9636       char **ref_cur_db = it.ref();
9637       int cmp = strcmp(after_db, *ref_cur_db);
9638 
9639       DBUG_ASSERT(!swap || cmp < 0);
9640 
9641       if (cmp == 0) {
9642         after_db = nullptr; /* dup to ignore */
9643         break;
9644       } else if (swap || cmp > 0) {
9645         swap = *ref_cur_db;
9646         *ref_cur_db = after_db;
9647         after_db = swap;
9648       }
9649     }
9650   }
9651   if (after_db) binlog_accessed_db_names->push_back(after_db, db_mem_root);
9652 }
9653 
9654 /*
9655   Tells if two (or more) tables have auto_increment columns and we want to
9656   lock those tables with a write lock.
9657 
9658   SYNOPSIS
9659     has_two_write_locked_tables_with_auto_increment
9660       tables        Table list
9661 
9662   NOTES:
9663     Call this function only when you have established the list of all tables
9664     which you'll want to update (including stored functions, triggers, views
9665     inside your statement).
9666 */
9667 
has_write_table_with_auto_increment(TABLE_LIST * tables)9668 static bool has_write_table_with_auto_increment(TABLE_LIST *tables) {
9669   for (TABLE_LIST *table = tables; table; table = table->next_global) {
9670     /* we must do preliminary checks as table->table may be NULL */
9671     if (!table->is_placeholder() && table->table->found_next_number_field &&
9672         (table->lock_descriptor().type >= TL_WRITE_ALLOW_WRITE))
9673       return true;
9674   }
9675 
9676   return false;
9677 }
9678 
9679 /*
9680    checks if we have select tables in the table list and write tables
9681    with auto-increment column.
9682 
9683   SYNOPSIS
9684    has_two_write_locked_tables_with_auto_increment_and_select
9685       tables        Table list
9686 
9687   RETURN VALUES
9688 
9689    -true if the table list has atleast one table with auto-increment column
9690 
9691 
9692          and atleast one table to select from.
9693    -false otherwise
9694 */
9695 
has_write_table_with_auto_increment_and_select(TABLE_LIST * tables)9696 static bool has_write_table_with_auto_increment_and_select(TABLE_LIST *tables) {
9697   bool has_select = false;
9698   bool has_auto_increment_tables = has_write_table_with_auto_increment(tables);
9699   for (TABLE_LIST *table = tables; table; table = table->next_global) {
9700     if (!table->is_placeholder() &&
9701         (table->lock_descriptor().type <= TL_READ_NO_INSERT)) {
9702       has_select = true;
9703       break;
9704     }
9705   }
9706   return (has_select && has_auto_increment_tables);
9707 }
9708 
9709 /*
9710   Tells if there is a table whose auto_increment column is a part
9711   of a compound primary key while is not the first column in
9712   the table definition.
9713 
9714   @param tables Table list
9715 
9716   @return true if the table exists, fais if does not.
9717 */
9718 
has_write_table_auto_increment_not_first_in_pk(TABLE_LIST * tables)9719 static bool has_write_table_auto_increment_not_first_in_pk(TABLE_LIST *tables) {
9720   for (TABLE_LIST *table = tables; table; table = table->next_global) {
9721     /* we must do preliminary checks as table->table may be NULL */
9722     if (!table->is_placeholder() && table->table->found_next_number_field &&
9723         (table->lock_descriptor().type >= TL_WRITE_ALLOW_WRITE) &&
9724         table->table->s->next_number_keypart != 0)
9725       return true;
9726   }
9727 
9728   return false;
9729 }
9730 
9731 /**
9732   Checks if a table has a column with a non-deterministic DEFAULT expression.
9733 */
has_nondeterministic_default(const TABLE * table)9734 static bool has_nondeterministic_default(const TABLE *table) {
9735   return std::any_of(
9736       table->field, table->field + table->s->fields, [](const Field *field) {
9737         return field->m_default_val_expr != nullptr &&
9738                field->m_default_val_expr->get_stmt_unsafe_flags() != 0;
9739       });
9740 }
9741 
9742 /**
9743   Checks if a TABLE_LIST contains a table that has been opened for writing, and
9744   that has a column with a non-deterministic DEFAULT expression.
9745 */
has_write_table_with_nondeterministic_default(const TABLE_LIST * tables)9746 static bool has_write_table_with_nondeterministic_default(
9747     const TABLE_LIST *tables) {
9748   for (const TABLE_LIST *table = tables; table != nullptr;
9749        table = table->next_global) {
9750     /* we must do preliminary checks as table->table may be NULL */
9751     if (!table->is_placeholder() &&
9752         table->lock_descriptor().type >= TL_WRITE_ALLOW_WRITE &&
9753         has_nondeterministic_default(table->table))
9754       return true;
9755   }
9756   return false;
9757 }
9758 
9759 /*
9760   Function to check whether the table in query uses a fulltext parser
9761   plugin or not.
9762 
9763   @param s - table share pointer.
9764 
9765   @retval true - The table uses fulltext parser plugin.
9766   @retval false - Otherwise.
9767 */
fulltext_unsafe_set(TABLE_SHARE * s)9768 static bool inline fulltext_unsafe_set(TABLE_SHARE *s) {
9769   for (unsigned int i = 0; i < s->keys; i++) {
9770     if ((s->key_info[i].flags & HA_USES_PARSER) && s->keys_in_use.is_set(i))
9771       return true;
9772   }
9773   return false;
9774 }
9775 #ifndef DBUG_OFF
get_locked_tables_mode_name(enum_locked_tables_mode locked_tables_mode)9776 const char *get_locked_tables_mode_name(
9777     enum_locked_tables_mode locked_tables_mode) {
9778   switch (locked_tables_mode) {
9779     case LTM_NONE:
9780       return "LTM_NONE";
9781     case LTM_LOCK_TABLES:
9782       return "LTM_LOCK_TABLES";
9783     case LTM_PRELOCKED:
9784       return "LTM_PRELOCKED";
9785     case LTM_PRELOCKED_UNDER_LOCK_TABLES:
9786       return "LTM_PRELOCKED_UNDER_LOCK_TABLES";
9787     default:
9788       return "Unknown table lock mode";
9789   }
9790 }
9791 #endif
9792 
9793 /**
9794   Decide on logging format to use for the statement and issue errors
9795   or warnings as needed.  The decision depends on the following
9796   parameters:
9797 
9798   - The logging mode, i.e., the value of binlog_format.  Can be
9799     statement, mixed, or row.
9800 
9801   - The type of statement.  There are three types of statements:
9802     "normal" safe statements; unsafe statements; and row injections.
9803     An unsafe statement is one that, if logged in statement format,
9804     might produce different results when replayed on the slave (e.g.,
9805     queries with a LIMIT clause).  A row injection is either a BINLOG
9806     statement, or a row event executed by the slave's SQL thread.
9807 
9808   - The capabilities of tables modified by the statement.  The
9809     *capabilities vector* for a table is a set of flags associated
9810     with the table.  Currently, it only includes two flags: *row
9811     capability flag* and *statement capability flag*.
9812 
9813     The row capability flag is set if and only if the engine can
9814     handle row-based logging. The statement capability flag is set if
9815     and only if the table can handle statement-based logging.
9816 
9817   Decision table for logging format
9818   ---------------------------------
9819 
9820   The following table summarizes how the format and generated
9821   warning/error depends on the tables' capabilities, the statement
9822   type, and the current binlog_format.
9823 
9824      Row capable        N NNNNNNNNN YYYYYYYYY YYYYYYYYY
9825      Statement capable  N YYYYYYYYY NNNNNNNNN YYYYYYYYY
9826 
9827      Statement type     * SSSUUUIII SSSUUUIII SSSUUUIII
9828 
9829      binlog_format      * SMRSMRSMR SMRSMRSMR SMRSMRSMR
9830 
9831      Logged format      - SS-S----- -RR-RR-RR SRRSRR-RR
9832      Warning/Error      1 --2732444 5--5--6-- ---7--6--
9833 
9834   Legend
9835   ------
9836 
9837   Row capable:    N - Some table not row-capable, Y - All tables row-capable
9838   Stmt capable:   N - Some table not stmt-capable, Y - All tables stmt-capable
9839   Statement type: (S)afe, (U)nsafe, or Row (I)njection
9840   binlog_format:  (S)TATEMENT, (M)IXED, or (R)OW
9841   Logged format:  (S)tatement or (R)ow
9842   Warning/Error:  Warnings and error messages are as follows:
9843 
9844   1. Error: Cannot execute statement: binlogging impossible since both
9845      row-incapable engines and statement-incapable engines are
9846      involved.
9847 
9848   2. Error: Cannot execute statement: binlogging impossible since
9849      BINLOG_FORMAT = ROW and at least one table uses a storage engine
9850      limited to statement-logging.
9851 
9852   3. Error: Cannot execute statement: binlogging of unsafe statement
9853      is impossible when storage engine is limited to statement-logging
9854      and BINLOG_FORMAT = MIXED.
9855 
9856   4. Error: Cannot execute row injection: binlogging impossible since
9857      at least one table uses a storage engine limited to
9858      statement-logging.
9859 
9860   5. Error: Cannot execute statement: binlogging impossible since
9861      BINLOG_FORMAT = STATEMENT and at least one table uses a storage
9862      engine limited to row-logging.
9863 
9864   6. Error: Cannot execute row injection: binlogging impossible since
9865      BINLOG_FORMAT = STATEMENT.
9866 
9867   7. Warning: Unsafe statement binlogged in statement format since
9868      BINLOG_FORMAT = STATEMENT.
9869 
9870   In addition, we can produce the following error (not depending on
9871   the variables of the decision diagram):
9872 
9873   8. Error: Cannot execute statement: binlogging impossible since more
9874      than one engine is involved and at least one engine is
9875      self-logging.
9876 
9877   9. Error: Users are not allowed to modify the gtid_executed table
9878      explicitly by an XA transaction.
9879 
9880   For each error case above, the statement is prevented from being
9881   logged, we report an error, and roll back the statement.  For
9882   warnings, we set the thd->binlog_flags variable: the warning will be
9883   printed only if the statement is successfully logged.
9884 
9885   @see THD::binlog_query
9886 
9887   @param[in] tables Tables involved in the query
9888 
9889   @retval 0 No error; statement can be logged.
9890   @retval -1 One of the error conditions above applies (1, 2, 4, 5, 6 or 9).
9891 */
9892 
decide_logging_format(TABLE_LIST * tables)9893 int THD::decide_logging_format(TABLE_LIST *tables) {
9894   DBUG_TRACE;
9895   DBUG_PRINT("info", ("query: %s", query().str));
9896   DBUG_PRINT("info", ("variables.binlog_format: %lu", variables.binlog_format));
9897   DBUG_PRINT("info", ("lex->get_stmt_unsafe_flags(): 0x%x",
9898                       lex->get_stmt_unsafe_flags()));
9899 
9900 #if defined(ENABLED_DEBUG_SYNC)
9901   if (!is_attachable_ro_transaction_active())
9902     DEBUG_SYNC(this, "begin_decide_logging_format");
9903 #endif
9904 
9905   reset_binlog_local_stmt_filter();
9906 
9907   /*
9908     We should not decide logging format if the binlog is closed or
9909     binlogging is off, or if the statement is filtered out from the
9910     binlog by filtering rules.
9911   */
9912   if (mysql_bin_log.is_open() && (variables.option_bits & OPTION_BIN_LOG) &&
9913       !(variables.binlog_format == BINLOG_FORMAT_STMT &&
9914         !binlog_filter->db_ok(m_db.str))) {
9915     /*
9916       Compute one bit field with the union of all the engine
9917       capabilities, and one with the intersection of all the engine
9918       capabilities.
9919     */
9920     handler::Table_flags flags_write_some_set = 0;
9921     handler::Table_flags flags_access_some_set = 0;
9922     handler::Table_flags flags_write_all_set =
9923         HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
9924 
9925     /*
9926        If different types of engines are about to be updated.
9927        For example: Innodb and Falcon; Innodb and MyIsam.
9928     */
9929     bool multi_write_engine = false;
9930     /*
9931        If different types of engines are about to be accessed
9932        and any of them is about to be updated. For example:
9933        Innodb and Falcon; Innodb and MyIsam.
9934     */
9935     bool multi_access_engine = false;
9936     /*
9937       Track if statement creates or drops a temporary table
9938       and log in ROW if it does.
9939 */
9940     bool is_create_drop_temp_table = false;
9941     /*
9942        Identifies if a table is changed.
9943     */
9944     bool is_write = false;
9945     /*
9946        A pointer to a previous table that was changed.
9947     */
9948     TABLE *prev_write_table = nullptr;
9949     /*
9950        A pointer to a previous table that was accessed.
9951     */
9952     TABLE *prev_access_table = nullptr;
9953     /*
9954       True if at least one table is transactional.
9955     */
9956     bool write_to_some_transactional_table = false;
9957     /*
9958       True if at least one table is non-transactional.
9959     */
9960     bool write_to_some_non_transactional_table = false;
9961     /*
9962        True if all non-transactional tables that has been updated
9963        are temporary.
9964     */
9965     bool write_all_non_transactional_are_tmp_tables = true;
9966     /**
9967       The number of tables used in the current statement,
9968       that should be replicated.
9969     */
9970     uint replicated_tables_count = 0;
9971     /**
9972       The number of tables written to in the current statement,
9973       that should not be replicated.
9974       A table should not be replicated when it is considered
9975       'local' to a MySQL instance.
9976       Currently, these tables are:
9977       - mysql.slow_log
9978       - mysql.general_log
9979       - mysql.slave_relay_log_info
9980       - mysql.slave_master_info
9981       - mysql.slave_worker_info
9982       - performance_schema.*
9983       - TODO: information_schema.*
9984       In practice, from this list, only performance_schema.* tables
9985       are written to by user queries.
9986     */
9987     uint non_replicated_tables_count = 0;
9988     /**
9989       Indicate whether we alreadly reported a warning
9990       on modifying gtid_executed table.
9991     */
9992     int warned_gtid_executed_table = 0;
9993 #ifndef DBUG_OFF
9994     {
9995       DBUG_PRINT("debug", ("prelocked_mode: %s",
9996                            get_locked_tables_mode_name(locked_tables_mode)));
9997     }
9998 #endif
9999 
10000     if (variables.binlog_format != BINLOG_FORMAT_ROW && tables) {
10001       /*
10002         DML statements that modify a table with an auto_increment column based
10003         on rows selected from a table are unsafe as the order in which the rows
10004         are fetched fron the select tables cannot be determined and may differ
10005         on master and slave.
10006        */
10007       if (has_write_table_with_auto_increment_and_select(tables))
10008         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_WRITE_AUTOINC_SELECT);
10009 
10010       if (has_write_table_auto_increment_not_first_in_pk(tables))
10011         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_AUTOINC_NOT_FIRST);
10012 
10013       /*
10014         A query that modifies autoinc column in sub-statement can make the
10015         master and slave inconsistent.
10016         We can solve these problems in mixed mode by switching to binlogging
10017         if at least one updated table is used by sub-statement
10018        */
10019       if (lex->requires_prelocking() &&
10020           has_write_table_with_auto_increment(lex->first_not_own_table()))
10021         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_AUTOINC_COLUMNS);
10022 
10023       /*
10024         A query that modifies a table with a non-deterministic column default
10025         expression in a substatement, can make the master and the slave
10026         inconsistent. Switch to row logging in mixed mode, and raise a warning
10027         in statement mode.
10028       */
10029       if (lex->requires_prelocking() &&
10030           has_write_table_with_nondeterministic_default(
10031               lex->first_not_own_table()))
10032         lex->set_stmt_unsafe(
10033             LEX::BINLOG_STMT_UNSAFE_DEFAULT_EXPRESSION_IN_SUBSTATEMENT);
10034     }
10035 
10036     /*
10037       Get the capabilities vector for all involved storage engines and
10038       mask out the flags for the binary log.
10039     */
10040     for (TABLE_LIST *table = tables; table; table = table->next_global) {
10041       if (table->is_placeholder()) {
10042         /*
10043           Detect if this is a CREATE TEMPORARY or DROP of a
10044           temporary table. This will be used later in determining whether to
10045           log in ROW or STMT if MIXED replication is being used.
10046         */
10047         if (!is_create_drop_temp_table && !table->table &&
10048             ((lex->sql_command == SQLCOM_CREATE_TABLE &&
10049               (lex->create_info->options & HA_LEX_CREATE_TMP_TABLE)) ||
10050              ((lex->sql_command == SQLCOM_DROP_TABLE ||
10051                lex->sql_command == SQLCOM_TRUNCATE) &&
10052               find_temporary_table(this, table)))) {
10053           is_create_drop_temp_table = true;
10054         }
10055         continue;
10056       }
10057       handler::Table_flags const flags = table->table->file->ha_table_flags();
10058 
10059       DBUG_PRINT("info", ("table: %s; ha_table_flags: 0x%llx",
10060                           table->table_name, flags));
10061 
10062       if (table->table->no_replicate) {
10063         if (!warned_gtid_executed_table) {
10064           warned_gtid_executed_table =
10065               gtid_state->warn_or_err_on_modify_gtid_table(this, table);
10066           /*
10067             Do not allow users to modify the gtid_executed table
10068             explicitly by a XA transaction.
10069           */
10070           if (warned_gtid_executed_table == 2) return -1;
10071         }
10072         /*
10073           The statement uses a table that is not replicated.
10074           The following properties about the table:
10075           - persistent / transient
10076           - transactional / non transactional
10077           - temporary / permanent
10078           - read or write
10079           - multiple engines involved because of this table
10080           are not relevant, as this table is completely ignored.
10081           Because the statement uses a non replicated table,
10082           using STATEMENT format in the binlog is impossible.
10083           Either this statement will be discarded entirely,
10084           or it will be logged (possibly partially) in ROW format.
10085         */
10086         lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_SYSTEM_TABLE);
10087 
10088         if (table->lock_descriptor().type >= TL_WRITE_ALLOW_WRITE) {
10089           non_replicated_tables_count++;
10090           continue;
10091         }
10092       }
10093 
10094       replicated_tables_count++;
10095 
10096       bool trans = table->table->file->has_transactions();
10097 
10098       if (table->lock_descriptor().type >= TL_WRITE_ALLOW_WRITE) {
10099         write_to_some_transactional_table =
10100             write_to_some_transactional_table || trans;
10101 
10102         write_to_some_non_transactional_table =
10103             write_to_some_non_transactional_table || !trans;
10104 
10105         if (prev_write_table &&
10106             prev_write_table->file->ht != table->table->file->ht)
10107           multi_write_engine = true;
10108 
10109         if (table->table->s->tmp_table)
10110           lex->set_stmt_accessed_table(
10111               trans ? LEX::STMT_WRITES_TEMP_TRANS_TABLE
10112                     : LEX::STMT_WRITES_TEMP_NON_TRANS_TABLE);
10113         else
10114           lex->set_stmt_accessed_table(trans
10115                                            ? LEX::STMT_WRITES_TRANS_TABLE
10116                                            : LEX::STMT_WRITES_NON_TRANS_TABLE);
10117 
10118         /*
10119          Non-transactional updates are allowed when row binlog format is
10120          used and all non-transactional tables are temporary.
10121          Binlog format is checked on THD::is_dml_gtid_compatible() method.
10122         */
10123         if (!trans)
10124           write_all_non_transactional_are_tmp_tables =
10125               write_all_non_transactional_are_tmp_tables &&
10126               table->table->s->tmp_table;
10127 
10128         flags_write_all_set &= flags;
10129         flags_write_some_set |= flags;
10130         is_write = true;
10131 
10132         prev_write_table = table->table;
10133 
10134         /*
10135           It should be marked unsafe if a table which uses a fulltext parser
10136           plugin is modified. See also bug#48183.
10137         */
10138         if (!lex->is_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_FULLTEXT_PLUGIN)) {
10139           if (fulltext_unsafe_set(table->table->s))
10140             lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_FULLTEXT_PLUGIN);
10141         }
10142         /*
10143           INSERT...ON DUPLICATE KEY UPDATE on a table with more than one unique
10144           keys can be unsafe. Check for it if the flag is already not marked for
10145           the given statement.
10146         */
10147         if (!lex->is_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_INSERT_TWO_KEYS) &&
10148             lex->sql_command == SQLCOM_INSERT &&
10149             lex->duplicates == DUP_UPDATE) {
10150           uint keys = table->table->s->keys, i = 0, unique_keys = 0;
10151           for (KEY *keyinfo = table->table->s->key_info;
10152                i < keys && unique_keys <= 1; i++, keyinfo++) {
10153             if (keyinfo->flags & HA_NOSAME) unique_keys++;
10154           }
10155           if (unique_keys > 1)
10156             lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_INSERT_TWO_KEYS);
10157         }
10158       }
10159       if (lex->get_using_match()) {
10160         if (fulltext_unsafe_set(table->table->s))
10161           lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_FULLTEXT_PLUGIN);
10162       }
10163 
10164       flags_access_some_set |= flags;
10165 
10166       if (lex->sql_command != SQLCOM_CREATE_TABLE ||
10167           (lex->sql_command == SQLCOM_CREATE_TABLE &&
10168            ((lex->create_info->options & HA_LEX_CREATE_TMP_TABLE) ||
10169             (table->lock_descriptor().type < TL_WRITE_ALLOW_WRITE)))) {
10170         if (table->table->s->tmp_table)
10171           lex->set_stmt_accessed_table(
10172               trans ? LEX::STMT_READS_TEMP_TRANS_TABLE
10173                     : LEX::STMT_READS_TEMP_NON_TRANS_TABLE);
10174         else
10175           lex->set_stmt_accessed_table(trans ? LEX::STMT_READS_TRANS_TABLE
10176                                              : LEX::STMT_READS_NON_TRANS_TABLE);
10177       }
10178 
10179       if (prev_access_table &&
10180           prev_access_table->file->ht != table->table->file->ht)
10181         multi_access_engine = true;
10182 
10183       prev_access_table = table->table;
10184     }
10185     DBUG_ASSERT(!is_write || write_to_some_transactional_table ||
10186                 write_to_some_non_transactional_table);
    /*
      write_all_non_transactional_are_tmp_tables may still be true when no
      non-transactional table was updated, so we fix its value here.
    */
10191     write_all_non_transactional_are_tmp_tables =
10192         write_all_non_transactional_are_tmp_tables &&
10193         write_to_some_non_transactional_table;
10194 
10195     DBUG_PRINT("info", ("flags_write_all_set: 0x%llx", flags_write_all_set));
10196     DBUG_PRINT("info", ("flags_write_some_set: 0x%llx", flags_write_some_set));
10197     DBUG_PRINT("info",
10198                ("flags_access_some_set: 0x%llx", flags_access_some_set));
10199     DBUG_PRINT("info", ("multi_write_engine: %d", multi_write_engine));
10200     DBUG_PRINT("info", ("multi_access_engine: %d", multi_access_engine));
10201 
10202     int error = 0;
10203     int unsafe_flags;
10204 
10205     /*
10206       With transactional data dictionary, CREATE TABLE runs as one statement
10207       in a multi-statement transaction internally. Revert this for the
10208       purposes of determining mixed statement safety.
10209     */
10210     const bool multi_stmt_trans = lex->sql_command != SQLCOM_CREATE_TABLE &&
10211                                   in_multi_stmt_transaction_mode();
10212     bool trans_table = trans_has_updated_trans_table(this);
10213     bool binlog_direct = variables.binlog_direct_non_trans_update;
10214 
10215     if (lex->is_mixed_stmt_unsafe(multi_stmt_trans, binlog_direct, trans_table,
10216                                   tx_isolation))
10217       lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_MIXED_STATEMENT);
10218     else if (multi_stmt_trans && trans_table && !binlog_direct &&
10219              lex->stmt_accessed_table(LEX::STMT_WRITES_NON_TRANS_TABLE))
10220       lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_NONTRANS_AFTER_TRANS);
10221 
    /*
      If more than one engine is involved in the statement and at
      least one is doing its own logging (is *self-logging*), the
      statement cannot be logged atomically, so we generate an error
      rather than allowing the binlog to become corrupt.
    */
10228     if (multi_write_engine && (flags_write_some_set & HA_HAS_OWN_BINLOGGING))
10229       my_error((error = ER_BINLOG_MULTIPLE_ENGINES_AND_SELF_LOGGING_ENGINE),
10230                MYF(0));
10231     else if (multi_access_engine &&
10232              flags_access_some_set & HA_HAS_OWN_BINLOGGING)
10233       lex->set_stmt_unsafe(
10234           LEX::BINLOG_STMT_UNSAFE_MULTIPLE_ENGINES_AND_SELF_LOGGING_ENGINE);
10235 
10236     /* XA is unsafe for statements */
10237     if (is_write &&
10238         !get_transaction()->xid_state()->has_state(XID_STATE::XA_NOTR))
10239       lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_XA);
10240 
10241     DBUG_EXECUTE_IF("make_stmt_only_engines",
10242                     { flags_write_all_set = HA_BINLOG_STMT_CAPABLE; };);
10243 
10244     /* both statement-only and row-only engines involved */
10245     if ((flags_write_all_set &
10246          (HA_BINLOG_STMT_CAPABLE | HA_BINLOG_ROW_CAPABLE)) == 0) {
10247       /*
10248         1. Error: Binary logging impossible since both row-incapable
10249            engines and statement-incapable engines are involved
10250       */
10251       my_error((error = ER_BINLOG_ROW_ENGINE_AND_STMT_ENGINE), MYF(0));
10252     }
10253     /* statement-only engines involved */
10254     else if ((flags_write_all_set & HA_BINLOG_ROW_CAPABLE) == 0) {
10255       if (lex->is_stmt_row_injection()) {
10256         /*
10257           4. Error: Cannot execute row injection since table uses
10258              storage engine limited to statement-logging
10259         */
10260         my_error((error = ER_BINLOG_ROW_INJECTION_AND_STMT_ENGINE), MYF(0));
10261       } else if (variables.binlog_format == BINLOG_FORMAT_ROW &&
10262                  sqlcom_can_generate_row_events(this->lex->sql_command)) {
10263         /*
10264           2. Error: Cannot modify table that uses a storage engine
10265              limited to statement-logging when BINLOG_FORMAT = ROW
10266         */
10267         my_error((error = ER_BINLOG_ROW_MODE_AND_STMT_ENGINE), MYF(0));
10268       } else if (variables.binlog_format == BINLOG_FORMAT_MIXED &&
10269                  ((unsafe_flags = lex->get_stmt_unsafe_flags()) != 0)) {
10270         /*
10271           3. Error: Cannot execute statement: binlogging of unsafe
10272              statement is impossible when storage engine is limited to
10273              statement-logging and BINLOG_FORMAT = MIXED.
10274         */
10275         for (int unsafe_type = 0; unsafe_type < LEX::BINLOG_STMT_UNSAFE_COUNT;
10276              unsafe_type++)
10277           if (unsafe_flags & (1 << unsafe_type))
10278             my_error(
10279                 (error = ER_BINLOG_UNSAFE_AND_STMT_ENGINE), MYF(0),
10280                 ER_THD_NONCONST(current_thd,
10281                                 LEX::binlog_stmt_unsafe_errcode[unsafe_type]));
10282       } else if (is_write &&
10283                  ((unsafe_flags = lex->get_stmt_unsafe_flags()) != 0)) {
10284         /*
10285           7. Warning: Unsafe statement logged as statement due to
10286              binlog_format = STATEMENT
10287         */
10288         binlog_unsafe_warning_flags |= unsafe_flags;
10289         DBUG_PRINT("info", ("Scheduling warning to be issued by "
10290                             "binlog_query: '%s'",
10291                             ER_THD(current_thd, ER_BINLOG_UNSAFE_STATEMENT)));
10292         DBUG_PRINT("info", ("binlog_unsafe_warning_flags: 0x%x",
10293                             binlog_unsafe_warning_flags));
10294       }
10295       /* log in statement format! */
10296     }
10297     /* no statement-only engines */
10298     else {
10299       /* binlog_format = STATEMENT */
10300       if (variables.binlog_format == BINLOG_FORMAT_STMT) {
10301         if (lex->is_stmt_row_injection()) {
10302           /*
10303             6. Error: Cannot execute row injection since
10304                BINLOG_FORMAT = STATEMENT
10305           */
10306           my_error((error = ER_BINLOG_ROW_INJECTION_AND_STMT_MODE), MYF(0));
10307         } else if ((flags_write_all_set & HA_BINLOG_STMT_CAPABLE) == 0 &&
10308                    sqlcom_can_generate_row_events(this->lex->sql_command)) {
10309           /*
10310             5. Error: Cannot modify table that uses a storage engine
10311                limited to row-logging when binlog_format = STATEMENT
10312           */
10313           my_error((error = ER_BINLOG_STMT_MODE_AND_ROW_ENGINE), MYF(0), "");
10314         } else if (is_write &&
10315                    (unsafe_flags = lex->get_stmt_unsafe_flags()) != 0) {
10316           /*
10317             7. Warning: Unsafe statement logged as statement due to
10318                binlog_format = STATEMENT
10319           */
10320           binlog_unsafe_warning_flags |= unsafe_flags;
10321           DBUG_PRINT("info", ("Scheduling warning to be issued by "
10322                               "binlog_query: '%s'",
10323                               ER_THD(current_thd, ER_BINLOG_UNSAFE_STATEMENT)));
10324           DBUG_PRINT("info", ("binlog_unsafe_warning_flags: 0x%x",
10325                               binlog_unsafe_warning_flags));
10326         }
10327         /* log in statement format! */
10328       }
10329       /* No statement-only engines and binlog_format != STATEMENT.
10330          I.e., nothing prevents us from row logging if needed. */
10331       else {
10332         if (lex->is_stmt_unsafe() || lex->is_stmt_row_injection() ||
10333             lex->is_stmt_unsafe_with_mixed_mode() ||
10334             (flags_write_all_set & HA_BINLOG_STMT_CAPABLE) == 0 ||
10335             lex->stmt_accessed_table(LEX::STMT_READS_TEMP_TRANS_TABLE) ||
10336             lex->stmt_accessed_table(LEX::STMT_READS_TEMP_NON_TRANS_TABLE) ||
10337             is_create_drop_temp_table) {
10338 #ifndef DBUG_OFF
10339           int flags = lex->get_stmt_unsafe_flags();
10340           DBUG_PRINT("info", ("setting row format for unsafe statement"));
10341           for (int i = 0; i < Query_tables_list::BINLOG_STMT_UNSAFE_COUNT;
10342                i++) {
10343             if (flags & (1 << i))
10344               DBUG_PRINT(
10345                   "info",
10346                   ("unsafe reason: %s",
10347                    ER_THD_NONCONST(
10348                        current_thd,
10349                        Query_tables_list::binlog_stmt_unsafe_errcode[i])));
10350           }
10351           DBUG_PRINT("info",
10352                      ("is_row_injection=%d", lex->is_stmt_row_injection()));
10353           DBUG_PRINT("info", ("stmt_capable=%llu",
10354                               (flags_write_all_set & HA_BINLOG_STMT_CAPABLE)));
10355           DBUG_PRINT("info", ("lex->is_stmt_unsafe_with_mixed_mode = %d",
10356                               lex->is_stmt_unsafe_with_mixed_mode()));
10357 #endif
10358           /* log in row format! */
10359           set_current_stmt_binlog_format_row_if_mixed();
10360         }
10361       }
10362     }
10363 
10364     if (non_replicated_tables_count > 0) {
10365       if ((replicated_tables_count == 0) || !is_write) {
10366         DBUG_PRINT("info",
10367                    ("decision: no logging, no replicated table affected"));
10368         set_binlog_local_stmt_filter();
10369       } else {
10370         if (!is_current_stmt_binlog_format_row()) {
10371           my_error((error = ER_BINLOG_STMT_MODE_AND_NO_REPL_TABLES), MYF(0));
10372         } else {
10373           clear_binlog_local_stmt_filter();
10374         }
10375       }
10376     } else {
10377       clear_binlog_local_stmt_filter();
10378     }
10379 
10380     if (!error &&
10381         !is_dml_gtid_compatible(write_to_some_transactional_table,
10382                                 write_to_some_non_transactional_table,
10383                                 write_all_non_transactional_are_tmp_tables))
10384       error = 1;
10385 
10386     if (error) {
10387       DBUG_PRINT("info", ("decision: no logging since an error was generated"));
10388       return -1;
10389     }
10390 
10391     if (is_write &&
10392         lex->sql_command != SQLCOM_END /* rows-event applying by slave */) {
10393       /*
10394         Master side of DML in the STMT format events parallelization.
10395         All involving table db:s are stored in a abc-ordered name list.
10396         In case the number of databases exceeds MAX_DBS_IN_EVENT_MTS maximum
10397         the list gathering breaks since it won't be sent to the slave.
10398       */
10399       for (TABLE_LIST *table = tables; table; table = table->next_global) {
10400         if (table->is_placeholder()) continue;
10401 
10402         DBUG_ASSERT(table->table);
10403 
10404         if (table->table->s->is_referenced_by_foreign_key()) {
10405           /*
10406              FK-referenced dbs can't be gathered currently. The following
10407              event will be marked for sequential execution on slave.
10408           */
10409           binlog_accessed_db_names = nullptr;
10410           add_to_binlog_accessed_dbs("");
10411           break;
10412         }
10413         if (!is_current_stmt_binlog_format_row())
10414           add_to_binlog_accessed_dbs(table->db);
10415       }
10416     }
10417     DBUG_PRINT("info",
10418                ("decision: logging in %s format",
10419                 is_current_stmt_binlog_format_row() ? "ROW" : "STATEMENT"));
10420 
10421     if (variables.binlog_format == BINLOG_FORMAT_ROW &&
10422         (lex->sql_command == SQLCOM_UPDATE ||
10423          lex->sql_command == SQLCOM_UPDATE_MULTI ||
10424          lex->sql_command == SQLCOM_DELETE ||
10425          lex->sql_command == SQLCOM_DELETE_MULTI)) {
10426       String table_names;
10427       /*
10428         Generate a warning for UPDATE/DELETE statements that modify a
10429         BLACKHOLE table, as row events are not logged in row format.
10430       */
10431       for (TABLE_LIST *table = tables; table; table = table->next_global) {
10432         if (table->is_placeholder()) continue;
10433         if (table->table->file->ht->db_type == DB_TYPE_BLACKHOLE_DB &&
10434             table->lock_descriptor().type >= TL_WRITE_ALLOW_WRITE) {
10435           table_names.append(table->table_name);
10436           table_names.append(",");
10437         }
10438       }
10439       if (!table_names.is_empty()) {
10440         bool is_update = (lex->sql_command == SQLCOM_UPDATE ||
10441                           lex->sql_command == SQLCOM_UPDATE_MULTI);
10442         /*
10443           Replace the last ',' with '.' for table_names
10444         */
10445         table_names.replace(table_names.length() - 1, 1, ".", 1);
10446         push_warning_printf(
10447             this, Sql_condition::SL_WARNING, WARN_ON_BLOCKHOLE_IN_RBR,
10448             ER_THD(this, WARN_ON_BLOCKHOLE_IN_RBR),
10449             is_update ? "UPDATE" : "DELETE", table_names.c_ptr());
10450       }
10451     }
10452   } else {
10453     DBUG_PRINT(
10454         "info",
10455         ("decision: no logging since "
10456          "mysql_bin_log.is_open() = %d "
10457          "and (options & OPTION_BIN_LOG) = 0x%llx "
10458          "and binlog_format = %lu "
10459          "and binlog_filter->db_ok(db) = %d",
10460          mysql_bin_log.is_open(), (variables.option_bits & OPTION_BIN_LOG),
10461          variables.binlog_format, binlog_filter->db_ok(m_db.str)));
10462 
10463     for (TABLE_LIST *table = tables; table; table = table->next_global) {
10464       if (!table->is_placeholder() && table->table->no_replicate &&
10465           gtid_state->warn_or_err_on_modify_gtid_table(this, table))
10466         break;
10467     }
10468   }
10469 
10470 #if defined(ENABLED_DEBUG_SYNC)
10471   if (!is_attachable_ro_transaction_active())
10472     DEBUG_SYNC(this, "end_decide_logging_format");
10473 #endif
10474 
10475   return 0;
10476 }
10477 
10478 /**
10479   Given that a possible violation of gtid consistency has happened,
10480   checks if gtid-inconsistencies are forbidden by the current value of
10481   ENFORCE_GTID_CONSISTENCY and GTID_MODE. If forbidden, generates
10482   error or warning accordingly.
10483 
10484   @param thd The thread that has issued the GTID-violating statement.
10485 
10486   @param error_code The error code to use, if error or warning is to
10487   be generated.
10488 
10489   @param log_error_code The error code to use, if error message is to
10490   be logged.
10491 
10492   @retval false Error was generated.
10493   @retval true No error was generated (possibly a warning was generated).
10494 */
handle_gtid_consistency_violation(THD * thd,int error_code,int log_error_code)10495 static bool handle_gtid_consistency_violation(THD *thd, int error_code,
10496                                               int log_error_code) {
10497   DBUG_TRACE;
10498 
10499   enum_gtid_type gtid_next_type = thd->variables.gtid_next.type;
10500   global_sid_lock->rdlock();
10501   enum_gtid_consistency_mode gtid_consistency_mode =
10502       get_gtid_consistency_mode();
10503   auto gtid_mode = global_gtid_mode.get();
10504 
10505   DBUG_PRINT("info", ("gtid_next.type=%d gtid_mode=%s "
10506                       "gtid_consistency_mode=%d error=%d query=%s",
10507                       gtid_next_type, Gtid_mode::to_string(gtid_mode),
10508                       gtid_consistency_mode, error_code, thd->query().str));
10509 
10510   /*
10511     GTID violations should generate error if:
10512     - GTID_MODE=ON or ON_PERMISSIVE and GTID_NEXT='AUTOMATIC' (since the
10513       transaction is expected to commit using a GTID), or
10514     - GTID_NEXT='UUID:NUMBER' (since the transaction is expected to
10515       commit usinga GTID), or
10516     - ENFORCE_GTID_CONSISTENCY=ON.
10517   */
10518   if ((gtid_next_type == AUTOMATIC_GTID &&
10519        gtid_mode >= Gtid_mode::ON_PERMISSIVE) ||
10520       gtid_next_type == ASSIGNED_GTID ||
10521       gtid_consistency_mode == GTID_CONSISTENCY_MODE_ON) {
10522     global_sid_lock->unlock();
10523     my_error(error_code, MYF(0));
10524     return false;
10525   } else {
10526     /*
10527       If we are not generating an error, we must increase the counter
10528       of GTID-violating transactions.  This will prevent a concurrent
10529       client from executing a SET GTID_MODE or SET
10530       ENFORCE_GTID_CONSISTENCY statement that would be incompatible
10531       with this transaction.
10532 
10533       If the transaction had already been accounted as a gtid violating
10534       transaction, then don't increment the counters, just issue the
10535       warning below. This prevents calling
10536       begin_automatic_gtid_violating_transaction or
10537       begin_anonymous_gtid_violating_transaction multiple times for the
10538       same transaction, which would make the counter go out of sync.
10539     */
10540     if (!thd->has_gtid_consistency_violation) {
10541       if (gtid_next_type == AUTOMATIC_GTID)
10542         gtid_state->begin_automatic_gtid_violating_transaction();
10543       else {
10544         DBUG_ASSERT(gtid_next_type == ANONYMOUS_GTID);
10545         gtid_state->begin_anonymous_gtid_violating_transaction();
10546       }
10547 
10548       /*
10549         If a transaction generates multiple GTID violation conditions,
10550         it must still only update the counters once.  Hence we use
10551         this per-thread flag to keep track of whether the thread has a
10552         consistency or not.  This function must only be called if the
10553         transaction does not already have a GTID violation.
10554       */
10555       thd->has_gtid_consistency_violation = true;
10556     }
10557 
10558     global_sid_lock->unlock();
10559 
10560     // Generate warning if ENFORCE_GTID_CONSISTENCY = WARN.
10561     if (gtid_consistency_mode == GTID_CONSISTENCY_MODE_WARN) {
10562       // Need to print to log so that replication admin knows when users
10563       // have adjusted their workloads.
10564       LogErr(WARNING_LEVEL, log_error_code);
10565       // Need to print to client so that users can adjust their workload.
10566       push_warning(thd, Sql_condition::SL_WARNING, error_code,
10567                    ER_THD_NONCONST(thd, error_code));
10568     }
10569     return true;
10570   }
10571 }
10572 
is_ddl_gtid_compatible()10573 bool THD::is_ddl_gtid_compatible() {
10574   DBUG_TRACE;
10575 
10576   // If @@session.sql_log_bin has been manually turned off (only
10577   // doable by SUPER), then no problem, we can execute any statement.
10578   if ((variables.option_bits & OPTION_BIN_LOG) == 0 ||
10579       mysql_bin_log.is_open() == false)
10580     return true;
10581 
10582   DBUG_PRINT("info",
10583              ("SQLCOM_CREATE:%d CREATE-TMP:%d SELECT:%d SQLCOM_DROP:%d "
10584               "DROP-TMP:%d trx:%d",
10585               lex->sql_command == SQLCOM_CREATE_TABLE,
10586               (lex->sql_command == SQLCOM_CREATE_TABLE &&
10587                (lex->create_info->options & HA_LEX_CREATE_TMP_TABLE)),
10588               lex->select_lex->fields_list.elements,
10589               lex->sql_command == SQLCOM_DROP_TABLE,
10590               (lex->sql_command == SQLCOM_DROP_TABLE && lex->drop_temporary),
10591               in_multi_stmt_transaction_mode()));
10592 
10593   if (lex->sql_command == SQLCOM_CREATE_TABLE &&
10594       !(lex->create_info->options & HA_LEX_CREATE_TMP_TABLE) &&
10595       lex->select_lex->get_fields_list()->elements) {
10596     if (!(get_default_handlerton(this, lex->create_info->db_type)->flags &
10597           HTON_SUPPORTS_ATOMIC_DDL)) {
10598       /*
10599         CREATE ... SELECT (without TEMPORARY) for engines not supporting atomic
10600         DDL is unsafe because if binlog_format=row it will be logged as a CREATE
10601         TABLE followed by row events, re-executed non-atomically as two
10602         transactions, and then written to the slave's binary log as two separate
10603         transactions with the same GTID.
10604       */
10605       bool ret = handle_gtid_consistency_violation(
10606           this, ER_GTID_UNSAFE_CREATE_SELECT,
10607           ER_RPL_GTID_UNSAFE_STMT_CREATE_SELECT);
10608       return ret;
10609     }
10610   } else if ((lex->sql_command == SQLCOM_CREATE_TABLE &&
10611               (lex->create_info->options & HA_LEX_CREATE_TMP_TABLE) != 0) ||
10612              (lex->sql_command == SQLCOM_DROP_TABLE && lex->drop_temporary)) {
10613     /*
10614       When @@session.binlog_format=statement, [CREATE|DROP] TEMPORARY TABLE
10615       is unsafe to execute inside a transaction or Procedure, because the
10616       [CREATE|DROP] statement on the temporary table will be executed and
10617       written into binary log with a GTID even if the transaction or
10618       Procedure is rolled back.
10619     */
10620     if (variables.binlog_format == BINLOG_FORMAT_STMT &&
10621         (in_multi_stmt_transaction_mode() || in_sub_stmt)) {
10622       bool ret = handle_gtid_consistency_violation(
10623           this, ER_CLIENT_GTID_UNSAFE_CREATE_DROP_TEMP_TABLE_IN_TRX_IN_SBR,
10624           ER_SERVER_GTID_UNSAFE_CREATE_DROP_TEMP_TABLE_IN_TRX_IN_SBR);
10625       return ret;
10626     }
10627   }
10628   return true;
10629 }
10630 
is_dml_gtid_compatible(bool some_transactional_table,bool some_non_transactional_table,bool non_transactional_tables_are_tmp)10631 bool THD::is_dml_gtid_compatible(bool some_transactional_table,
10632                                  bool some_non_transactional_table,
10633                                  bool non_transactional_tables_are_tmp) {
10634   DBUG_TRACE;
10635 
10636   // If @@session.sql_log_bin has been manually turned off (only
10637   // doable by SUPER), then no problem, we can execute any statement.
10638   if ((variables.option_bits & OPTION_BIN_LOG) == 0 ||
10639       mysql_bin_log.is_open() == false)
10640     return true;
10641 
10642   /*
10643     Single non-transactional updates are allowed when not mixed
10644     together with transactional statements within a transaction.
10645     Furthermore, writing to transactional and non-transactional
10646     engines in a single statement is also disallowed.
10647     Multi-statement transactions on non-transactional tables are
10648     split into single-statement transactions when
10649     GTID_NEXT = "AUTOMATIC".
10650 
10651     Non-transactional updates are allowed when row binlog format is
10652     used and all non-transactional tables are temporary.
10653 
10654     The debug symbol "allow_gtid_unsafe_non_transactional_updates"
10655     disables the error.  This is useful because it allows us to run
10656     old tests that were not written with the restrictions of GTIDs in
10657     mind.
10658   */
10659   DBUG_PRINT("info", ("some_non_transactional_table=%d "
10660                       "some_transactional_table=%d "
10661                       "trans_has_updated_trans_table=%d "
10662                       "non_transactional_tables_are_tmp=%d "
10663                       "is_current_stmt_binlog_format_row=%d",
10664                       some_non_transactional_table, some_transactional_table,
10665                       trans_has_updated_trans_table(this),
10666                       non_transactional_tables_are_tmp,
10667                       is_current_stmt_binlog_format_row()));
10668   if (some_non_transactional_table &&
10669       (some_transactional_table || trans_has_updated_trans_table(this)) &&
10670       !(non_transactional_tables_are_tmp &&
10671         is_current_stmt_binlog_format_row()) &&
10672       !DBUG_EVALUATE_IF("allow_gtid_unsafe_non_transactional_updates", 1, 0)) {
10673     return handle_gtid_consistency_violation(
10674         this, ER_GTID_UNSAFE_NON_TRANSACTIONAL_TABLE,
10675         ER_RPL_GTID_UNSAFE_STMT_ON_NON_TRANS_TABLE);
10676   }
10677 
10678   return true;
10679 }
10680 
10681 /*
10682   Implementation of interface to write rows to the binary log through the
10683   thread.  The thread is responsible for writing the rows it has
10684   inserted/updated/deleted.
10685 */
10686 
/*
  Template member function for ensuring that there is a rows log
  event of the appropriate type before proceeding.

  PRE CONDITION:
    - Events of type 'RowsEventT' have the type code 'type_code'.

  POST CONDITION:
    If a non-NULL pointer is returned, the pending event for thread 'thd' will
    be an event of type 'RowsEventT' (which has the type code 'type_code')
    and will either be empty or have enough space to hold 'needed' bytes.  In
    addition, the columns bitmap will be correct for the row, meaning that
    the pending event will be flushed if the columns in the event differ from
    the columns supplied to the function.

  RETURNS
    If no error, a non-NULL pending event (either one which already existed or
    the newly created one).
    If error, NULL.
 */
10707 
10708 template <class RowsEventT>
binlog_prepare_pending_rows_event(TABLE * table,uint32 serv_id,size_t needed,bool is_transactional,const unsigned char * extra_row_info,uint32 source_part_id)10709 Rows_log_event *THD::binlog_prepare_pending_rows_event(
10710     TABLE *table, uint32 serv_id, size_t needed, bool is_transactional,
10711     const unsigned char *extra_row_info, uint32 source_part_id) {
10712   DBUG_TRACE;
10713 
10714   DBUG_EXECUTE_IF("simulate_null_pending_rows_event", { return nullptr; });
10715 
10716   /* Fetch the type code for the RowsEventT template parameter */
10717   int const general_type_code = RowsEventT::TYPE_CODE;
10718 
10719   partition_info *part_info = table->part_info;
10720   auto part_id = get_rpl_part_id(part_info);
10721 
10722   Rows_log_event *pending = binlog_get_pending_rows_event(is_transactional);
10723 
10724   if (unlikely(pending && !pending->is_valid())) return nullptr;
10725 
10726   /*
10727     Check if the current event is non-NULL and a write-rows
10728     event. Also check if the table provided is mapped: if it is not,
10729     then we have switched to writing to a new table.
10730     If there is no pending event, we need to create one. If there is a pending
10731     event, but it's not about the same table id, or not of the same type
10732     (between Write, Update and Delete), or not the same affected columns, or
10733     going to be too big, flush this event to disk and create a new pending
10734     event.
10735 
10736     We do not need to check that the pending event and the new event
10737     have the same setting for partial json updates, because
10738     partialness of json can only be changed outside transactions.
10739   */
10740   if (!pending || pending->server_id != serv_id ||
10741       pending->get_table_id() != table->s->table_map_id ||
10742       pending->get_general_type_code() != general_type_code ||
10743       pending->get_data_size() + needed > binlog_row_event_max_size ||
10744       pending->read_write_bitmaps_cmp(table) == false ||
10745       !(pending->m_extra_row_info.compare_extra_row_info(
10746           extra_row_info, part_id, source_part_id))) {
10747     /* Create a new RowsEventT... */
10748     Rows_log_event *const ev = new RowsEventT(
10749         this, table, table->s->table_map_id, is_transactional, extra_row_info);
10750     if (unlikely(!ev)) return nullptr;
10751     ev->server_id = serv_id;  // I don't like this, it's too easy to forget.
10752     /*
10753       flush the pending event and replace it with the newly created
10754       event...
10755     */
10756     if (unlikely(mysql_bin_log.flush_and_set_pending_rows_event(
10757             this, ev, is_transactional))) {
10758       delete ev;
10759       return nullptr;
10760     }
10761 
10762     return ev; /* This is the new pending event */
10763   }
10764   return pending; /* This is the current pending event */
10765 }
10766 
10767 /* Declare in unnamed namespace. */
10768 namespace {
10769 
/**
   Class to handle temporary allocation of memory for row data.

   The responsibility of the class is to provide memory for
   packing one or two rows of packed data (depending on what
   constructor is called).

   In order to make the allocation more efficient for rows without blobs,
   a pointer to the allocated memory is stored in the table structure
   for such rows.  If memory for a table containing a blob field
   is requested, only memory for that is allocated, and subsequently
   released when the object is destroyed.

 */
10784 class Row_data_memory {
10785  public:
10786   /**
10787     Build an object to keep track of a block-local piece of memory
10788     for storing a row of data.
10789 
10790     @param table
10791     Table where the pre-allocated memory is stored.
10792 
10793     @param data
10794     Pointer to the table record.
10795    */
Row_data_memory(TABLE * table,const uchar * data)10796   Row_data_memory(TABLE *table, const uchar *data) : m_memory(nullptr) {
10797 #ifndef DBUG_OFF
10798     m_alloc_checked = false;
10799 #endif
10800     allocate_memory(table, max_row_length(table, data));
10801     m_ptr[0] = has_memory() ? m_memory : nullptr;
10802     m_ptr[1] = nullptr;
10803   }
10804 
Row_data_memory(TABLE * table,const uchar * data1,const uchar * data2,ulonglong value_options=0)10805   Row_data_memory(TABLE *table, const uchar *data1, const uchar *data2,
10806                   ulonglong value_options = 0)
10807       : m_memory(nullptr) {
10808 #ifndef DBUG_OFF
10809     m_alloc_checked = false;
10810 #endif
10811     size_t len1 = max_row_length(table, data1);
10812     size_t len2 = max_row_length(table, data2, value_options);
10813     allocate_memory(table, len1 + len2);
10814     m_ptr[0] = has_memory() ? m_memory : nullptr;
10815     m_ptr[1] = has_memory() ? m_memory + len1 : nullptr;
10816   }
10817 
~Row_data_memory()10818   ~Row_data_memory() {
10819     if (m_memory != nullptr && m_release_memory_on_destruction)
10820       my_free(m_memory);
10821   }
10822 
10823   /**
10824      Is there memory allocated?
10825 
10826      @retval true There is memory allocated
10827      @retval false Memory allocation failed
10828    */
has_memory() const10829   bool has_memory() const {
10830 #ifndef DBUG_OFF
10831     m_alloc_checked = true;
10832 #endif
10833     return m_memory != nullptr;
10834   }
10835 
slot(uint s)10836   uchar *slot(uint s) {
10837     DBUG_ASSERT(s < sizeof(m_ptr) / sizeof(*m_ptr));
10838     DBUG_ASSERT(m_ptr[s] != nullptr);
10839     DBUG_ASSERT(m_alloc_checked == true);
10840     return m_ptr[s];
10841   }
10842 
10843  private:
10844   /**
10845     Compute an upper bound on the amount of memory needed.
10846 
10847     This may return an over-approximation.
10848 
10849     @param table The table
10850     @param data The server's row record.
10851     @param value_options The value of @@global.binlog_row_value_options
10852   */
max_row_length(TABLE * table,const uchar * data,ulonglong value_options=0)10853   size_t max_row_length(TABLE *table, const uchar *data,
10854                         ulonglong value_options = 0) {
10855     TABLE_SHARE *table_s = table->s;
10856     Replicated_columns_view fields{table, Replicated_columns_view::OUTBOUND};
10857     /*
10858       The server stores rows using "records".  A record is a sequence of bytes
10859       which contains values or pointers to values for all fields (columns).  The
10860       server uses table_s->reclength bytes for a row record.
10861 
10862       The layout of a record is roughly:
10863 
10864       - N+1+B bits, packed into CEIL((N+1+B)/8) bytes, where N is the number of
10865         nullable columns in the table, and B is the sum of the number of bits of
10866         all BIT columns.
10867 
10868       - A sequence of serialized fields, each corresponding to a non-BIT,
10869         non-NULL column in the table.
10870 
10871         For variable-length columns, the first component of the serialized field
10872         is a length, stored using 1, 2, 3, or 4 bytes depending on the maximum
10873         length for the data type.
10874 
10875         For most data types, the next component of the serialized field is the
10876         actual data.  But for for VARCHAR, VARBINARY, TEXT, BLOB, and JSON, the
10877         next component of the serialized field is a serialized pointer,
10878         i.e. sizeof(pointer) bytes, which point to another memory area where the
10879         actual data is stored.
10880 
10881       The layout of a row image in the binary log is roughly:
10882 
10883       - If this is an after-image and partial JSON is enabled, 1 byte containing
10884         value_options.  If the PARTIAL_JSON bit of value_options is set, this is
10885         followed by P bits (the "partial_bits"), packed into CEIL(P) bytes,
10886         where P is the number of JSON columns in the table.
10887 
10888       - M bits (the "null_bits"), packed into CEIL(M) bytes, where M is the
10889         number of columns in the image.
10890 
10891       - A sequence of serialized fields, each corresponding to a non-NULL column
10892         in the row image.
10893 
10894         For variable-length columns, the first component of the serialized field
10895         is a length, stored using 1, 2, 3, or 4 bytes depending on the maximum
10896         length for the data type.
10897 
10898         For most data types, the next component of the serialized field is the
10899         actual field data.  But for JSON fields where the corresponding bit of
10900         the partial_bits is 1, this is a sequence of diffs instead.
10901 
10902       Now we try to use table_s->reclength to estimate how much memory to
10903       allocate for a row image in the binlog.  Due to the differences this will
10904       only be an upper bound.  Notice the differences:
10905 
10906       - The binlog may only include a subset of the fields (the row image),
10907         whereas reclength contains space for all fields.
10908 
10909       - BIT columns are not packed together with NULL bits in the binlog, so up
10910         to 1 more byte per BIT column may be needed.
10911 
10912       - The binlog has a null bit even for non-nullable fields, whereas the
10913         reclength only contains space nullable fields, so the binlog may need up
10914         to CEIL(table_s->fields/8) more bytes.
10915 
10916       - The binlog only has a null bit for fields in the image, whereas the
10917         reclength contains space for all fields.
10918 
10919       - The binlog contains the full blob whereas the record only contains
10920         sizeof(pointer) bytes.
10921 
10922       - The binlog contains value_options and partial_bits.  So this may use up
10923         to 1+CEIL(table_s->fields/8) more bytes.
10924 
10925       - The binlog may contain partial JSON.  This is guaranteed to be smaller
10926         than the size of the full value.
10927 
10928       - There may exist columns that, due to their nature, are not replicated,
10929         for instance, hidden generated columns used for functional indexes.
10930 
10931       For those data types that are not stored using a pointer, the size of the
10932       field in the binary log is at most 2 bytes more than what the field
10933       contributes to in table_s->reclength, because those data types use at most
10934       1 byte for the length and waste less than a byte on extra padding and
10935       extra bits in null_bits or BIT columns.
10936 
10937       For those data types that are stored using a pointer, the size of the
10938       field in the binary log is at most 2 bytes more than what the field
10939       contributes to in table_s->reclength, plus the size of the data.  The size
10940       of the pointer is at least 4 on all supported platforms, so it is bigger
10941       than what is used by partial_bits, value_format, or any waste due to extra
10942       padding and extra bits in null_bits.
10943     */
10944     size_t length = table_s->reclength + 2 * (fields.filtered_size());
10945 
10946     for (uint i = 0; i < table_s->blob_fields; i++) {
10947       if (fields.is_excluded(table_s->blob_field[i])) continue;
10948 
10949       Field *field = table->field[table_s->blob_field[i]];
10950       Field_blob *field_blob = down_cast<Field_blob *>(field);
10951 
10952       if (field_blob->type() == MYSQL_TYPE_JSON &&
10953           (value_options & PARTIAL_JSON_UPDATES) != 0) {
10954         Field_json *field_json = down_cast<Field_json *>(field_blob);
10955         length += field_json->get_diff_vector_and_length(value_options);
10956       } else
10957         length +=
10958             field_blob->get_length(data + field_blob->offset(table->record[0]));
10959     }
10960     return length;
10961   }
10962 
allocate_memory(TABLE * const table,const size_t total_length)10963   void allocate_memory(TABLE *const table, const size_t total_length) {
10964     if (table->s->blob_fields == 0) {
10965       /*
10966         The maximum length of a packed record is less than this
10967         length. We use this value instead of the supplied length
10968         when allocating memory for records, since we don't know how
10969         the memory will be used in future allocations.
10970 
10971         Since table->s->reclength is for unpacked records, we have
10972         to add two bytes for each field, which can potentially be
10973         added to hold the length of a packed field.
10974       */
10975       size_t const maxlen = table->s->reclength + 2 * table->s->fields;
10976 
10977       /*
10978         Allocate memory for two records if memory hasn't been
10979         allocated. We allocate memory for two records so that it can
10980         be used when processing update rows as well.
10981       */
10982       if (table->write_row_record == nullptr)
10983         table->write_row_record = (uchar *)table->mem_root.Alloc(2 * maxlen);
10984       m_memory = table->write_row_record;
10985       m_release_memory_on_destruction = false;
10986     } else {
10987       m_memory = (uchar *)my_malloc(key_memory_Row_data_memory_memory,
10988                                     total_length, MYF(MY_WME));
10989       m_release_memory_on_destruction = true;
10990     }
10991   }
10992 
#ifndef DBUG_OFF
  /// Debug builds only: set to true by has_memory() and asserted by slot().
  mutable bool m_alloc_checked;
#endif
  /// Whether the destructor should my_free() m_memory; set by
  /// allocate_memory().
  bool m_release_memory_on_destruction;
  /// Start of the row buffer; has_memory() reports whether it is non-null.
  uchar *m_memory;
  /// Start of each row slot inside m_memory, as returned by slot().
  uchar *m_ptr[2];
10999 };
11000 
11001 }  // namespace
11002 
binlog_write_row(TABLE * table,bool is_trans,uchar const * record,const unsigned char * extra_row_info)11003 int THD::binlog_write_row(TABLE *table, bool is_trans, uchar const *record,
11004                           const unsigned char *extra_row_info) {
11005   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
11006 
11007   /*
11008     Pack records into format for transfer. We are allocating more
11009     memory than needed, but that doesn't matter.
11010   */
11011   Row_data_memory memory(table, record);
11012   if (!memory.has_memory()) return HA_ERR_OUT_OF_MEM;
11013 
11014   uchar *row_data = memory.slot(0);
11015 
11016   size_t const len = pack_row(table, table->write_set, row_data, record,
11017                               enum_row_image_type::WRITE_AI);
11018 
11019   Rows_log_event *const ev =
11020       binlog_prepare_pending_rows_event<Write_rows_log_event>(
11021           table, server_id, len, is_trans, extra_row_info);
11022 
11023   if (unlikely(ev == nullptr)) return HA_ERR_OUT_OF_MEM;
11024 
11025   return ev->add_row_data(row_data, len);
11026 }
11027 
binlog_update_row(TABLE * table,bool is_trans,const uchar * before_record,const uchar * after_record,const unsigned char * extra_row_info)11028 int THD::binlog_update_row(TABLE *table, bool is_trans,
11029                            const uchar *before_record,
11030                            const uchar *after_record,
11031                            const unsigned char *extra_row_info) {
11032   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
11033   int error = 0;
11034 
11035   /**
11036     Save a reference to the original read and write set bitmaps.
11037     We will need this to restore the bitmaps at the end.
11038    */
11039   MY_BITMAP *old_read_set = table->read_set;
11040   MY_BITMAP *old_write_set = table->write_set;
11041 
11042   /**
11043      This will remove spurious fields required during execution but
11044      not needed for binlogging. This is done according to the:
11045      binlog-row-image option.
11046    */
11047   binlog_prepare_row_images(this, table);
11048 
11049   Row_data_memory row_data(table, before_record, after_record,
11050                            variables.binlog_row_value_options);
11051   if (!row_data.has_memory()) return HA_ERR_OUT_OF_MEM;
11052 
11053   uchar *before_row = row_data.slot(0);
11054   uchar *after_row = row_data.slot(1);
11055 
11056   size_t const before_size =
11057       pack_row(table, table->read_set, before_row, before_record,
11058                enum_row_image_type::UPDATE_BI);
11059   size_t const after_size = pack_row(
11060       table, table->write_set, after_row, after_record,
11061       enum_row_image_type::UPDATE_AI, variables.binlog_row_value_options);
11062 
11063   DBUG_DUMP("before_record", before_record, table->s->reclength);
11064   DBUG_DUMP("after_record", after_record, table->s->reclength);
11065   DBUG_DUMP("before_row", before_row, before_size);
11066   DBUG_DUMP("after_row", after_row, after_size);
11067 
11068   partition_info *part_info = table->part_info;
11069   uint32 source_part_id = binary_log::Rows_event::Extra_row_info::UNDEFINED;
11070   if (part_info) {
11071     uint32 new_part_id = binary_log::Rows_event::Extra_row_info::UNDEFINED;
11072     longlong func_value = 0;
11073     get_parts_for_update(before_record, after_record, table->record[0],
11074                          part_info, &source_part_id, &new_part_id, &func_value);
11075   }
11076 
11077   Rows_log_event *const ev =
11078       binlog_prepare_pending_rows_event<Update_rows_log_event>(
11079           table, server_id, before_size + after_size, is_trans, extra_row_info,
11080           source_part_id);
11081 
11082   if (unlikely(ev == nullptr)) return HA_ERR_OUT_OF_MEM;
11083 
11084   if (part_info) {
11085     ev->m_extra_row_info.set_source_partition_id(source_part_id);
11086   }
11087 
11088   error = ev->add_row_data(before_row, before_size) ||
11089           ev->add_row_data(after_row, after_size);
11090 
11091   /* restore read/write set for the rest of execution */
11092   table->column_bitmaps_set_no_signal(old_read_set, old_write_set);
11093 
11094   bitmap_clear_all(&table->tmp_set);
11095 
11096   return error;
11097 }
11098 
binlog_delete_row(TABLE * table,bool is_trans,uchar const * record,const unsigned char * extra_row_info)11099 int THD::binlog_delete_row(TABLE *table, bool is_trans, uchar const *record,
11100                            const unsigned char *extra_row_info) {
11101   DBUG_ASSERT(is_current_stmt_binlog_format_row() && mysql_bin_log.is_open());
11102   int error = 0;
11103 
11104   /**
11105     Save a reference to the original read and write set bitmaps.
11106     We will need this to restore the bitmaps at the end.
11107    */
11108   MY_BITMAP *old_read_set = table->read_set;
11109   MY_BITMAP *old_write_set = table->write_set;
11110 
11111   /**
11112      This will remove spurious fields required during execution but
11113      not needed for binlogging. This is done according to the:
11114      binlog-row-image option.
11115    */
11116   binlog_prepare_row_images(this, table);
11117 
11118   /*
11119      Pack records into format for transfer. We are allocating more
11120      memory than needed, but that doesn't matter.
11121   */
11122   Row_data_memory memory(table, record);
11123   if (unlikely(!memory.has_memory())) return HA_ERR_OUT_OF_MEM;
11124 
11125   uchar *row_data = memory.slot(0);
11126 
11127   DBUG_DUMP("table->read_set", (uchar *)table->read_set->bitmap,
11128             (table->s->fields + 7) / 8);
11129   size_t const len = pack_row(table, table->read_set, row_data, record,
11130                               enum_row_image_type::DELETE_BI);
11131 
11132   Rows_log_event *const ev =
11133       binlog_prepare_pending_rows_event<Delete_rows_log_event>(
11134           table, server_id, len, is_trans, extra_row_info);
11135 
11136   if (unlikely(ev == nullptr)) return HA_ERR_OUT_OF_MEM;
11137 
11138   error = ev->add_row_data(row_data, len);
11139 
11140   /* restore read/write set for the rest of execution */
11141   table->column_bitmaps_set_no_signal(old_read_set, old_write_set);
11142 
11143   bitmap_clear_all(&table->tmp_set);
11144   return error;
11145 }
11146 
binlog_prepare_row_images(const THD * thd,TABLE * table)11147 void binlog_prepare_row_images(const THD *thd, TABLE *table) {
11148   DBUG_TRACE;
11149   /**
11150     Remove from read_set spurious columns. The write_set has been
11151     handled before in table->mark_columns_needed_for_update.
11152    */
11153 
11154   DBUG_PRINT_BITSET("debug", "table->read_set (before preparing): %s",
11155                     table->read_set);
11156 
11157   /**
11158     if there is a primary key in the table (ie, user declared PK or a
11159     non-null unique index) and we dont want to ship the entire image,
11160     and the handler involved supports this.
11161    */
11162   if (table->s->primary_key < MAX_KEY &&
11163       (thd->variables.binlog_row_image < BINLOG_ROW_IMAGE_FULL) &&
11164       !ha_check_storage_engine_flag(table->s->db_type(),
11165                                     HTON_NO_BINLOG_ROW_OPT)) {
11166     /**
11167       Just to be sure that tmp_set is currently not in use as
11168       the read_set already.
11169     */
11170     DBUG_ASSERT(table->read_set != &table->tmp_set);
11171     // Verify it's not used
11172     DBUG_ASSERT(bitmap_is_clear_all(&table->tmp_set));
11173 
11174     switch (thd->variables.binlog_row_image) {
11175       case BINLOG_ROW_IMAGE_MINIMAL:
11176         /* MINIMAL: Mark only PK */
11177         table->mark_columns_used_by_index_no_reset(table->s->primary_key,
11178                                                    &table->tmp_set);
11179         break;
11180       case BINLOG_ROW_IMAGE_NOBLOB:
11181         /**
11182           NOBLOB: Remove unnecessary BLOB fields from read_set
11183                   (the ones that are not part of PK).
11184          */
11185         bitmap_union(&table->tmp_set, table->read_set);
11186         for (Field **ptr = table->field; *ptr; ptr++) {
11187           Field *field = (*ptr);
11188           if ((field->type() == MYSQL_TYPE_BLOB) &&
11189               !field->is_flag_set(PRI_KEY_FLAG))
11190             bitmap_clear_bit(&table->tmp_set, field->field_index());
11191         }
11192         break;
11193       default:
11194         DBUG_ASSERT(0);  // impossible.
11195     }
11196 
11197     /* set the temporary read_set */
11198     table->column_bitmaps_set_no_signal(&table->tmp_set, table->write_set);
11199   }
11200 
11201   DBUG_PRINT_BITSET("debug", "table->read_set (after preparing): %s",
11202                     table->read_set);
11203 }
11204 
binlog_flush_pending_rows_event(bool stmt_end,bool is_transactional)11205 int THD::binlog_flush_pending_rows_event(bool stmt_end, bool is_transactional) {
11206   DBUG_TRACE;
11207   /*
11208     We shall flush the pending event even if we are not in row-based
11209     mode: it might be the case that we left row-based mode before
11210     flushing anything (e.g., if we have explicitly locked tables).
11211    */
11212   if (!mysql_bin_log.is_open()) return 0;
11213 
11214   /*
11215     Mark the event as the last event of a statement if the stmt_end
11216     flag is set.
11217   */
11218   int error = 0;
11219   if (Rows_log_event *pending =
11220           binlog_get_pending_rows_event(is_transactional)) {
11221     if (stmt_end) {
11222       pending->set_flags(Rows_log_event::STMT_END_F);
11223       binlog_table_maps = 0;
11224     }
11225 
11226     error = mysql_bin_log.flush_and_set_pending_rows_event(this, nullptr,
11227                                                            is_transactional);
11228   }
11229 
11230   return error;
11231 }
11232 
11233 #if !defined(DBUG_OFF)
show_query_type(THD::enum_binlog_query_type qtype)11234 static const char *show_query_type(THD::enum_binlog_query_type qtype) {
11235   switch (qtype) {
11236     case THD::ROW_QUERY_TYPE:
11237       return "ROW";
11238     case THD::STMT_QUERY_TYPE:
11239       return "STMT";
11240     case THD::QUERY_TYPE_COUNT:
11241     default:
11242       DBUG_ASSERT(0 <= qtype && qtype < THD::QUERY_TYPE_COUNT);
11243   }
11244   static char buf[64];
11245   sprintf(buf, "UNKNOWN#%d", qtype);
11246   return buf;
11247 }
11248 #endif
11249 
11250 /**
11251   Auxiliary function to reset the limit unsafety warning suppression.
11252 */
reset_binlog_unsafe_suppression()11253 static void reset_binlog_unsafe_suppression() {
11254   DBUG_TRACE;
11255   unsafe_warning_suppression_is_activated = false;
11256   limit_unsafe_warning_count = 0;
11257   limit_unsafe_suppression_start_time = my_getsystime() / 10000000;
11258 }
11259 
11260 /**
11261   Auxiliary function to print warning in the error log.
11262 */
print_unsafe_warning_to_log(int unsafe_type,char * buf,const char * query)11263 static void print_unsafe_warning_to_log(int unsafe_type, char *buf,
11264                                         const char *query) {
11265   DBUG_TRACE;
11266   sprintf(buf, ER_DEFAULT(ER_BINLOG_UNSAFE_STATEMENT),
11267           ER_DEFAULT_NONCONST(LEX::binlog_stmt_unsafe_errcode[unsafe_type]));
11268   LogErr(WARNING_LEVEL, ER_BINLOG_UNSAFE_MESSAGE_AND_STATEMENT, buf, query);
11269 }
11270 
11271 /**
11272   Auxiliary function to check if the warning for limit unsafety should be
11273   thrown or suppressed. Details of the implementation can be found in the
11274   comments inline.
11275 
11276   @param buf         Buffer to hold the warning message text
11277   @param unsafe_type The type of unsafety.
11278   @param query       The actual query statement.
11279 
11280   TODO: Remove this function and implement a general service for all warnings
11281   that would prevent flooding the error log. => switch to log_throttle class?
11282 */
do_unsafe_limit_checkout(char * buf,int unsafe_type,const char * query)11283 static void do_unsafe_limit_checkout(char *buf, int unsafe_type,
11284                                      const char *query) {
11285   ulonglong now;
11286   DBUG_TRACE;
11287   DBUG_ASSERT(unsafe_type == LEX::BINLOG_STMT_UNSAFE_LIMIT);
11288   limit_unsafe_warning_count++;
11289   /*
11290     INITIALIZING:
11291     If this is the first time this function is called with log warning
11292     enabled, the monitoring the unsafe warnings should start.
11293   */
11294   if (limit_unsafe_suppression_start_time == 0) {
11295     limit_unsafe_suppression_start_time = my_getsystime() / 10000000;
11296     print_unsafe_warning_to_log(unsafe_type, buf, query);
11297   } else {
11298     if (!unsafe_warning_suppression_is_activated)
11299       print_unsafe_warning_to_log(unsafe_type, buf, query);
11300 
11301     if (limit_unsafe_warning_count >=
11302         LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT) {
11303       now = my_getsystime() / 10000000;
11304       if (!unsafe_warning_suppression_is_activated) {
11305         /*
11306           ACTIVATION:
11307           We got LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT warnings in
11308           less than LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT we activate the
11309           suppression.
11310         */
11311         if ((now - limit_unsafe_suppression_start_time) <=
11312             LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT) {
11313           unsafe_warning_suppression_is_activated = true;
11314           DBUG_PRINT("info", ("A warning flood has been detected and the limit \
11315 unsafety warning suppression has been activated."));
11316         } else {
11317           /*
11318            there is no flooding till now, therefore we restart the monitoring
11319           */
11320           limit_unsafe_suppression_start_time = my_getsystime() / 10000000;
11321           limit_unsafe_warning_count = 0;
11322         }
11323       } else {
11324         /*
11325           Print the suppression note and the unsafe warning.
11326         */
11327         LogErr(INFORMATION_LEVEL, ER_BINLOG_WARNING_SUPPRESSED,
11328                limit_unsafe_warning_count,
11329                (int)(now - limit_unsafe_suppression_start_time));
11330         print_unsafe_warning_to_log(unsafe_type, buf, query);
11331         /*
11332           DEACTIVATION: We got LIMIT_UNSAFE_WARNING_ACTIVATION_THRESHOLD_COUNT
11333           warnings in more than  LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT, the
11334           suppression should be deactivated.
11335         */
11336         if ((now - limit_unsafe_suppression_start_time) >
11337             LIMIT_UNSAFE_WARNING_ACTIVATION_TIMEOUT) {
11338           reset_binlog_unsafe_suppression();
11339           DBUG_PRINT("info", ("The limit unsafety warning supression has been \
11340 deactivated"));
11341         }
11342       }
11343       limit_unsafe_warning_count = 0;
11344     }
11345   }
11346 }
11347 
11348 /**
11349   Auxiliary method used by @c binlog_query() to raise warnings.
11350 
11351   The type of warning and the type of unsafeness is stored in
11352   THD::binlog_unsafe_warning_flags.
11353 */
issue_unsafe_warnings()11354 void THD::issue_unsafe_warnings() {
11355   char buf[MYSQL_ERRMSG_SIZE * 2];
11356   DBUG_TRACE;
11357   /*
11358     Ensure that binlog_unsafe_warning_flags is big enough to hold all
11359     bits.  This is actually a constant expression.
11360   */
11361   DBUG_ASSERT(LEX::BINLOG_STMT_UNSAFE_COUNT <=
11362               sizeof(binlog_unsafe_warning_flags) * CHAR_BIT);
11363 
11364   uint32 unsafe_type_flags = binlog_unsafe_warning_flags;
11365 
11366   /*
11367     For each unsafe_type, check if the statement is unsafe in this way
11368     and issue a warning.
11369   */
11370   for (int unsafe_type = 0; unsafe_type < LEX::BINLOG_STMT_UNSAFE_COUNT;
11371        unsafe_type++) {
11372     if ((unsafe_type_flags & (1 << unsafe_type)) != 0) {
11373       push_warning_printf(
11374           this, Sql_condition::SL_NOTE, ER_BINLOG_UNSAFE_STATEMENT,
11375           ER_THD(this, ER_BINLOG_UNSAFE_STATEMENT),
11376           ER_THD_NONCONST(this, LEX::binlog_stmt_unsafe_errcode[unsafe_type]));
11377       if (log_error_verbosity > 1 && opt_log_unsafe_statements) {
11378         if (unsafe_type == LEX::BINLOG_STMT_UNSAFE_LIMIT)
11379           do_unsafe_limit_checkout(buf, unsafe_type, query().str);
11380         else  // cases other than LIMIT unsafety
11381           print_unsafe_warning_to_log(unsafe_type, buf, query().str);
11382       }
11383     }
11384   }
11385 }
11386 
11387 /**
11388   Log the current query.
11389 
11390   The query will be logged in either row format or statement format
11391   depending on the value of @c current_stmt_binlog_format_row field and
11392   the value of the @c qtype parameter.
11393 
11394   This function must be called:
11395 
11396   - After the all calls to ha_*_row() functions have been issued.
11397 
11398   - After any writes to system tables. Rationale: if system tables
11399     were written after a call to this function, and the master crashes
11400     after the call to this function and before writing the system
11401     tables, then the master and slave get out of sync.
11402 
11403   - Before tables are unlocked and closed.
11404 
11405   @see decide_logging_format
11406 
11407   @retval 0 Success
11408 
11409   @retval nonzero If there is a failure when writing the query (e.g.,
11410   write failure), then the error code is returned.
11411 */
binlog_query(THD::enum_binlog_query_type qtype,const char * query_arg,size_t query_len,bool is_trans,bool direct,bool suppress_use,int errcode)11412 int THD::binlog_query(THD::enum_binlog_query_type qtype, const char *query_arg,
11413                       size_t query_len, bool is_trans, bool direct,
11414                       bool suppress_use, int errcode) {
11415   DBUG_TRACE;
11416   DBUG_PRINT("enter",
11417              ("qtype: %s  query: '%s'", show_query_type(qtype), query_arg));
11418   DBUG_ASSERT(query_arg && mysql_bin_log.is_open());
11419 
11420   if (get_binlog_local_stmt_filter() == BINLOG_FILTER_SET) {
11421     /*
11422       The current statement is to be ignored, and not written to
11423       the binlog. Do not call issue_unsafe_warnings().
11424     */
11425     return 0;
11426   }
11427 
11428   /*
11429     If we are not in prelocked mode, mysql_unlock_tables() will be
11430     called after this binlog_query(), so we have to flush the pending
11431     rows event with the STMT_END_F set to unlock all tables at the
11432     slave side as well.
11433 
11434     If we are in prelocked mode, the flushing will be done inside the
11435     top-most close_thread_tables().
11436   */
11437   if (this->locked_tables_mode <= LTM_LOCK_TABLES)
11438     if (int error = binlog_flush_pending_rows_event(true, is_trans))
11439       return error;
11440 
11441   /*
11442     Warnings for unsafe statements logged in statement format are
11443     printed in three places instead of in decide_logging_format().
11444     This is because the warnings should be printed only if the statement
11445     is actually logged. When executing decide_logging_format(), we cannot
11446     know for sure if the statement will be logged:
11447 
11448     1 - sp_head::execute_procedure which prints out warnings for calls to
11449     stored procedures.
11450 
11451     2 - sp_head::execute_function which prints out warnings for calls
11452     involving functions.
11453 
11454     3 - THD::binlog_query (here) which prints warning for top level
11455     statements not covered by the two cases above: i.e., if not insided a
11456     procedure and a function.
11457 
11458     Besides, we should not try to print these warnings if it is not
11459     possible to write statements to the binary log as it happens when
11460     the execution is inside a function, or generaly speaking, when
11461     the variables.option_bits & OPTION_BIN_LOG is false.
11462   */
11463   if ((variables.option_bits & OPTION_BIN_LOG) && sp_runtime_ctx == nullptr &&
11464       !binlog_evt_union.do_union)
11465     issue_unsafe_warnings();
11466 
11467   switch (qtype) {
11468       /*
11469         ROW_QUERY_TYPE means that the statement may be logged either in
11470         row format or in statement format.  If
11471         current_stmt_binlog_format is row, it means that the
11472         statement has already been logged in row format and hence shall
11473         not be logged again.
11474       */
11475     case THD::ROW_QUERY_TYPE:
11476       DBUG_PRINT("debug", ("is_current_stmt_binlog_format_row: %d",
11477                            is_current_stmt_binlog_format_row()));
11478       if (is_current_stmt_binlog_format_row()) return 0;
11479       /* Fall through */
11480 
11481       /*
11482         STMT_QUERY_TYPE means that the query must be logged in statement
11483         format; it cannot be logged in row format.  This is typically
11484         used by DDL statements.  It is an error to use this query type
11485         if current_stmt_binlog_format_row is row.
11486 
11487         @todo Currently there are places that call this method with
11488         STMT_QUERY_TYPE and current_stmt_binlog_format is row.  Fix those
11489         places and add assert to ensure correct behavior. /Sven
11490       */
11491     case THD::STMT_QUERY_TYPE:
11492       /*
11493         The MYSQL_BIN_LOG::write() function will set the STMT_END_F flag and
11494         flush the pending rows event if necessary.
11495       */
11496       {
11497         Query_log_event qinfo(this, query_arg, query_len, is_trans, direct,
11498                               suppress_use, errcode);
11499         /*
11500           Binlog table maps will be irrelevant after a Query_log_event
11501           (they are just removed on the slave side) so after the query
11502           log event is written to the binary log, we pretend that no
11503           table maps were written.
11504          */
11505         int error = mysql_bin_log.write_event(&qinfo);
11506         binlog_table_maps = 0;
11507         return error;
11508       }
11509       break;
11510 
11511     case THD::QUERY_TYPE_COUNT:
11512     default:
11513       DBUG_ASSERT(0 <= qtype && qtype < QUERY_TYPE_COUNT);
11514   }
11515   return 0;
11516 }
11517 
/*
  Storage engine descriptor for the binlog pseudo storage engine; its
  address is passed to the "binlog" plugin declaration in this file.
*/
struct st_mysql_storage_engine binlog_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};
11520 
11521 /** @} */
11522 
/*
  Plugin declaration for "binlog": a storage engine plugin backed by
  binlog_storage_engine, with binlog_init and binlog_deinit as its init and
  deinit hooks and no status variables, system variables or config options.
*/
mysql_declare_plugin(binlog){
    MYSQL_STORAGE_ENGINE_PLUGIN,
    &binlog_storage_engine,
    "binlog",
    PLUGIN_AUTHOR_ORACLE,
    "This is a pseudo storage engine to represent the binlog in a transaction",
    PLUGIN_LICENSE_GPL,
    binlog_init,   /* Plugin Init */
    nullptr,       /* Plugin Check uninstall */
    binlog_deinit, /* Plugin Deinit */
    0x0100 /* 1.0 */,
    nullptr, /* status variables                */
    nullptr, /* system variables                */
    nullptr, /* config options                  */
    0,
} mysql_declare_plugin_end;
11539