1 /*
2    Copyright (c) 2012, Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
16 
17 #ifdef USE_PRAGMA_IMPLEMENTATION
18 #pragma implementation  // gcc: Class implementation
19 #endif
20 
21 #define MYSQL_SERVER 1
22 
23 /* The C++ file's header */
24 #include "./ha_rocksdb.h"
25 
26 #ifdef TARGET_OS_LINUX
27 #include <errno.h>
28 #include <sys/resource.h>
29 #include <sys/syscall.h>
30 #include <sys/types.h>
31 #endif
32 
33 /* C++ standard header files */
34 #include <inttypes.h>
35 #include <algorithm>
36 #include <limits>
37 #include <map>
38 #include <queue>
39 #include <set>
40 #include <string>
41 #include <vector>
42 
43 /* MySQL includes */
44 #include "debug_sync.h"
45 #include "my_bit.h"
46 #include "my_stacktrace.h"
47 #include "my_sys.h"
48 #include "mysql/psi/mysql_table.h"
49 #include "mysql/thread_pool_priv.h"
50 #include "mysys_err.h"
51 #include "sql_audit.h"
52 #include "sql_partition.h"
53 #include "sql_table.h"
54 
55 /* RocksDB includes */
56 #include "monitoring/histogram.h"
57 #include "rocksdb/compaction_filter.h"
58 #include "rocksdb/env.h"
59 #include "rocksdb/memory_allocator.h"
60 #include "rocksdb/persistent_cache.h"
61 #include "rocksdb/rate_limiter.h"
62 #include "rocksdb/slice_transform.h"
63 #include "rocksdb/thread_status.h"
64 #include "rocksdb/trace_reader_writer.h"
65 #include "rocksdb/utilities/checkpoint.h"
66 #include "rocksdb/utilities/convenience.h"
67 #include "rocksdb/utilities/memory_util.h"
68 #include "rocksdb/utilities/sim_cache.h"
69 #include "rocksdb/utilities/write_batch_with_index.h"
70 #include "util/stop_watch.h"
71 
72 /* MyRocks includes */
73 #include "./event_listener.h"
74 #include "./ha_rocksdb_proto.h"
75 #include "./ha_rockspart.h"
76 #include "./logger.h"
77 #include "./rdb_cf_manager.h"
78 #include "./rdb_cf_options.h"
79 #include "./rdb_converter.h"
80 #include "./rdb_datadic.h"
81 #include "./rdb_i_s.h"
82 #include "./rdb_index_merge.h"
83 #include "./rdb_mutex_wrapper.h"
84 #include "./rdb_psi.h"
85 #include "./rdb_threads.h"
86 
87 // Internal MySQL APIs not exposed in any header.
88 extern "C" {
89 /**
90  *   Get the user thread's binary logging format
91  *   @param thd  user thread
92  *   @return Value to be used as index into the binlog_format_names array
93  */
94 int thd_binlog_format(const MYSQL_THD thd);
95 
96 /**
97  *   Check if binary logging is filtered for thread's current db.
98  *   @param  thd   Thread handle
99  *   @retval 1 the query is not filtered, 0 otherwise.
100  */
101 bool thd_binlog_filter_ok(const MYSQL_THD thd);
102 }
103 
104 namespace myrocks {
105 
106 static st_global_stats global_stats;
107 static st_export_stats export_stats;
108 static st_memory_stats memory_stats;
109 static st_io_stall_stats io_stall_stats;
110 
111 const std::string DEFAULT_CF_NAME("default");
112 const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
113 const std::string PER_INDEX_CF_NAME("$per_index_cf");
114 const std::string DEFAULT_SK_CF_NAME("default_sk");
115 const std::string TRUNCATE_TABLE_PREFIX("#truncate_tmp#");
116 
117 static std::vector<std::string> rdb_tables_to_recalc;
118 
119 static Rdb_exec_time st_rdb_exec_time;
120 
121 /**
122   Updates row counters based on the table type and operation type.
123 */
124 void ha_rocksdb::update_row_stats(const operation_type &type) {
125   assert(type < ROWS_MAX);
126   // Find if we are modifying system databases.
127   if (table->s && m_tbl_def->m_is_mysql_system_table) {
128     global_stats.system_rows[type].inc();
129   } else {
130     global_stats.rows[type].inc();
131   }
132 }
133 
134 void ha_rocksdb::inc_covered_sk_lookup() {
135   global_stats.covered_secondary_key_lookups.inc();
136 }
137 
138 void dbug_dump_database(rocksdb::DB *db);
139 static handler *rocksdb_create_handler(my_core::handlerton *hton,
140                                        my_core::TABLE_SHARE *table_arg,
141                                        my_core::MEM_ROOT *mem_root);
142 
143 static rocksdb::CompactRangeOptions getCompactRangeOptions(
144     int concurrency = 0,
145     rocksdb::BottommostLevelCompaction bottommost_level_compaction =
146         rocksdb::BottommostLevelCompaction::kForceOptimized) {
147   rocksdb::CompactRangeOptions compact_range_options;
148   compact_range_options.bottommost_level_compaction =
149       bottommost_level_compaction;
150   compact_range_options.exclusive_manual_compaction = false;
151   if (concurrency > 0) {
152     compact_range_options.max_subcompactions = concurrency;
153   }
154   return compact_range_options;
155 }
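// Illustrative (hypothetical) caller of the helper above, showing how it is
// typically combined with a RocksDB manual range compaction; the real call
// sites appear later in this file:
//
//   rocksdb::ColumnFamilyHandle *cf = ...;  // some column family handle
//   rdb->CompactRange(getCompactRangeOptions(/* concurrency */ 4), cf,
//                     nullptr /* begin */, nullptr /* end */);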
156 
157 ///////////////////////////////////////////////////////////
158 // Parameters and settings
159 ///////////////////////////////////////////////////////////
160 static char *rocksdb_default_cf_options = nullptr;
161 static char *rocksdb_override_cf_options = nullptr;
162 static char *rocksdb_update_cf_options = nullptr;
163 static my_bool rocksdb_use_default_sk_cf = false;
164 
165 ///////////////////////////////////////////////////////////
166 // Globals
167 ///////////////////////////////////////////////////////////
168 handlerton *rocksdb_hton;
169 
170 rocksdb::TransactionDB *rdb = nullptr;
171 rocksdb::HistogramImpl *commit_latency_stats = nullptr;
172 
173 static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
174 static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;
175 
176 Rdb_dict_manager dict_manager;
177 Rdb_cf_manager cf_manager;
178 Rdb_ddl_manager ddl_manager;
179 Rdb_hton_init_state hton_init_state;
180 
181 /**
182   MyRocks background thread control
183   N.B. This is in addition to RocksDB's own background threads
184        (@see rocksdb::CancelAllBackgroundWork())
185 */
186 
187 static Rdb_background_thread rdb_bg_thread;
188 
189 static Rdb_index_stats_thread rdb_is_thread;
190 
191 static Rdb_manual_compaction_thread rdb_mc_thread;
192 
193 static Rdb_drop_index_thread rdb_drop_idx_thread;
194 // List of table names (using regex) that are exceptions to the strict
195 // collation check requirement.
196 Regex *rdb_collation_exceptions;
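// Illustrative value for the exception list (assumed syntax: a comma-separated
// set of regular expressions matched against table names):
//   SET GLOBAL rocksdb_strict_collation_exceptions = 't1,mydb\\..*';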
197 
198 static const char *rdb_get_error_messages(int error);
199 
200 static void rocksdb_flush_all_memtables() {
201   const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
202 
203   // RocksDB will fail the flush if the CF is deleted,
204   // but we do not handle the return status here
205   for (const auto &cf_handle : cf_manager.get_all_cf()) {
206     rdb->Flush(rocksdb::FlushOptions(), cf_handle.get());
207   }
208 }
209 
210 static void rocksdb_delete_column_family_stub(
211     THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
212     void *const /* var_ptr */, const void *const /* save */) {}
213 
214 static int rocksdb_delete_column_family(
215     THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
216     void *const /* var_ptr */, struct st_mysql_value *const value) {
217   assert(value != nullptr);
218 
219   char buff[STRING_BUFFER_USUAL_SIZE];
220   int len = sizeof(buff);
221 
222   const char *const cf = value->val_str(value, buff, &len);
223   if (cf == nullptr) return HA_EXIT_SUCCESS;
224 
225   std::string cf_name = std::string(cf);
226   // Forbid removing these built-in CFs
227   if (cf_name == DEFAULT_SYSTEM_CF_NAME || cf_name == DEFAULT_CF_NAME ||
228       cf_name.empty() ||
229       (cf_name == DEFAULT_SK_CF_NAME && rocksdb_use_default_sk_cf)) {
230     my_error(ER_CANT_DROP_CF, MYF(0), cf);
231     return HA_EXIT_FAILURE;
232   }
233 
234   auto &cf_manager = rdb_get_cf_manager();
235   int ret = 0;
236 
237   {
238     std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
239     ret = cf_manager.drop_cf(&ddl_manager, &dict_manager, cf_name);
240   }
241 
242   if (ret == HA_EXIT_SUCCESS) {
243     rdb_drop_idx_thread.signal();
244   } else {
245     my_error(ER_CANT_DROP_CF, MYF(0), cf);
246   }
247 
248   return ret;
249 }
250 
251 ///////////////////////////////////////////////////////////
252 // Hash map: table name => open table handler
253 ///////////////////////////////////////////////////////////
254 
255 namespace  // anonymous namespace = not visible outside this source file
256 {
257 
258 class Rdb_open_tables_map {
259  private:
260   /* Hash table used to track the handlers of open tables */
261   std::unordered_map<std::string, Rdb_table_handler *> m_table_map;
262 
263   /* The mutex used to protect the hash table */
264   mutable mysql_mutex_t m_mutex;
265 
266  public:
267   void init() {
268     m_table_map.clear();
269     mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &m_mutex, MY_MUTEX_INIT_FAST);
270   }
271 
272   void free() {
273     m_table_map.clear();
274     mysql_mutex_destroy(&m_mutex);
275   }
276 
277   size_t count() { return m_table_map.size(); }
278 
279   Rdb_table_handler *get_table_handler(const char *const table_name);
280   void release_table_handler(Rdb_table_handler *const table_handler);
281 
282   std::vector<std::string> get_table_names(void) const;
283 };
284 
285 }  // anonymous namespace
286 
287 static Rdb_open_tables_map rdb_open_tables;
288 
289 static std::string rdb_normalize_dir(std::string dir) {
290   while (dir.size() > 0 && dir.back() == '/') {
291     dir.resize(dir.size() - 1);
292   }
293   return dir;
294 }
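// e.g. rdb_normalize_dir("/data/checkpoints///") returns "/data/checkpoints".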
295 
296 static int rocksdb_create_checkpoint(
297     THD *const thd MY_ATTRIBUTE((__unused__)),
298     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
299     void *const save MY_ATTRIBUTE((__unused__)),
300     struct st_mysql_value *const value) {
301   char buf[FN_REFLEN];
302   int len = sizeof(buf);
303   const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
304   if (checkpoint_dir_raw) {
305     if (rdb != nullptr) {
306       std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
307       // NO_LINT_DEBUG
308       sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
309                             checkpoint_dir.c_str());
310       rocksdb::Checkpoint *checkpoint;
311       auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
312       // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here, which is
313       // why the return code is ignored. Calling into rdb_error_to_mysql
314       // invokes my_error for us, which will propagate up to the client.
315       int rc MY_ATTRIBUTE((__unused__));
316       if (status.ok()) {
317         status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
318         delete checkpoint;
319         if (status.ok()) {
320           // NO_LINT_DEBUG
321           sql_print_information(
322               "RocksDB: created checkpoint in directory : %s\n",
323               checkpoint_dir.c_str());
324           return HA_EXIT_SUCCESS;
325         } else {
326           rc = ha_rocksdb::rdb_error_to_mysql(status);
327         }
328       } else {
329         rc = ha_rocksdb::rdb_error_to_mysql(status);
330       }
331     }
332   }
333   return HA_EXIT_FAILURE;
334 }
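/* Illustrative usage (the directory below is an example path only):
     SET GLOBAL rocksdb_create_checkpoint = '/path/to/checkpoint_dir';
   On success a rocksdb::Checkpoint is created in that directory and an
   informational message is written to the server log. */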
335 
336 /* This method is needed to indicate that the
337    ROCKSDB_CREATE_CHECKPOINT command is not read-only */
338 static void rocksdb_create_checkpoint_stub(THD *const thd,
339                                            struct st_mysql_sys_var *const var,
340                                            void *const var_ptr,
341                                            const void *const save) {}
342 
343 static void rocksdb_force_flush_memtable_now_stub(
344     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
345     const void *const save) {}
346 
347 static int rocksdb_force_flush_memtable_now(
348     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
349     struct st_mysql_value *const value) {
350   // NO_LINT_DEBUG
351   sql_print_information("RocksDB: Manual memtable flush.");
352   rocksdb_flush_all_memtables();
353   return HA_EXIT_SUCCESS;
354 }
355 
356 static void rocksdb_force_flush_memtable_and_lzero_now_stub(
357     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
358     const void *const save) {}
359 
360 static int rocksdb_force_flush_memtable_and_lzero_now(
361     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
362     struct st_mysql_value *const value) {
363   // NO_LINT_DEBUG
364   sql_print_information("RocksDB: Manual memtable and L0 flush.");
365   rocksdb_flush_all_memtables();
366 
367   const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
368   rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
369   rocksdb::ColumnFamilyMetaData metadata;
370   rocksdb::ColumnFamilyDescriptor cf_descr;
371 
372   static constexpr int max_attempts = 3;
373   int i, num_errors = 0;
374 
375   for (const auto &cf_handle : cf_manager.get_all_cf()) {
376     for (i = 0; i < max_attempts; i++) {
377       rdb->GetColumnFamilyMetaData(cf_handle.get(), &metadata);
378       cf_handle->GetDescriptor(&cf_descr);
379       c_options.output_file_size_limit = cf_descr.options.target_file_size_base;
380 
381       assert(metadata.levels[0].level == 0);
382       std::vector<std::string> file_names;
383       for (const auto &file : metadata.levels[0].files) {
384         file_names.emplace_back(file.db_path + file.name);
385       }
386 
387       if (file_names.empty()) {
388         break;
389       }
390 
391       rocksdb::Status s;
392       s = rdb->CompactFiles(c_options, cf_handle.get(), file_names, 1);
393 
394       if (!s.ok()) {
395         std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
396             cf_manager.get_cf(cf_handle->GetID());
397 
398         // If the CF handle has been removed from cf_manager, it is not an
399         // error. We are done with this CF and proceed to the next CF.
400         if (!cfh) {
401           // NO_LINT_DEBUG
402           sql_print_information("cf %s has been dropped during CompactFiles.",
403                                 cf_handle->GetName().c_str());
404           break;
405         }
406 
407         // Due to a race, it's possible for CompactFiles to collide
408         // with an automatic compaction and return a file-not-found
409         // error. In that case, retry.
410         if (s.IsInvalidArgument()) {
411           continue;
412         }
413 
414         if (!s.ok() && !s.IsAborted()) {
415           rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
416           return HA_EXIT_FAILURE;
417         }
418         break;
419       }
420     }
421     if (i == max_attempts) {
422       num_errors++;
423     }
424   }
425 
426   return num_errors == 0 ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
427 }
428 
429 static void rocksdb_drop_index_wakeup_thread(
430     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
431     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
432     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save);
433 
434 static my_bool rocksdb_pause_background_work = 0;
435 static mysql_mutex_t rdb_sysvars_mutex;
436 static mysql_mutex_t rdb_block_cache_resize_mutex;
437 static mysql_mutex_t rdb_bottom_pri_background_compactions_resize_mutex;
438 
439 static void rocksdb_set_pause_background_work(
440     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
441     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
442     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
443   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
444   const bool pause_requested = *static_cast<const bool *>(save);
445   if (rocksdb_pause_background_work != pause_requested) {
446     if (pause_requested) {
447       rdb->PauseBackgroundWork();
448     } else {
449       rdb->ContinueBackgroundWork();
450     }
451     rocksdb_pause_background_work = pause_requested;
452   }
453   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
454 }
455 
456 static void rocksdb_set_compaction_options(THD *thd,
457                                            struct st_mysql_sys_var *var,
458                                            void *var_ptr, const void *save);
459 
460 static void rocksdb_set_table_stats_sampling_pct(THD *thd,
461                                                  struct st_mysql_sys_var *var,
462                                                  void *var_ptr,
463                                                  const void *save);
464 
465 static void rocksdb_update_table_stats_use_table_scan(
466     THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
467     void *const var_ptr, const void *const save);
468 
469 static int rocksdb_index_stats_thread_renice(
470     THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
471     void *const save, struct st_mysql_value *const value);
472 
473 static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
474                                                    struct st_mysql_sys_var *var,
475                                                    void *var_ptr,
476                                                    const void *save);
477 
478 static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd,
479                                                    struct st_mysql_sys_var *var,
480                                                    void *var_ptr,
481                                                    const void *save);
482 
483 static void rocksdb_set_delayed_write_rate(THD *thd,
484                                            struct st_mysql_sys_var *var,
485                                            void *var_ptr, const void *save);
486 
487 static void rocksdb_set_max_latest_deadlocks(THD *thd,
488                                              struct st_mysql_sys_var *var,
489                                              void *var_ptr, const void *save);
490 
491 static void rdb_set_collation_exception_list(const char *exception_list);
492 static void rocksdb_set_collation_exception_list(THD *thd,
493                                                  struct st_mysql_sys_var *var,
494                                                  void *var_ptr,
495                                                  const void *save);
496 
497 static int rocksdb_validate_update_cf_options(THD *thd,
498                                               struct st_mysql_sys_var *var,
499                                               void *save,
500                                               st_mysql_value *value);
501 
502 static void rocksdb_set_update_cf_options(THD *thd,
503                                           struct st_mysql_sys_var *var,
504                                           void *var_ptr, const void *save);
505 
506 static int rocksdb_check_bulk_load(
507     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
508     void *save, struct st_mysql_value *value);
509 
510 static int rocksdb_check_bulk_load_allow_unsorted(
511     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
512     void *save, struct st_mysql_value *value);
513 
514 static void rocksdb_set_max_background_jobs(THD *thd,
515                                             struct st_mysql_sys_var *const var,
516                                             void *const var_ptr,
517                                             const void *const save);
518 static void rocksdb_set_max_background_compactions(THD *thd,
519                                                    struct st_mysql_sys_var *const var,
520                                                    void *const var_ptr,
521                                                    const void *const save);
522 
523 static void rocksdb_set_bytes_per_sync(THD *thd,
524                                        struct st_mysql_sys_var *const var,
525                                        void *const var_ptr,
526                                        const void *const save);
527 static void rocksdb_set_wal_bytes_per_sync(THD *thd,
528                                            struct st_mysql_sys_var *const var,
529                                            void *const var_ptr,
530                                            const void *const save);
531 static int rocksdb_validate_set_block_cache_size(
532     THD *thd, struct st_mysql_sys_var *const var, void *var_ptr,
533     struct st_mysql_value *value);
534 static int rocksdb_tracing(THD *const thd MY_ATTRIBUTE((__unused__)),
535                            struct st_mysql_sys_var *const var, void *const save,
536                            struct st_mysql_value *const value,
537                            bool trace_block_cache_access = true);
538 static int rocksdb_validate_max_bottom_pri_background_compactions(
539     THD *thd MY_ATTRIBUTE((__unused__)),
540     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
541     void *var_ptr, struct st_mysql_value *value);
542 //////////////////////////////////////////////////////////////////////////////
543 // Options definitions
544 //////////////////////////////////////////////////////////////////////////////
545 static const constexpr ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
546 static const constexpr ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;
547 static const constexpr ulong RDB_DEFAULT_ROW_LOCKS = 1024 * 1024;
548 static const constexpr ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
549 static const constexpr ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
550 static const constexpr size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;
551 static const constexpr size_t RDB_MIN_MERGE_BUF_SIZE = 100;
552 static const constexpr size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE =
553     1024 * 1024 * 1024;
554 static const constexpr size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
555 static const constexpr size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
556 static const constexpr size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
557 static const constexpr int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;
558 static const constexpr int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
559 static const constexpr int RDB_MAX_CHECKSUMS_PCT = 100;
560 static const constexpr uint32_t
561     RDB_DEFAULT_FORCE_COMPUTE_MEMTABLE_STATS_CACHETIME = 60 * 1000 * 1000;
562 static const constexpr ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
563 static const constexpr uint ROCKSDB_MAX_BOTTOM_PRI_BACKGROUND_COMPACTIONS = 64;
564 
565 static long long rocksdb_block_cache_size = RDB_DEFAULT_BLOCK_CACHE_SIZE;
566 static long long rocksdb_sim_cache_size = 0;
567 static double rocksdb_cache_high_pri_pool_ratio = 0.0;
568 static my_bool rocksdb_cache_dump = FALSE;
569 /* Use unsigned long long instead of uint64_t because of MySQL compatibility */
570 static unsigned long long  // NOLINT(runtime/int)
571     rocksdb_rate_limiter_bytes_per_sec = 0;
572 static unsigned long long  // NOLINT(runtime/int)
573     rocksdb_sst_mgr_rate_bytes_per_sec = DEFAULT_SST_MGR_RATE_BYTES_PER_SEC;
574 static unsigned long long rocksdb_delayed_write_rate;
575 static uint32_t rocksdb_max_latest_deadlocks = RDB_DEADLOCK_DETECT_DEPTH;
576 static unsigned long  // NOLINT(runtime/int)
577     rocksdb_persistent_cache_size_mb = 0;
578 static uint64_t rocksdb_info_log_level = rocksdb::InfoLogLevel::ERROR_LEVEL;
579 static char *rocksdb_wal_dir = nullptr;
580 static char *rocksdb_persistent_cache_path = nullptr;
581 static uint64_t rocksdb_index_type =
582     rocksdb::BlockBasedTableOptions::kBinarySearch;
583 static uint32_t rocksdb_flush_log_at_trx_commit = 1;
584 static uint32_t rocksdb_debug_optimizer_n_rows = 0;
585 static my_bool rocksdb_force_compute_memtable_stats = TRUE;
586 static uint32_t rocksdb_force_compute_memtable_stats_cachetime =
587     RDB_DEFAULT_FORCE_COMPUTE_MEMTABLE_STATS_CACHETIME;
588 static my_bool rocksdb_debug_optimizer_no_zero_cardinality = TRUE;
589 static uint32_t rocksdb_wal_recovery_mode =
590     static_cast<uint32_t>(rocksdb::WALRecoveryMode::kPointInTimeRecovery);
591 static my_bool rocksdb_track_and_verify_wals_in_manifest = TRUE;
592 static uint32_t rocksdb_stats_level = 0;
593 static uint32_t rocksdb_access_hint_on_compaction_start =
594     rocksdb::Options::AccessHint::NORMAL;
595 static char *rocksdb_compact_cf_name = nullptr;
596 static char *rocksdb_delete_cf_name = nullptr;
597 static char *rocksdb_checkpoint_name = nullptr;
598 static char *rocksdb_block_cache_trace_options_str = nullptr;
599 static char *rocksdb_trace_options_str = nullptr;
600 static my_bool rocksdb_signal_drop_index_thread = FALSE;
601 static my_bool rocksdb_strict_collation_check = TRUE;
602 static my_bool rocksdb_ignore_unknown_options = TRUE;
603 static char *rocksdb_strict_collation_exceptions = nullptr;
604 static my_bool rocksdb_collect_sst_properties = TRUE;
605 static my_bool rocksdb_force_flush_memtable_now_var = FALSE;
606 static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = FALSE;
607 static my_bool rocksdb_enable_native_partition = FALSE;
608 static my_bool rocksdb_enable_ttl = TRUE;
609 static my_bool rocksdb_enable_ttl_read_filtering = TRUE;
610 static int rocksdb_debug_ttl_rec_ts = 0;
611 static int rocksdb_debug_ttl_snapshot_ts = 0;
612 static int rocksdb_debug_ttl_read_filter_ts = 0;
613 static my_bool rocksdb_debug_ttl_ignore_pk = FALSE;
614 static my_bool rocksdb_reset_stats = FALSE;
615 static uint32_t rocksdb_seconds_between_stat_computes = 3600;
616 static long long rocksdb_compaction_sequential_deletes = 0l;
617 static long long rocksdb_compaction_sequential_deletes_window = 0l;
618 static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
619 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
620 static uint32_t rocksdb_validate_tables = 1;
621 #endif  // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
622         // ROCKSDB_INCLUDE_VALIDATE_TABLES
623 static char *rocksdb_datadir = nullptr;
624 static uint32_t rocksdb_max_bottom_pri_background_compactions = 0;
625 static uint32_t rocksdb_table_stats_sampling_pct =
626     RDB_DEFAULT_TBL_STATS_SAMPLE_PCT;
627 static uint32_t rocksdb_table_stats_recalc_threshold_pct = 10;
628 static unsigned long long rocksdb_table_stats_recalc_threshold_count = 100ul;
629 static my_bool rocksdb_table_stats_use_table_scan = 0;
630 static int32_t rocksdb_table_stats_background_thread_nice_value =
631     THREAD_PRIO_MAX;
632 static unsigned long long rocksdb_table_stats_max_num_rows_scanned = 0ul;
633 static my_bool rocksdb_enable_bulk_load_api = TRUE;
634 static my_bool rocksdb_enable_remove_orphaned_dropped_cfs = TRUE;
635 static my_bool rpl_skip_tx_api_var = FALSE;
636 static my_bool rocksdb_print_snapshot_conflict_queries = FALSE;
637 static my_bool rocksdb_large_prefix = FALSE;
638 static my_bool rocksdb_allow_to_start_after_corruption = FALSE;
639 static uint64_t rocksdb_write_policy =
640     rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;
641 char *rocksdb_read_free_rpl_tables;
642 ulong rocksdb_max_row_locks;
643 std::mutex rocksdb_read_free_rpl_tables_mutex;
644 #if defined(HAVE_PSI_INTERFACE)
645 Regex rdb_read_free_regex_handler(key_rwlock_read_free_rpl_tables);
646 #else
647 Regex rdb_read_free_regex_handler;
648 #endif
649 enum read_free_rpl_type { OFF = 0, PK_ONLY, PK_SK };
650 static uint64_t rocksdb_read_free_rpl = read_free_rpl_type::OFF;
651 static my_bool rocksdb_error_on_suboptimal_collation = FALSE;
652 static uint32_t rocksdb_stats_recalc_rate = 0;
653 static my_bool rocksdb_no_create_column_family = FALSE;
654 static uint32_t rocksdb_debug_manual_compaction_delay = 0;
655 static uint32_t rocksdb_max_manual_compactions = 0;
656 static my_bool rocksdb_rollback_on_timeout = FALSE;
657 static my_bool rocksdb_enable_insert_with_update_caching = TRUE;
658 static my_bool rocksdb_skip_locks_if_skip_unique_check = FALSE;
659 static my_bool rocksdb_alter_column_default_inplace = FALSE;
660 std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
661 std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
662 std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
663 std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
664 std::atomic<uint64_t> rocksdb_manual_compactions_processed(0);
665 std::atomic<uint64_t> rocksdb_manual_compactions_running(0);
666 #ifndef NDEBUG
667 std::atomic<uint64_t> rocksdb_num_get_for_update_calls(0);
668 #endif
669 
670 static int rocksdb_trace_block_cache_access(
671     THD *const thd MY_ATTRIBUTE((__unused__)),
672     struct st_mysql_sys_var *const var, void *const save,
673     struct st_mysql_value *const value) {
674   return rocksdb_tracing(thd, var, save, value,
675                          /* trace_block_cache_access = */ true);
676 }
677 
678 static int rocksdb_trace_queries(THD *const thd MY_ATTRIBUTE((__unused__)),
679                                  struct st_mysql_sys_var *const var,
680                                  void *const save,
681                                  struct st_mysql_value *const value) {
682   return rocksdb_tracing(thd, var, save, value,
683                          /* trace_block_cache_access = */ false);
684 }
685 
686 static int rocksdb_tracing(THD *const thd MY_ATTRIBUTE((__unused__)),
687                            struct st_mysql_sys_var *const var, void *const save,
688                            struct st_mysql_value *const value,
689                            bool trace_block_cache_access) {
690   std::string trace_folder =
691       trace_block_cache_access ? "/block_cache_traces" : "/queries_traces";
692   int len = 0;
693   const char *const trace_opt_str_raw = value->val_str(value, nullptr, &len);
694   if (trace_opt_str_raw == nullptr) {
695     return HA_EXIT_FAILURE;
696   }
697 
698   rocksdb::Status s;
699   if (rdb == nullptr) {
700     return HA_EXIT_FAILURE;
701   }
702   int rc __attribute__((__unused__));
703   std::string trace_opt_str(trace_opt_str_raw);
704   if (trace_opt_str.empty()) {
705     // End tracing block cache accesses or queries.
706     // NO_LINT_DEBUG
707     sql_print_information(
708         "RocksDB: Stop tracing block cache accesses or queries.\n");
709     s = trace_block_cache_access ? rdb->EndBlockCacheTrace() : rdb->EndTrace();
710 
711     if (!s.ok()) {
712       rc = ha_rocksdb::rdb_error_to_mysql(s);
713       return HA_EXIT_FAILURE;
714     }
715     *static_cast<const char **>(save) = trace_opt_str_raw;
716     return HA_EXIT_SUCCESS;
717   }
718 
719   // Start tracing block cache accesses or queries.
720   std::stringstream ss(trace_opt_str);
721   std::vector<std::string> trace_opts_strs;
722   while (ss.good()) {
723     std::string substr;
724     getline(ss, substr, ':');
725     trace_opts_strs.push_back(substr);
726   }
727   rocksdb::TraceOptions trace_opt;
728   try {
729     if (trace_opts_strs.size() != 3) {
730       throw std::invalid_argument("Incorrect number of arguments.");
731     }
732     trace_opt.sampling_frequency = std::stoull(trace_opts_strs[0]);
733     trace_opt.max_trace_file_size = std::stoull(trace_opts_strs[1]);
734   } catch (const std::exception &e) {
735     // NO_LINT_DEBUG
736     sql_print_information(
737         "RocksDB: Failed to parse trace option string: %s. The correct "
738         "format is sampling_frequency:max_trace_file_size:trace_file_name. "
739         "sampling_frequency and max_trace_file_size are positive integers. "
740         "The block accesses or queries are saved to the "
741         "rocksdb_datadir%s/trace_file_name.\n",
742         trace_opt_str.c_str(), trace_folder.c_str());
743     return HA_EXIT_FAILURE;
744   }
745   const std::string &trace_file_name = trace_opts_strs[2];
746   if (trace_file_name.find("/") != std::string::npos) {
747     // NO_LINT_DEBUG
748     sql_print_information(
749         "RocksDB: Start tracing failed (trace option string: %s). The file "
750         "name contains directory separator.\n",
751         trace_opt_str.c_str());
752     return HA_EXIT_FAILURE;
753   }
754   const std::string trace_dir = std::string(rocksdb_datadir) + trace_folder;
755   s = rdb->GetEnv()->CreateDirIfMissing(trace_dir);
756   if (!s.ok()) {
757     // NO_LINT_DEBUG
758     sql_print_information(
759         "RocksDB: Start tracing failed (trace option string: %s). Failed to "
760         "create the trace directory %s: %s\n",
761         trace_opt_str.c_str(), trace_dir.c_str(), s.ToString().c_str());
762     return HA_EXIT_FAILURE;
763   }
764   const std::string trace_file_path = trace_dir + "/" + trace_file_name;
765   s = rdb->GetEnv()->FileExists(trace_file_path);
766   if (s.ok() || !s.IsNotFound()) {
767     // NO_LINT_DEBUG
768     sql_print_information(
769         "RocksDB: Start tracing failed (trace option string: %s). The trace "
770         "file either already exists or we encountered an error "
771         "when calling rdb->GetEnv()->FileExists. The returned status string "
772         "is: %s\n",
773         trace_opt_str.c_str(), s.ToString().c_str());
774     return HA_EXIT_FAILURE;
775   }
776   std::unique_ptr<rocksdb::TraceWriter> trace_writer;
777   const rocksdb::EnvOptions env_option(rdb->GetDBOptions());
778   s = rocksdb::NewFileTraceWriter(rdb->GetEnv(), env_option, trace_file_path,
779                                   &trace_writer);
780   if (!s.ok()) {
781     rc = ha_rocksdb::rdb_error_to_mysql(s);
782     return HA_EXIT_FAILURE;
783   }
784   if (trace_block_cache_access) {
785     s = rdb->StartBlockCacheTrace(trace_opt, std::move(trace_writer));
786   } else {
787     s = rdb->StartTrace(trace_opt, std::move(trace_writer));
788   }
789   if (!s.ok()) {
790     rc = ha_rocksdb::rdb_error_to_mysql(s);
791     return HA_EXIT_FAILURE;
792   }
793   // NO_LINT_DEBUG
794   sql_print_information(
795       "RocksDB: Start tracing block cache accesses or queries. Sampling "
796       "frequency: %lu, "
797       "Maximum trace file size: %lu, Trace file path %s.\n",
798       trace_opt.sampling_frequency, trace_opt.max_trace_file_size,
799       trace_file_path.c_str());
800   // Save the trace option.
801   *static_cast<const char **>(save) = trace_opt_str_raw;
802   return HA_EXIT_SUCCESS;
803 }
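// Illustrative usage (the values are examples only); the option string format
// parsed above is sampling_frequency:max_trace_file_size:trace_file_name:
//   SET GLOBAL rocksdb_trace_block_cache_access = '1:1073741824:bc_trace';
//   SET GLOBAL rocksdb_trace_queries = '1:1073741824:query_trace';
//   SET GLOBAL rocksdb_trace_queries = '';  -- an empty string stops tracing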
804 
805 static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
806   auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());
807 
808   o->create_if_missing = true;
809   o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
810   o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
811   o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
812   o->max_open_files = -2;  // auto-tune to 50% open_files_limit
813 
814   o->two_write_queues = true;
815   o->manual_wal_flush = true;
816   return o;
817 }
818 
819 /* DBOptions contains Statistics and needs to be destructed last */
820 static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options =
821     std::unique_ptr<rocksdb::BlockBasedTableOptions>(
822         new rocksdb::BlockBasedTableOptions());
823 static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options =
824     rdb_init_rocksdb_db_options();
825 
826 static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;
827 
828 /* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */
829 static const char *write_policy_names[] = {"write_committed", "write_prepared",
830                                            "write_unprepared", NullS};
831 
832 static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1,
833                                        "write_policy_typelib",
834                                        write_policy_names, nullptr};
835 
836 /* This array needs to be kept up to date with myrocks::read_free_rpl_type */
837 static const char *read_free_rpl_names[] = {"OFF", "PK_ONLY", "PK_SK", NullS};
838 
839 static TYPELIB read_free_rpl_typelib = {array_elements(read_free_rpl_names) - 1,
840                                         "read_free_rpl_typelib",
841                                         read_free_rpl_names, nullptr};
842 
843 /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
844 static const char *info_log_level_names[] = {"debug_level", "info_level",
845                                              "warn_level",  "error_level",
846                                              "fatal_level", NullS};
847 
848 static TYPELIB info_log_level_typelib = {
849     array_elements(info_log_level_names) - 1, "info_log_level_typelib",
850     info_log_level_names, nullptr};
851 
852 /* This enum needs to be kept up to date with rocksdb::BottommostLevelCompaction
853  */
854 static const char *bottommost_level_compaction_names[] = {
855     "kSkip", "kIfHaveCompactionFilter", "kForce", "kForceOptimized", NullS};
856 
857 static TYPELIB bottommost_level_compaction_typelib = {
858     array_elements(bottommost_level_compaction_names) - 1,
859     "bottommost_level_compaction_typelib", bottommost_level_compaction_names,
860     nullptr};
861 
862 static void rocksdb_set_rocksdb_info_log_level(
863     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
864     const void *const save) {
865   assert(save != nullptr);
866 
867   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
868   rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
869   rocksdb_db_options->info_log->SetInfoLogLevel(
870       static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
871   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
872 }
873 
874 static void rocksdb_set_rocksdb_stats_level(THD *const thd,
875                                             struct st_mysql_sys_var *const var,
876                                             void *const var_ptr,
877                                             const void *const save) {
878   assert(save != nullptr);
879 
880   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
881   rocksdb_db_options->statistics->set_stats_level(
882       static_cast<rocksdb::StatsLevel>(*static_cast<const uint64_t *>(save)));
883   // The actual stats level is defined at rocksdb dbopt::statistics::stats_level_
884   // so adjust rocksdb_stats_level here to make sure it reflects
885   // the correct stats level.
886   rocksdb_stats_level = rocksdb_db_options->statistics->get_stats_level();
887   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
888 }
889 
890 static void rocksdb_set_reset_stats(
891     my_core::THD *const /* unused */,
892     my_core::st_mysql_sys_var *const /* unused */, void *const var_ptr,
893     const void *const save) {
894   assert(save != nullptr);
895   assert(rdb != nullptr);
896   assert(rocksdb_stats != nullptr);
897 
898   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
899 
900   *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);
901 
902   if (rocksdb_reset_stats) {
903     rocksdb::Status s = rdb->ResetStats();
904 
905     // RocksDB will always return success. Let's document this assumption here
906     // as well so that we are notified immediately when the contract changes.
907     assert(s == rocksdb::Status::OK());
908 
909     s = rocksdb_stats->Reset();
910     assert(s == rocksdb::Status::OK());
911   }
912 
913   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
914 }
915 
916 enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
917   FLUSH_LOG_NEVER = 0,
918   FLUSH_LOG_SYNC,
919   FLUSH_LOG_BACKGROUND,
920   FLUSH_LOG_MAX /* must be last */
921 };
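// Summary of the values accepted by rocksdb_flush_log_at_trx_commit, derived
// from the enum names above:
//   0 (FLUSH_LOG_NEVER)      - do not sync the WAL at transaction commit
//   1 (FLUSH_LOG_SYNC)       - sync the WAL at every transaction commit
//   2 (FLUSH_LOG_BACKGROUND) - sync the WAL periodically in the background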
922 
923 static int rocksdb_validate_flush_log_at_trx_commit(
924     THD *const thd,
925     struct st_mysql_sys_var *const var, /* in: pointer to system variable */
926     void *var_ptr, /* out: immediate result for update function */
927     struct st_mysql_value *const value /* in: incoming value */) {
928   long long new_value;
929 
930   /* value is NULL */
931   if (value->val_int(value, &new_value)) {
932     return HA_EXIT_FAILURE;
933   }
934 
935   if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) {
936     return HA_EXIT_FAILURE;
937   }
938 
939   *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value);
940   return HA_EXIT_SUCCESS;
941 }
942 static void rocksdb_compact_column_family_stub(
943     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
944     const void *const save) {}
945 
946 static int rocksdb_compact_column_family(THD *const thd,
947                                          struct st_mysql_sys_var *const var,
948                                          void *const var_ptr,
949                                          struct st_mysql_value *const value);
950 
951 static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};
952 
953 static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
954                                      "index_type_typelib", index_type_names,
955                                      nullptr};
956 
957 // TODO: 0 means don't wait at all, and we don't support it yet?
958 static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
959                           "Number of seconds to wait for lock", nullptr,
960                           nullptr, /*default*/ 1, /*min*/ 1,
961                           /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);
962 
963 static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
964                          "Enables deadlock detection", nullptr, nullptr, false);
965 
966 static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG,
967                           "Number of transactions deadlock detection will "
968                           "traverse through before assuming deadlock",
969                           nullptr, nullptr,
970                           /*default*/ RDB_DEADLOCK_DETECT_DEPTH,
971                           /*min*/ 2,
972                           /*max*/ ULONG_MAX, 0);
973 
974 static MYSQL_THDVAR_BOOL(
975     commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG,
976     "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr,
977     nullptr, false);
978 
979 static MYSQL_THDVAR_BOOL(
980     trace_sst_api, PLUGIN_VAR_RQCMDARG,
981     "Generate trace output in the log for each call to the SstFileWriter",
982     nullptr, nullptr, false);
983 
984 static MYSQL_THDVAR_BOOL(
985     bulk_load, PLUGIN_VAR_RQCMDARG,
986     "Use bulk-load mode for inserts. This disables "
987     "unique_checks and enables rocksdb_commit_in_the_middle.",
988     rocksdb_check_bulk_load, nullptr, false);
989 
990 static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG,
991                          "Allow bulk loading of sk keys during bulk-load. "
992                          "Can be changed only when bulk load is disabled.",
993                          /* Intentionally reuse unsorted's check function */
994                          rocksdb_check_bulk_load_allow_unsorted, nullptr,
995                          false);
996 
997 static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG,
998                          "Allow unsorted input during bulk-load. "
999                          "Can be changed only when bulk load is disabled.",
1000                          rocksdb_check_bulk_load_allow_unsorted, nullptr,
1001                          false);
1002 
1003 static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
1004                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1005                          "Enables using SstFileWriter for bulk loading",
1006                          nullptr, nullptr, rocksdb_enable_bulk_load_api);
1007 
1008 static MYSQL_SYSVAR_BOOL(
1009     enable_pipelined_write,
1010     *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_pipelined_write),
1011     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1012     "DBOptions::enable_pipelined_write for RocksDB", nullptr, nullptr,
1013     rocksdb_db_options->enable_pipelined_write);
1014 
1015 static MYSQL_SYSVAR_BOOL(enable_remove_orphaned_dropped_cfs,
1016                          rocksdb_enable_remove_orphaned_dropped_cfs,
1017                          PLUGIN_VAR_RQCMDARG,
1018                          "Enables removing dropped CFs from metadata if they "
1019                          "do not exist in the CF manager",
1020                          nullptr, nullptr,
1021                          rocksdb_enable_remove_orphaned_dropped_cfs);
1022 
1023 static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
1024                         "Directory for temporary files during DDL operations.",
1025                         nullptr, nullptr, "");
1026 
1027 static MYSQL_THDVAR_BOOL(
1028     commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
1029     "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
1030     "update and delete",
1031     nullptr, nullptr, false);
1032 
1033 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
1034 
1035 static MYSQL_THDVAR_BOOL(
1036     blind_delete_primary_key, PLUGIN_VAR_RQCMDARG,
1037     "Deleting rows by primary key lookup, without reading rows (Blind Deletes)."
1038     " Blind delete is disabled if the table has a secondary key",
1039     nullptr, nullptr, FALSE);
1040 
1041 static MYSQL_THDVAR_BOOL(
1042     enable_iterate_bounds, PLUGIN_VAR_OPCMDARG,
1043     "Enable rocksdb iterator upper/lower bounds in read options.", nullptr,
1044     nullptr, TRUE);
1045 
1046 static const char *DEFAULT_READ_FREE_RPL_TABLES = ".*";
1047 
1048 static int get_regex_flags() {
1049   int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
1050   if (lower_case_table_names) flags |= MY_REG_ICASE;
1051   return flags;
1052 }
1053 
1054 static int rocksdb_validate_read_free_rpl_tables(
1055     THD *thd MY_ATTRIBUTE((__unused__)),
1056     struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
1057     struct st_mysql_value *value) {
1058   char buff[STRING_BUFFER_USUAL_SIZE];
1059   int length = sizeof(buff);
1060   const char *wlist_buf = value->val_str(value, buff, &length);
1061   if (wlist_buf)
1062     wlist_buf = thd->strmake(wlist_buf, length);  // make a temp copy
1063   const auto wlist = wlist_buf ? wlist_buf : DEFAULT_READ_FREE_RPL_TABLES;
1064 
1065 #if defined(HAVE_PSI_INTERFACE)
1066   Regex regex_handler(key_rwlock_read_free_rpl_tables);
1067 #else
1068   Regex regex_handler;
1069 #endif
1070 
1071   if (!regex_handler.compile(wlist, get_regex_flags(), table_alias_charset)) {
1072     warn_about_bad_patterns(regex_handler, "rocksdb_read_free_rpl_tables");
1073     return HA_EXIT_FAILURE;
1074   }
1075 
1076   *static_cast<const char **>(save) = wlist;
1077   return HA_EXIT_SUCCESS;
1078 }
1079 
1080 static void rocksdb_update_read_free_rpl_tables(
1081     THD *thd MY_ATTRIBUTE((__unused__)),
1082     struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr,
1083     const void *save) {
1084   const auto wlist = *static_cast<const char *const *>(save);
1085   assert(wlist != nullptr);
1086 
1087   // This is bound to succeed since we've already checked for bad patterns in
1088   // rocksdb_validate_read_free_rpl_tables
1089   rdb_read_free_regex_handler.compile(wlist, get_regex_flags(),
1090                                       table_alias_charset);
1091 
1092   // update all table defs
1093   struct Rdb_read_free_rpl_updater : public Rdb_tables_scanner {
1094     int add_table(Rdb_tbl_def *tdef) override {
1095       tdef->check_and_set_read_free_rpl_table();
1096       return HA_EXIT_SUCCESS;
1097     }
1098   } updater;
1099   ddl_manager.scan_for_tables(&updater);
1100 
1101   *static_cast<const char **>(var_ptr) = *static_cast<char *const *>(save);
1102 }
1103 
1104 static void rocksdb_set_max_bottom_pri_background_compactions_internal(
1105     uint val) {
1106   // Set lower priority for compactions
1107   if (val > 0) {
1108     // This creates background threads in rocksdb with BOTTOM priority pool.
1109     // Compactions for bottommost level use threads in the BOTTOM pool, and
1110     // the threads in the BOTTOM pool run with lower OS priority (19 in Linux).
1111     rdb->GetEnv()->SetBackgroundThreads(val, rocksdb::Env::Priority::BOTTOM);
1112     rdb->GetEnv()->LowerThreadPoolCPUPriority(rocksdb::Env::Priority::BOTTOM);
1113     sql_print_information(
1114         "Set %d compaction thread(s) with "
1115         "lower scheduling priority.",
1116         val);
1117   }
1118 }
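// This helper is presumably invoked from the validate/update handlers of
// rocksdb_max_bottom_pri_background_compactions (declared above); e.g. a
// hypothetical `SET GLOBAL rocksdb_max_bottom_pri_background_compactions = 8;`
// would end up here with val == 8, creating 8 BOTTOM-priority threads.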
1119 
1120 static MYSQL_SYSVAR_STR(
1121     read_free_rpl_tables, rocksdb_read_free_rpl_tables,
1122     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
1123     "Regex that describes set of tables that will use read-free replication "
1124     "on the slave (i.e. do not look up a row during replication)",
1125     rocksdb_validate_read_free_rpl_tables, rocksdb_update_read_free_rpl_tables,
1126     DEFAULT_READ_FREE_RPL_TABLES);
1127 
1128 static MYSQL_SYSVAR_ENUM(
1129     read_free_rpl, rocksdb_read_free_rpl,
1130     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
1131     "Use read-free replication on the slave (i.e. no row lookup during "
1132     "replication). Default is OFF, PK_SK will enable it on all tables with "
1133     "primary key. PK_ONLY will enable it on tables where the only key is the "
1134     "primary key (i.e. no secondary keys).",
1135     nullptr, nullptr, read_free_rpl_type::OFF, &read_free_rpl_typelib);
1136 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
1137 
1138 static MYSQL_SYSVAR_BOOL(
1139     rpl_skip_tx_api, rpl_skip_tx_api_var, PLUGIN_VAR_RQCMDARG,
1140     "Use write batches for replication thread instead of tx api", nullptr,
1141     nullptr, false);
1142 
1143 static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
1144                          "Skip using bloom filter for reads", nullptr, nullptr,
1145                          false);
1146 
1147 static MYSQL_SYSVAR_ULONG(max_row_locks, rocksdb_max_row_locks,
1148                           PLUGIN_VAR_RQCMDARG,
1149                           "Maximum number of locks a transaction can have",
1150                           nullptr, nullptr,
1151                           /*default*/ RDB_DEFAULT_ROW_LOCKS,
1152                           /*min*/ 1,
1153                           /*max*/ RDB_MAX_ROW_LOCKS, 0);
1154 
1155 static MYSQL_THDVAR_ULONGLONG(
1156     write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
1157     "Maximum size of write batch in bytes. 0 means no limit.", nullptr, nullptr,
1158     /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);
1159 
1160 static MYSQL_THDVAR_ULONGLONG(
1161     write_batch_flush_threshold, PLUGIN_VAR_RQCMDARG,
1162     "Maximum size of write batch in bytes before flushing. Only valid if "
1163     "rocksdb_write_policy is WRITE_UNPREPARED. 0 means no limit.",
1164     nullptr, nullptr, /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);
1165 
1166 static MYSQL_THDVAR_BOOL(
1167     lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
1168     "Take and hold locks on rows that are scanned but not updated", nullptr,
1169     nullptr, false);
1170 
1171 static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
1172                           "Max #records in a batch for bulk-load mode", nullptr,
1173                           nullptr,
1174                           /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
1175                           /*min*/ 1,
1176                           /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);
1177 
1178 static MYSQL_THDVAR_ULONGLONG(
1179     merge_buf_size, PLUGIN_VAR_RQCMDARG,
1180     "Size to allocate for merge sort buffers written out to disk "
1181     "during inplace index creation.",
1182     nullptr, nullptr,
1183     /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
1184     /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
1185     /* max */ SIZE_T_MAX, 1);
1186 
1187 static MYSQL_THDVAR_ULONGLONG(
1188     merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
1189     "Size that we have to work with during combine (reading from disk) phase "
1190     "of "
1191     "external sort during fast index creation.",
1192     nullptr, nullptr,
1193     /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
1194     /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
1195     /* max */ SIZE_T_MAX, 1);
1196 
1197 static MYSQL_THDVAR_ULONGLONG(
1198     merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
1199     "Fast index creation creates a large tmp file on disk during index "
1200     "creation.  Removing this large file all at once when index creation is "
1201     "complete can cause trim stalls on Flash.  This variable specifies a "
1202     "duration to sleep (in milliseconds) between calling chsize() to truncate "
1203     "the file in chunks.  The chunk size is the same as merge_buf_size.",
1204     nullptr, nullptr,
1205     /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
1206     /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
1207     /* max */ SIZE_T_MAX, 1);
1208 
1209 static MYSQL_THDVAR_INT(
1210     manual_compaction_threads, PLUGIN_VAR_RQCMDARG,
1211     "How many rocksdb threads to run for manual compactions", nullptr, nullptr,
1212     /* default rocksdb.dboption max_subcompactions */ 0,
1213     /* min */ 0, /* max */ 128, 0);
1214 
1215 static MYSQL_THDVAR_ENUM(
1216     manual_compaction_bottommost_level, PLUGIN_VAR_RQCMDARG,
1217     "Option for bottommost level compaction during manual "
1218     "compaction",
1219     nullptr, nullptr,
1220     /* default */
1221     (ulong)rocksdb::BottommostLevelCompaction::kForceOptimized,
1222     &bottommost_level_compaction_typelib);
1223 
1224 static MYSQL_SYSVAR_BOOL(
1225     create_if_missing,
1226     *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
1227     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1228     "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
1229     rocksdb_db_options->create_if_missing);
1230 
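// Update callback for the deprecated rocksdb_concurrent_prepare alias below:
// it only emits a deprecation warning pointing users at rocksdb_two_write_queues.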
1231 static void concurrent_prepare_update(THD *thd, st_mysql_sys_var *var,
1232                                       void *var_ptr, const void *save) {
1233   push_warning(thd, Sql_condition::SL_WARNING, HA_ERR_WRONG_COMMAND,
1234                "Using rocksdb_concurrent_prepare is deprecated and the "
1235                "parameter may be removed in future releases.");
1236 }
1237 
1238 static MYSQL_SYSVAR_BOOL(
1239     concurrent_prepare,
1240     *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
1241     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1242     "DEPRECATED, use rocksdb_two_write_queues instead.", nullptr,
1243     concurrent_prepare_update, rocksdb_db_options->two_write_queues);
1244 
1245 static MYSQL_SYSVAR_BOOL(
1246     two_write_queues,
1247     *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
1248     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1249     "DBOptions::two_write_queues for RocksDB", nullptr, nullptr,
1250     rocksdb_db_options->two_write_queues);
1251 
1252 static MYSQL_SYSVAR_BOOL(
1253     manual_wal_flush,
1254     *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
1255     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1256     "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr,
1257     rocksdb_db_options->manual_wal_flush);
1258 
1259 static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy,
1260                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1261                          "DBOptions::write_policy for RocksDB", nullptr,
1262                          nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED,
1263                          &write_policy_typelib);
1264 
1265 static MYSQL_SYSVAR_BOOL(
1266     create_missing_column_families,
1267     *reinterpret_cast<my_bool *>(
1268         &rocksdb_db_options->create_missing_column_families),
1269     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1270     "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
1271     rocksdb_db_options->create_missing_column_families);
1272 
1273 static MYSQL_SYSVAR_BOOL(
1274     error_if_exists,
1275     *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
1276     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1277     "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
1278     rocksdb_db_options->error_if_exists);
1279 
1280 static MYSQL_SYSVAR_BOOL(
1281     paranoid_checks,
1282     *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
1283     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1284     "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
1285     rocksdb_db_options->paranoid_checks);
1286 
1287 static MYSQL_SYSVAR_ULONGLONG(
1288     rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
1289     PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
1290     nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
1291     /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);
1292 
1293 static MYSQL_SYSVAR_ULONGLONG(
1294     sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
1295     PLUGIN_VAR_RQCMDARG,
1296     "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB", nullptr,
1297     rocksdb_set_sst_mgr_rate_bytes_per_sec,
1298     /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
1299     /* min */ 0L, /* max */ UINT64_MAX, 0);
1300 
1301 static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
1302                               PLUGIN_VAR_RQCMDARG,
1303                               "DBOptions::delayed_write_rate", nullptr,
1304                               rocksdb_set_delayed_write_rate,
1305                               rocksdb_db_options->delayed_write_rate, 0,
1306                               UINT64_MAX, 0);
1307 
1308 static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
1309                          PLUGIN_VAR_RQCMDARG,
1310                          "Maximum number of recent "
1311                          "deadlocks to store",
1312                          nullptr, rocksdb_set_max_latest_deadlocks,
1313                          rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
1314 
1315 static MYSQL_SYSVAR_ENUM(
1316     info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
1317     "Filter level for info logs to be written to mysqld error log. "
1318     "Valid values include 'debug_level', 'info_level', 'warn_level', "
1319     "'error_level' and 'fatal_level'.",
1320     nullptr, rocksdb_set_rocksdb_info_log_level,
1321     rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);
1322 
1323 static MYSQL_THDVAR_INT(
1324     perf_context_level, PLUGIN_VAR_RQCMDARG,
1325     "Perf Context Level for rocksdb internal timer stat collection", nullptr,
1326     nullptr,
1327     /* default */ rocksdb::PerfLevel::kUninitialized,
1328     /* min */ rocksdb::PerfLevel::kUninitialized,
1329     /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);
1330 
1331 static MYSQL_SYSVAR_UINT(
1332     wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
1333     "DBOptions::wal_recovery_mode for RocksDB. Default is kPointInTimeRecovery",
1334     nullptr, nullptr,
1335     /* default */ (uint)rocksdb::WALRecoveryMode::kPointInTimeRecovery,
1336     /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
1337     /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);
1338 
1339 static MYSQL_SYSVAR_BOOL(
1340     track_and_verify_wals_in_manifest,
1341     *reinterpret_cast<my_bool *>(&rocksdb_track_and_verify_wals_in_manifest),
1342     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1343     "DBOptions::track_and_verify_wals_in_manifest for RocksDB", nullptr,
1344     nullptr, true);
1345 
1346 static MYSQL_SYSVAR_UINT(
1347     stats_level, rocksdb_stats_level, PLUGIN_VAR_RQCMDARG,
1348     "Statistics Level for RocksDB. Default is 1 (kExceptHistogramOrTimers)",
1349     nullptr, rocksdb_set_rocksdb_stats_level,
1350     /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
1351     /* min */ (uint)rocksdb::StatsLevel::kExceptTickers,
1352     /* max */ (uint)rocksdb::StatsLevel::kAll, 0);
1353 
1354 static MYSQL_SYSVAR_ULONG(compaction_readahead_size,
1355                           rocksdb_db_options->compaction_readahead_size,
1356                           PLUGIN_VAR_RQCMDARG,
1357                           "DBOptions::compaction_readahead_size for RocksDB",
1358                           nullptr, nullptr,
1359                           rocksdb_db_options->compaction_readahead_size,
1360                           /* min */ 0L, /* max */ ULONG_MAX, 0);
1361 
1362 static MYSQL_SYSVAR_BOOL(
1363     new_table_reader_for_compaction_inputs,
1364     *reinterpret_cast<my_bool *>(
1365         &rocksdb_db_options->new_table_reader_for_compaction_inputs),
1366     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1367     "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
1368     nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);
1369 
1370 static MYSQL_SYSVAR_UINT(
1371     access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
1372     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1373     "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
1374     /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
1375     /* min */ (uint)rocksdb::Options::AccessHint::NONE,
1376     /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);
1377 
1378 static MYSQL_SYSVAR_BOOL(
1379     allow_concurrent_memtable_write,
1380     *reinterpret_cast<my_bool *>(
1381         &rocksdb_db_options->allow_concurrent_memtable_write),
1382     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1383     "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
1384     false);
1385 
1386 static MYSQL_SYSVAR_BOOL(
1387     enable_write_thread_adaptive_yield,
1388     *reinterpret_cast<my_bool *>(
1389         &rocksdb_db_options->enable_write_thread_adaptive_yield),
1390     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1391     "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
1392     nullptr, false);
1393 
1394 static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
1395                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1396                         "DBOptions::max_open_files for RocksDB", nullptr,
1397                         nullptr, rocksdb_db_options->max_open_files,
1398                         /* min */ -2, /* max */ INT_MAX, 0);
1399 
1400 static MYSQL_SYSVAR_ULONG(max_total_wal_size,
1401                           rocksdb_db_options->max_total_wal_size,
1402                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1403                           "DBOptions::max_total_wal_size for RocksDB", nullptr,
1404                           nullptr, rocksdb_db_options->max_total_wal_size,
1405                           /* min */ 0L, /* max */ LONG_MAX, 0);
1406 
1407 static MYSQL_SYSVAR_BOOL(
1408     use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
1409     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1410     "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
1411     rocksdb_db_options->use_fsync);
1412 
1413 static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
1414                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1415                         "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
1416                         rocksdb_db_options->wal_dir.c_str());
1417 
1418 static MYSQL_SYSVAR_STR(
1419     persistent_cache_path, rocksdb_persistent_cache_path,
1420     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1421     "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
1422     nullptr, "");
1423 
1424 static MYSQL_SYSVAR_ULONG(
1425     persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
1426     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1427     "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
1428     "for RocksDB",
1429     nullptr, nullptr, rocksdb_persistent_cache_size_mb,
1430     /* min */ 0L, /* max */ ULONG_MAX, 0);
1431 
1432 static MYSQL_SYSVAR_ULONG(
1433     delete_obsolete_files_period_micros,
1434     rocksdb_db_options->delete_obsolete_files_period_micros,
1435     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1436     "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
1437     nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
1438     /* min */ 0L, /* max */ LONG_MAX, 0);
1439 
1440 static MYSQL_SYSVAR_INT(max_background_jobs,
1441                         rocksdb_db_options->max_background_jobs,
1442                         PLUGIN_VAR_RQCMDARG,
1443                         "DBOptions::max_background_jobs for RocksDB", nullptr,
1444                         rocksdb_set_max_background_jobs,
1445                         rocksdb_db_options->max_background_jobs,
1446                         /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);
1447 
1448 static MYSQL_SYSVAR_INT(max_background_flushes,
1449                         rocksdb_db_options->max_background_flushes,
1450                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1451                         "DBOptions::max_background_flushes for RocksDB", nullptr,
1452                         nullptr,
1453                         rocksdb_db_options->max_background_flushes,
1454                         /* min */ -1, /* max */ 64, 0);
1455 
1456 static MYSQL_SYSVAR_INT(max_background_compactions,
1457                         rocksdb_db_options->max_background_compactions,
1458                         PLUGIN_VAR_RQCMDARG,
1459                         "DBOptions::max_background_compactions for RocksDB", nullptr,
1460                         rocksdb_set_max_background_compactions,
1461                         rocksdb_db_options->max_background_compactions,
1462                         /* min */ -1, /* max */ 64, 0);
1463 
1464 static MYSQL_SYSVAR_UINT(
1465     max_bottom_pri_background_compactions,
1466     rocksdb_max_bottom_pri_background_compactions, PLUGIN_VAR_RQCMDARG,
1467     "Creates the specified number of threads, sets them to a lower "
1468     "CPU priority, and lets Lmax compactions use them. "
1469     "Maximum total compaction concurrency continues to be capped by "
1470     "rocksdb_max_background_compactions or "
1471     "rocksdb_max_background_jobs. In addition, Lmax "
1472     "compaction concurrency is capped by "
1473     "rocksdb_max_bottom_pri_background_compactions. The default value is 0, "
1474     "which means all compactions run under the concurrency of "
1475     "rocksdb_max_background_compactions|jobs. If you set "
1476     "rocksdb_max_bottom_pri_background_compactions very low (e.g. 1 or 2), "
1477     "compactions may not be able to keep up. Since Lmax normally holds about "
1478     "90 percent of the data, it is recommended to set this close to "
1479     "rocksdb_max_background_compactions|jobs. This option is helpful for "
1480     "giving more CPU resources to other threads (e.g. query processing).",
1481     rocksdb_validate_max_bottom_pri_background_compactions, nullptr, 0,
1482     /* min */ 0, /* max */ ROCKSDB_MAX_BOTTOM_PRI_BACKGROUND_COMPACTIONS, 0);
1483 
1484 static MYSQL_SYSVAR_UINT(max_subcompactions,
1485                          rocksdb_db_options->max_subcompactions,
1486                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1487                          "DBOptions::max_subcompactions for RocksDB", nullptr,
1488                          nullptr, rocksdb_db_options->max_subcompactions,
1489                          /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);
1490 
1491 static MYSQL_SYSVAR_ULONG(max_log_file_size,
1492                           rocksdb_db_options->max_log_file_size,
1493                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1494                           "DBOptions::max_log_file_size for RocksDB", nullptr,
1495                           nullptr, rocksdb_db_options->max_log_file_size,
1496                           /* min */ 0L, /* max */ LONG_MAX, 0);
1497 
1498 static MYSQL_SYSVAR_ULONG(log_file_time_to_roll,
1499                           rocksdb_db_options->log_file_time_to_roll,
1500                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1501                           "DBOptions::log_file_time_to_roll for RocksDB",
1502                           nullptr, nullptr,
1503                           rocksdb_db_options->log_file_time_to_roll,
1504                           /* min */ 0L, /* max */ LONG_MAX, 0);
1505 
1506 static MYSQL_SYSVAR_ULONG(keep_log_file_num,
1507                           rocksdb_db_options->keep_log_file_num,
1508                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1509                           "DBOptions::keep_log_file_num for RocksDB", nullptr,
1510                           nullptr, rocksdb_db_options->keep_log_file_num,
1511                           /* min */ 0L, /* max */ LONG_MAX, 0);
1512 
1513 static MYSQL_SYSVAR_ULONG(max_manifest_file_size,
1514                           rocksdb_db_options->max_manifest_file_size,
1515                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1516                           "DBOptions::max_manifest_file_size for RocksDB",
1517                           nullptr, nullptr,
1518                           rocksdb_db_options->max_manifest_file_size,
1519                           /* min */ 0L, /* max */ ULONG_MAX, 0);
1520 
1521 static MYSQL_SYSVAR_INT(table_cache_numshardbits,
1522                         rocksdb_db_options->table_cache_numshardbits,
1523                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1524                         "DBOptions::table_cache_numshardbits for RocksDB",
1525                         nullptr, nullptr,
1526                         rocksdb_db_options->table_cache_numshardbits,
1527                         /* min */ 0, /* max */ 19, 0);
1528 
1529 static MYSQL_SYSVAR_ULONG(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
1530                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1531                           "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
1532                           nullptr, rocksdb_db_options->WAL_ttl_seconds,
1533                           /* min */ 0L, /* max */ LONG_MAX, 0);
1534 
1535 static MYSQL_SYSVAR_ULONG(wal_size_limit_mb,
1536                           rocksdb_db_options->WAL_size_limit_MB,
1537                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1538                           "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
1539                           nullptr, rocksdb_db_options->WAL_size_limit_MB,
1540                           /* min */ 0L, /* max */ LONG_MAX, 0);
1541 
1542 static MYSQL_SYSVAR_ULONG(manifest_preallocation_size,
1543                           rocksdb_db_options->manifest_preallocation_size,
1544                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1545                           "DBOptions::manifest_preallocation_size for RocksDB",
1546                           nullptr, nullptr,
1547                           rocksdb_db_options->manifest_preallocation_size,
1548                           /* min */ 0L, /* max */ LONG_MAX, 0);
1549 
1550 static MYSQL_SYSVAR_BOOL(
1551     use_direct_reads,
1552     *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
1553     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1554     "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
1555     rocksdb_db_options->use_direct_reads);
1556 
1557 static MYSQL_SYSVAR_BOOL(
1558     use_direct_io_for_flush_and_compaction,
1559     *reinterpret_cast<my_bool *>(
1560         &rocksdb_db_options->use_direct_io_for_flush_and_compaction),
1561     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1562     "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB", nullptr,
1563     nullptr, rocksdb_db_options->use_direct_io_for_flush_and_compaction);
1564 
1565 static MYSQL_SYSVAR_BOOL(
1566     allow_mmap_reads,
1567     *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
1568     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1569     "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
1570     rocksdb_db_options->allow_mmap_reads);
1571 
1572 static MYSQL_SYSVAR_BOOL(
1573     allow_mmap_writes,
1574     *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
1575     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1576     "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
1577     rocksdb_db_options->allow_mmap_writes);
1578 
1579 static MYSQL_SYSVAR_BOOL(
1580     is_fd_close_on_exec,
1581     *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
1582     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1583     "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
1584     rocksdb_db_options->is_fd_close_on_exec);
1585 
1586 static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
1587                          rocksdb_db_options->stats_dump_period_sec,
1588                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1589                          "DBOptions::stats_dump_period_sec for RocksDB",
1590                          nullptr, nullptr,
1591                          rocksdb_db_options->stats_dump_period_sec,
1592                          /* min */ 0, /* max */ INT_MAX, 0);
1593 
1594 static MYSQL_SYSVAR_BOOL(
1595     advise_random_on_open,
1596     *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
1597     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1598     "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
1599     rocksdb_db_options->advise_random_on_open);
1600 
1601 static MYSQL_SYSVAR_ULONG(db_write_buffer_size,
1602                           rocksdb_db_options->db_write_buffer_size,
1603                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1604                           "DBOptions::db_write_buffer_size for RocksDB",
1605                           nullptr, nullptr,
1606                           rocksdb_db_options->db_write_buffer_size,
1607                           /* min */ 0L, /* max */ LONG_MAX, 0);
1608 
1609 static MYSQL_SYSVAR_BOOL(
1610     use_adaptive_mutex,
1611     *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
1612     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1613     "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
1614     rocksdb_db_options->use_adaptive_mutex);
1615 
1616 static MYSQL_SYSVAR_ULONG(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
1617                           PLUGIN_VAR_RQCMDARG,
1618                           "DBOptions::bytes_per_sync for RocksDB", nullptr,
1619                           rocksdb_set_bytes_per_sync,
1620                           rocksdb_db_options->bytes_per_sync,
1621                           /* min */ 0L, /* max */ LONG_MAX, 0);
1622 
1623 static MYSQL_SYSVAR_ULONG(wal_bytes_per_sync,
1624                           rocksdb_db_options->wal_bytes_per_sync,
1625                           PLUGIN_VAR_RQCMDARG,
1626                           "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
1627                           rocksdb_set_wal_bytes_per_sync,
1628                           rocksdb_db_options->wal_bytes_per_sync,
1629                           /* min */ 0L, /* max */ LONG_MAX, 0);
1630 
1631 static MYSQL_SYSVAR_BOOL(
1632     enable_thread_tracking,
1633     *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
1634     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1635     "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true);
1636 
1637 static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
1638                              PLUGIN_VAR_RQCMDARG,
1639                              "block_cache size for RocksDB",
1640                              rocksdb_validate_set_block_cache_size, nullptr,
1641                              /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
1642                              /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
1643                              /* max */ LLONG_MAX,
1644                              /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);
1645 
1646 static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
1647                              PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1648                              "Simulated cache size for RocksDB", nullptr,
1649                              nullptr,
1650                              /* default */ 0,
1651                              /* min */ 0,
1652                              /* max */ LLONG_MAX,
1653                              /* Block size */ 0);
1654 
1655 static MYSQL_SYSVAR_BOOL(cache_dump, rocksdb_cache_dump,
1656                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1657                          "Include RocksDB block cache content in core dump.",
1658                          nullptr, nullptr, true);
1659 
1660 static MYSQL_SYSVAR_DOUBLE(cache_high_pri_pool_ratio,
1661                            rocksdb_cache_high_pri_pool_ratio,
1662                            PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1663                            "Ratio of the block cache reserved for the high-pri pool",
1664                            nullptr, nullptr, /* default */ 0.0, /* min */ 0.0,
1665                            /* max */ 1.0, 0);
1666 
1667 static MYSQL_SYSVAR_BOOL(
1668     cache_index_and_filter_blocks,
1669     *reinterpret_cast<my_bool *>(
1670         &rocksdb_tbl_options->cache_index_and_filter_blocks),
1671     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1672     "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
1673     nullptr, nullptr, true);
1674 
1675 static MYSQL_SYSVAR_BOOL(
1676     cache_index_and_filter_with_high_priority,
1677     *reinterpret_cast<my_bool *>(
1678         &rocksdb_tbl_options->cache_index_and_filter_blocks_with_high_priority),
1679     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1680     "cache_index_and_filter_blocks_with_high_priority for RocksDB", nullptr,
1681     nullptr, true);
1682 
1683 // When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
1684 // LRU cache, but will always keep the filter & index blocks' handles checked
1685 // out (i.e. it won't call ShardedLRUCache::Release), so the parsed-out objects
1686 // are never evicted from the LRU cache; in effect they are pinned.
1687 //
1688 // This fixes the mutex contention between ShardedLRUCache::Lookup and
1689 // ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
1690 // index / QPS using PK).
1691 static MYSQL_SYSVAR_BOOL(
1692     pin_l0_filter_and_index_blocks_in_cache,
1693     *reinterpret_cast<my_bool *>(
1694         &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
1695     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1696     "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
1697     true);
1698 
1699 static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
1700                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1701                          "BlockBasedTableOptions::index_type for RocksDB",
1702                          nullptr, nullptr,
1703                          (uint64_t)rocksdb_tbl_options->index_type,
1704                          &index_type_typelib);
1705 
1706 static MYSQL_SYSVAR_BOOL(
1707     hash_index_allow_collision,
1708     *reinterpret_cast<my_bool *>(
1709         &rocksdb_tbl_options->hash_index_allow_collision),
1710     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1711     "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
1712     nullptr, rocksdb_tbl_options->hash_index_allow_collision);
1713 
1714 static MYSQL_SYSVAR_BOOL(
1715     no_block_cache,
1716     *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
1717     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1718     "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
1719     rocksdb_tbl_options->no_block_cache);
1720 
1721 static MYSQL_SYSVAR_ULONG(block_size, rocksdb_tbl_options->block_size,
1722                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1723                           "BlockBasedTableOptions::block_size for RocksDB",
1724                           nullptr, nullptr, rocksdb_tbl_options->block_size,
1725                           /* min */ 1024L, /* max */ LONG_MAX, 0);
1726 
1727 static MYSQL_SYSVAR_INT(
1728     block_size_deviation, rocksdb_tbl_options->block_size_deviation,
1729     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1730     "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
1731     nullptr, rocksdb_tbl_options->block_size_deviation,
1732     /* min */ 0, /* max */ INT_MAX, 0);
1733 
1734 static MYSQL_SYSVAR_INT(
1735     block_restart_interval, rocksdb_tbl_options->block_restart_interval,
1736     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1737     "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
1738     nullptr, rocksdb_tbl_options->block_restart_interval,
1739     /* min */ 1, /* max */ INT_MAX, 0);
1740 
1741 static MYSQL_SYSVAR_BOOL(
1742     whole_key_filtering,
1743     *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
1744     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1745     "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
1746     rocksdb_tbl_options->whole_key_filtering);
1747 
1748 static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
1749                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1750                         "default cf options for RocksDB", nullptr, nullptr,
1751                         "compression=kLZ4Compression;"
1752                         "bottommost_compression=kLZ4Compression");
1753 
1754 static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
1755                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1756                         "option overrides per cf for RocksDB", nullptr, nullptr,
1757                         "");
1758 
1759 static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
1760                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC |
1761                             PLUGIN_VAR_NOCMDOPT,
1762                         "Option updates per column family for RocksDB",
1763                         rocksdb_validate_update_cf_options,
1764                         rocksdb_set_update_cf_options, nullptr);
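// A sketch of how this is typically set at runtime; the column family name and
// option values below are only illustrative:
//   SET GLOBAL rocksdb_update_cf_options =
//       'cf1={write_buffer_size=8m;target_file_size_base=2m};';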
1765 
1766 static MYSQL_SYSVAR_BOOL(use_default_sk_cf, rocksdb_use_default_sk_cf,
1767                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1768                          "Use default_sk for secondary keys", nullptr, nullptr,
1769                          false);
1770 
1771 static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
1772                          rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
1773                          "Sync on transaction commit. Similar to "
1774                          "innodb_flush_log_at_trx_commit. 1: sync on commit, "
1775                          "0,2: do not sync on commit",
1776                          rocksdb_validate_flush_log_at_trx_commit, nullptr,
1777                          /* default */ FLUSH_LOG_SYNC,
1778                          /* min */ FLUSH_LOG_NEVER,
1779                          /* max */ FLUSH_LOG_BACKGROUND, 0);
1780 
1781 static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
1782                          "WriteOptions::disableWAL for RocksDB", nullptr,
1783                          nullptr, rocksdb::WriteOptions().disableWAL);
1784 
1785 static MYSQL_THDVAR_BOOL(
1786     write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
1787     "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
1788     nullptr, rocksdb::WriteOptions().ignore_missing_column_families);
1789 
1790 static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
1791                          "Skip filling block cache on read requests", nullptr,
1792                          nullptr, false);
1793 
1794 static MYSQL_THDVAR_BOOL(
1795     unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
1796     "Allow statement-based binary logging, which may break consistency",
1797     nullptr, nullptr, false);
1798 
1799 static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
1800                          "Used to override the result of records_in_range(). "
1801                          "Set to a positive number to override",
1802                          nullptr, nullptr, 0,
1803                          /* min */ 0, /* max */ INT_MAX, 0);
1804 
1805 static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
1806                          "Used to override the result of records_in_range() "
1807                          "when FORCE INDEX is used.",
1808                          nullptr, nullptr, 0,
1809                          /* min */ 0, /* max */ INT_MAX, 0);
1810 
1811 static MYSQL_SYSVAR_UINT(
1812     debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
1813     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
1814     "Test-only option to override rocksdb estimates of table size in a memtable",
1815     nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);
1816 
1817 static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
1818                          rocksdb_force_compute_memtable_stats,
1819                          PLUGIN_VAR_RQCMDARG,
1820                          "Force memtable stats to always be computed", nullptr,
1821                          nullptr, true);
1822 
1823 static MYSQL_SYSVAR_UINT(
1824     force_compute_memtable_stats_cachetime,
1825     rocksdb_force_compute_memtable_stats_cachetime, PLUGIN_VAR_RQCMDARG,
1826     "Time in usecs to cache memtable estimates", nullptr, nullptr,
1827     /* default */ RDB_DEFAULT_FORCE_COMPUTE_MEMTABLE_STATS_CACHETIME,
1828     /* min */ 0, /* max */ INT_MAX, 0);
1829 
1830 static MYSQL_SYSVAR_BOOL(
1831     debug_optimizer_no_zero_cardinality,
1832     rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
1833     "If the cardinality is zero, override it with some value", nullptr,
1834     nullptr, true);
1835 
1836 static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
1837                         PLUGIN_VAR_RQCMDARG, "Compact column family",
1838                         rocksdb_compact_column_family,
1839                         rocksdb_compact_column_family_stub, "");
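// Assigning a column family name requests a manual compaction of that CF, e.g.
//   SET GLOBAL rocksdb_compact_cf = 'default';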
1840 
1841 static MYSQL_SYSVAR_STR(delete_cf, rocksdb_delete_cf_name, PLUGIN_VAR_RQCMDARG,
1842                         "Delete column family", rocksdb_delete_column_family,
1843                         rocksdb_delete_column_family_stub, "");
1844 
1845 static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
1846                         PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
1847                         rocksdb_create_checkpoint,
1848                         rocksdb_create_checkpoint_stub, "");
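// Assigning a directory path creates a RocksDB checkpoint in that directory
// (commonly used for hot backups); the path below is only illustrative:
//   SET GLOBAL rocksdb_create_checkpoint = '/path/to/checkpoint1';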
1849 
1850 static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
1851                          rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
1852                          "Wake up drop index thread", nullptr,
1853                          rocksdb_drop_index_wakeup_thread, false);
1854 
1855 static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
1856                          PLUGIN_VAR_RQCMDARG,
1857                          "Disable all rocksdb background operations", nullptr,
1858                          rocksdb_set_pause_background_work, false);
1859 
1860 static MYSQL_SYSVAR_BOOL(enable_native_partition,
1861                          rocksdb_enable_native_partition, PLUGIN_VAR_READONLY,
1862                          "Enable native partitioning", nullptr, nullptr, false);
1863 
1864 static MYSQL_SYSVAR_BOOL(
1865     enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
1866     "Enable expired TTL records to be dropped during compaction.", nullptr,
1867     nullptr, true);
1868 
1869 static MYSQL_SYSVAR_BOOL(
1870     enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
1871     PLUGIN_VAR_RQCMDARG,
1872     "For tables with TTL, expired records are skipped/filtered out during "
1873     "processing and in query results. Disabling this will allow these records "
1874     "to be seen, but as a result rows may disappear in the middle of "
1875     "transactions as they are dropped during compaction. Use with caution.",
1876     nullptr, nullptr, true);
1877 
1878 static MYSQL_SYSVAR_INT(
1879     debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
1880     "For debugging purposes only.  Overrides the TTL of records to "
1881     "now() + debug_ttl_rec_ts.  The value can be +/- to simulate "
1882     "a record inserted in the past vs a record inserted in the 'future'. "
1883     "A value of 0 denotes that the variable is not set. This variable is a "
1884     "no-op in non-debug builds.",
1885     nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);
1886 
1887 static MYSQL_SYSVAR_INT(
1888     debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
1889     "For debugging purposes only.  Sets the snapshot during compaction to "
1890     "now() + debug_set_ttl_snapshot_ts.  The value can be +/- to simulate "
1891     "a snapshot in the past vs a snapshot created in the 'future'. "
1892     "A value of 0 denotes that the variable is not set. This variable is a "
1893     "no-op in non-debug builds.",
1894     nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);
1895 
1896 static MYSQL_SYSVAR_INT(
1897     debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
1898     PLUGIN_VAR_RQCMDARG,
1899     "For debugging purposes only.  Overrides the TTL read filtering time to "
1900     "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
1901     "is not set. This variable is a no-op in non-debug builds.",
1902     nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);
1903 
1904 static MYSQL_SYSVAR_BOOL(
1905     debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
1906     "For debugging purposes only. If true, compaction filtering will not occur "
1907     "on PK TTL data. This variable is a no-op in non-debug builds.",
1908     nullptr, nullptr, false);
1909 
1910 static MYSQL_SYSVAR_UINT(
1911     max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG,
1912     "Maximum number of pending + ongoing manual compactions.",
1913     nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0);
1914 
1915 static MYSQL_SYSVAR_BOOL(
1916     rollback_on_timeout, rocksdb_rollback_on_timeout, PLUGIN_VAR_OPCMDARG,
1917     "Whether to roll back the complete transaction or a single statement on "
1918     "lock wait timeout (a single statement by default)",
1919     nullptr, nullptr, false);
1920 
1921 static MYSQL_SYSVAR_UINT(
1922     debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay,
1923     PLUGIN_VAR_RQCMDARG,
1924     "For debugging purposes only. Sleep the specified number of seconds "
1925     "to simulate long-running compactions.",
1926     nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0);
1927 
1928 static MYSQL_SYSVAR_BOOL(
1929     reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
1930     "Reset the RocksDB internal statistics without restarting the DB.", nullptr,
1931     rocksdb_set_reset_stats, false);
1932 
1933 static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
1934                          PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
1935                          "Enable ignoring unknown options passed to RocksDB",
1936                          nullptr, nullptr, true);
1937 
1938 static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
1939                          PLUGIN_VAR_RQCMDARG,
1940                          "Enforce case sensitive collation for MyRocks indexes",
1941                          nullptr, nullptr, true);
1942 
1943 static MYSQL_SYSVAR_STR(strict_collation_exceptions,
1944                         rocksdb_strict_collation_exceptions,
1945                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
1946                         "Regex that describes the set of tables that are excluded "
1947                         "from the case sensitive collation enforcement",
1948                         nullptr, rocksdb_set_collation_exception_list, "");
1949 
1950 static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
1951                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1952                          "Enables collecting SST file properties on each flush",
1953                          nullptr, nullptr, rocksdb_collect_sst_properties);
1954 
1955 static MYSQL_SYSVAR_BOOL(
1956     force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
1957     PLUGIN_VAR_RQCMDARG,
1958     "Forces memtable flush which may block all write requests so be careful",
1959     rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
1960     false);
1961 
1962 static MYSQL_SYSVAR_BOOL(
1963     force_flush_memtable_and_lzero_now,
1964     rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
1965     "Acts similarly to force_flush_memtable_now, but also compacts all L0 files.",
1966     rocksdb_force_flush_memtable_and_lzero_now,
1967     rocksdb_force_flush_memtable_and_lzero_now_stub, false);
1968 
1969 static MYSQL_SYSVAR_UINT(
1970     seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
1971     PLUGIN_VAR_RQCMDARG,
1972     "Sets the number of seconds to wait between optimizer stats recomputations. "
1973     "Only changed indexes will be refreshed.",
1974     nullptr, nullptr, rocksdb_seconds_between_stat_computes,
1975     /* min */ 0L, /* max */ UINT_MAX, 0);
1976 
1977 static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
1978                              rocksdb_compaction_sequential_deletes,
1979                              PLUGIN_VAR_RQCMDARG,
1980                              "RocksDB will trigger compaction for the file if "
1981                              "it has more than this number of sequential deletes "
1982                              "per window",
1983                              nullptr, rocksdb_set_compaction_options,
1984                              DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
1985                              /* min */ 0L,
1986                              /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);
1987 
1988 static MYSQL_SYSVAR_LONGLONG(
1989     compaction_sequential_deletes_window,
1990     rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
1991     "Size of the window for counting rocksdb_compaction_sequential_deletes",
1992     nullptr, rocksdb_set_compaction_options,
1993     DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
1994     /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);
1995 
1996 static MYSQL_SYSVAR_LONGLONG(
1997     compaction_sequential_deletes_file_size,
1998     rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
1999     "Minimum file size required for compaction_sequential_deletes", nullptr,
2000     rocksdb_set_compaction_options, 0L,
2001     /* min */ -1L, /* max */ LLONG_MAX, 0);
2002 
2003 static MYSQL_SYSVAR_BOOL(
2004     compaction_sequential_deletes_count_sd,
2005     rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
2006     "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
2007     nullptr, rocksdb_compaction_sequential_deletes_count_sd);
2008 
2009 static MYSQL_SYSVAR_BOOL(
2010     print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
2011     PLUGIN_VAR_RQCMDARG,
2012     "Logging queries that got snapshot conflict errors into *.err log", nullptr,
2013     nullptr, rocksdb_print_snapshot_conflict_queries);
2014 
2015 static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
2016                         "Percentage of rows to be checksummed",
2017                         nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
2018                         /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);
2019 
2020 static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
2021                          "Include checksums when writing index/table records",
2022                          nullptr, nullptr, false /* default value */);
2023 
2024 static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
2025                          "Verify checksums when reading index/table records",
2026                          nullptr, nullptr, false /* default value */);
2027 
2028 static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG,
2029                          "Skip holding any lock on row access. "
2030                          "Not effective on slave.",
2031                          nullptr, nullptr, false);
2032 
2033 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
2034 static MYSQL_SYSVAR_UINT(
2035     validate_tables, rocksdb_validate_tables,
2036     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
2037     "Verify all .frm files match all RocksDB tables (0 means no verification, "
2038     "1 means verify and fail on error, and 2 means verify but continue)",
2039     nullptr, nullptr, 1 /* default value */, 0 /* min value */,
2040     2 /* max value */, 0);
2041 #endif  // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
2042         // ROCKSDB_INCLUDE_VALIDATE_TABLES
2043 
2044 static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
2045                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
2046                         "RocksDB data directory", nullptr, nullptr,
2047                         "./.rocksdb");
2048 
2049 static MYSQL_SYSVAR_UINT(
2050     table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
2051     PLUGIN_VAR_RQCMDARG,
2052     "Percentage of entries to sample when collecting statistics about table "
2053     "properties. Specify either 0 to sample everything or percentage "
2054     "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".."
2055     STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
2056     "By default " STRINGIFY_ARG(RDB_DEFAULT_TBL_STATS_SAMPLE_PCT)
2057     "% of entries are sampled.",
2071     nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
2072     RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
2073     /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);
2074 
2075 static MYSQL_SYSVAR_UINT(table_stats_recalc_threshold_pct,
2076                          rocksdb_table_stats_recalc_threshold_pct,
2077                          PLUGIN_VAR_RQCMDARG,
2078                          "Percentage of modified rows over the total "
2079                          "number of rows required to trigger stats recalculation",
2080                          nullptr, nullptr, /* default */
2081                          rocksdb_table_stats_recalc_threshold_pct,
2082                          /* everything */ 0,
2083                          /* max */ RDB_TBL_STATS_RECALC_THRESHOLD_PCT_MAX, 0);
2084 
2085 static MYSQL_SYSVAR_ULONGLONG(
2086     table_stats_recalc_threshold_count,
2087     rocksdb_table_stats_recalc_threshold_count, PLUGIN_VAR_RQCMDARG,
2088     "Number of modified rows to trigger stats recalculation", nullptr,
2089     nullptr, /* default */
2090     rocksdb_table_stats_recalc_threshold_count,
2091     /* everything */ 0,
2092     /* max */ UINT64_MAX, 0);
2093 
2094 static MYSQL_SYSVAR_INT(
2095     table_stats_background_thread_nice_value,
2096     rocksdb_table_stats_background_thread_nice_value, PLUGIN_VAR_RQCMDARG,
2097     "nice value for index stats", rocksdb_index_stats_thread_renice, nullptr,
2098     /* default */ rocksdb_table_stats_background_thread_nice_value,
2099     /* min */ THREAD_PRIO_MIN, /* max */ THREAD_PRIO_MAX, 0);
2100 
2101 static MYSQL_SYSVAR_ULONGLONG(
2102     table_stats_max_num_rows_scanned, rocksdb_table_stats_max_num_rows_scanned,
2103     PLUGIN_VAR_RQCMDARG,
2104     "The maximum number of rows to scan in table scan based "
2105     "cardinality calculation",
2106     nullptr, nullptr, /* default */
2107     0, /* everything */ 0,
2108     /* max */ UINT64_MAX, 0);
2109 
2110 static MYSQL_SYSVAR_UINT(
2111     stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG,
2112     "The number of indexes per second to recalculate statistics for. 0 to "
2113     "disable background recalculation.",
2114     nullptr, nullptr, 0 /* default value */, 0 /* min value */,
2115     UINT_MAX /* max value */, 0);
2116 
2117 static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan,
2118                          rocksdb_table_stats_use_table_scan,
2119                          PLUGIN_VAR_RQCMDARG,
2120                          "Enable table-scan-based index statistics calculation.", nullptr,
2121                          rocksdb_update_table_stats_use_table_scan,
2122                          rocksdb_table_stats_use_table_scan);
2123 
2124 static MYSQL_SYSVAR_BOOL(
2125     large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
2126     "Support large index prefix length of 3072 bytes. If off, the maximum "
2127     "index prefix length is 767.",
2128     nullptr, nullptr, false);
2129 
2130 static MYSQL_SYSVAR_BOOL(
2131     allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption,
2132     PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
2133     "Allow server to start successfully when RocksDB corruption is detected.",
2134     nullptr, nullptr, false);
2135 
2136 static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation,
2137                          rocksdb_error_on_suboptimal_collation,
2138                          PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
2139                          "Raise an error instead of warning if a sub-optimal "
2140                          "collation is used",
2141                          nullptr, nullptr, false);
2142 
2143 static MYSQL_SYSVAR_BOOL(
2144     no_create_column_family, rocksdb_no_create_column_family,
2145     PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
2146     "Do not allow creation of new Column Families through index comments.",
2147     nullptr, nullptr, false);
2148 
2149 static MYSQL_SYSVAR_BOOL(
2150     enable_insert_with_update_caching,
2151     rocksdb_enable_insert_with_update_caching, PLUGIN_VAR_OPCMDARG,
2152     "Whether to enable the optimization where we cache the read from a failed "
2153     "insertion attempt in INSERT ON DUPLICATE KEY UPDATE",
2154     nullptr, nullptr, true);
2155 
2156 static MYSQL_SYSVAR_STR(
2157     trace_block_cache_access, rocksdb_block_cache_trace_options_str,
2158     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
2159     "Block cache trace option string. The format is "
2160     "sampling_frequency:max_trace_file_size:trace_file_name. "
2161     "sampling_frequency and max_trace_file_size are positive integers. The "
2162     "block accesses are saved to the "
2163     "rocksdb_datadir/block_cache_traces/trace_file_name.",
2164     rocksdb_trace_block_cache_access, nullptr, "");
2165 
2166 static MYSQL_SYSVAR_STR(
2167     trace_queries, rocksdb_trace_options_str,
2168     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
2169     "Trace option string. The format is "
2170     "sampling_frequency:max_trace_file_size:trace_file_name. "
2171     "sampling_frequency and max_trace_file_size are positive integers. The "
2172     "queries are saved to the "
2173     "rocksdb_datadir/queries_traces/trace_file_name.",
2174     rocksdb_trace_queries, nullptr, "");
2175 
2176 static MYSQL_SYSVAR_BOOL(skip_locks_if_skip_unique_check,
2177                          rocksdb_skip_locks_if_skip_unique_check,
2178                          PLUGIN_VAR_RQCMDARG,
2179                          "Skip row locking when unique checks are disabled.",
2180                          nullptr, nullptr, false);
2181 
2182 static MYSQL_SYSVAR_BOOL(
2183     alter_column_default_inplace, rocksdb_alter_column_default_inplace,
2184     PLUGIN_VAR_RQCMDARG,
2185     "Allow inplace alter for alter column default operation", nullptr, nullptr,
2186     true);
2187 
2188 static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
2189 
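// The system variables defined above are collected in this array so that they
// are registered with the server when the plugin is declared.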
2190 static struct st_mysql_sys_var *rocksdb_system_variables[] = {
2191     MYSQL_SYSVAR(lock_wait_timeout),
2192     MYSQL_SYSVAR(deadlock_detect),
2193     MYSQL_SYSVAR(deadlock_detect_depth),
2194     MYSQL_SYSVAR(commit_time_batch_for_recovery),
2195     MYSQL_SYSVAR(max_row_locks),
2196     MYSQL_SYSVAR(write_batch_max_bytes),
2197     MYSQL_SYSVAR(write_batch_flush_threshold),
2198     MYSQL_SYSVAR(lock_scanned_rows),
2199     MYSQL_SYSVAR(bulk_load),
2200     MYSQL_SYSVAR(bulk_load_allow_sk),
2201     MYSQL_SYSVAR(bulk_load_allow_unsorted),
2202     MYSQL_SYSVAR(trace_sst_api),
2203     MYSQL_SYSVAR(commit_in_the_middle),
2204     MYSQL_SYSVAR(blind_delete_primary_key),
2205     MYSQL_SYSVAR(enable_iterate_bounds),
2206 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
2207     MYSQL_SYSVAR(read_free_rpl_tables),
2208     MYSQL_SYSVAR(read_free_rpl),
2209 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
2210     MYSQL_SYSVAR(rpl_skip_tx_api),
2211     MYSQL_SYSVAR(bulk_load_size),
2212     MYSQL_SYSVAR(merge_buf_size),
2213     MYSQL_SYSVAR(enable_bulk_load_api),
2214     MYSQL_SYSVAR(enable_pipelined_write),
2215     MYSQL_SYSVAR(enable_remove_orphaned_dropped_cfs),
2216     MYSQL_SYSVAR(tmpdir),
2217     MYSQL_SYSVAR(merge_combine_read_size),
2218     MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
2219     MYSQL_SYSVAR(skip_bloom_filter_on_read),
2220 
2221     MYSQL_SYSVAR(create_if_missing),
2222     MYSQL_SYSVAR(concurrent_prepare),
2223     MYSQL_SYSVAR(two_write_queues),
2224     MYSQL_SYSVAR(manual_wal_flush),
2225     MYSQL_SYSVAR(write_policy),
2226     MYSQL_SYSVAR(create_missing_column_families),
2227     MYSQL_SYSVAR(error_if_exists),
2228     MYSQL_SYSVAR(paranoid_checks),
2229     MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
2230     MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
2231     MYSQL_SYSVAR(delayed_write_rate),
2232     MYSQL_SYSVAR(max_latest_deadlocks),
2233     MYSQL_SYSVAR(info_log_level),
2234     MYSQL_SYSVAR(max_open_files),
2235     MYSQL_SYSVAR(max_total_wal_size),
2236     MYSQL_SYSVAR(use_fsync),
2237     MYSQL_SYSVAR(wal_dir),
2238     MYSQL_SYSVAR(persistent_cache_path),
2239     MYSQL_SYSVAR(persistent_cache_size_mb),
2240     MYSQL_SYSVAR(delete_obsolete_files_period_micros),
2241     MYSQL_SYSVAR(max_background_jobs),
2242     MYSQL_SYSVAR(max_background_flushes),
2243     MYSQL_SYSVAR(max_background_compactions),
2244     MYSQL_SYSVAR(max_bottom_pri_background_compactions),
2245     MYSQL_SYSVAR(max_log_file_size),
2246     MYSQL_SYSVAR(max_subcompactions),
2247     MYSQL_SYSVAR(log_file_time_to_roll),
2248     MYSQL_SYSVAR(keep_log_file_num),
2249     MYSQL_SYSVAR(max_manifest_file_size),
2250     MYSQL_SYSVAR(table_cache_numshardbits),
2251     MYSQL_SYSVAR(wal_ttl_seconds),
2252     MYSQL_SYSVAR(wal_size_limit_mb),
2253     MYSQL_SYSVAR(manifest_preallocation_size),
2254     MYSQL_SYSVAR(use_direct_reads),
2255     MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
2256     MYSQL_SYSVAR(allow_mmap_reads),
2257     MYSQL_SYSVAR(allow_mmap_writes),
2258     MYSQL_SYSVAR(is_fd_close_on_exec),
2259     MYSQL_SYSVAR(stats_dump_period_sec),
2260     MYSQL_SYSVAR(advise_random_on_open),
2261     MYSQL_SYSVAR(db_write_buffer_size),
2262     MYSQL_SYSVAR(use_adaptive_mutex),
2263     MYSQL_SYSVAR(bytes_per_sync),
2264     MYSQL_SYSVAR(wal_bytes_per_sync),
2265     MYSQL_SYSVAR(enable_thread_tracking),
2266     MYSQL_SYSVAR(perf_context_level),
2267     MYSQL_SYSVAR(wal_recovery_mode),
2268     MYSQL_SYSVAR(track_and_verify_wals_in_manifest),
2269     MYSQL_SYSVAR(stats_level),
2270     MYSQL_SYSVAR(access_hint_on_compaction_start),
2271     MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
2272     MYSQL_SYSVAR(compaction_readahead_size),
2273     MYSQL_SYSVAR(allow_concurrent_memtable_write),
2274     MYSQL_SYSVAR(enable_write_thread_adaptive_yield),
2275 
2276     MYSQL_SYSVAR(block_cache_size),
2277     MYSQL_SYSVAR(sim_cache_size),
2278     MYSQL_SYSVAR(cache_high_pri_pool_ratio),
2279     MYSQL_SYSVAR(cache_dump),
2280     MYSQL_SYSVAR(cache_index_and_filter_blocks),
2281     MYSQL_SYSVAR(cache_index_and_filter_with_high_priority),
2282     MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
2283     MYSQL_SYSVAR(index_type),
2284     MYSQL_SYSVAR(hash_index_allow_collision),
2285     MYSQL_SYSVAR(no_block_cache),
2286     MYSQL_SYSVAR(block_size),
2287     MYSQL_SYSVAR(block_size_deviation),
2288     MYSQL_SYSVAR(block_restart_interval),
2289     MYSQL_SYSVAR(whole_key_filtering),
2290 
2291     MYSQL_SYSVAR(default_cf_options),
2292     MYSQL_SYSVAR(override_cf_options),
2293     MYSQL_SYSVAR(update_cf_options),
2294     MYSQL_SYSVAR(use_default_sk_cf),
2295 
2296     MYSQL_SYSVAR(flush_log_at_trx_commit),
2297     MYSQL_SYSVAR(write_disable_wal),
2298     MYSQL_SYSVAR(write_ignore_missing_column_families),
2299 
2300     MYSQL_SYSVAR(skip_fill_cache),
2301     MYSQL_SYSVAR(unsafe_for_binlog),
2302 
2303     MYSQL_SYSVAR(records_in_range),
2304     MYSQL_SYSVAR(force_index_records_in_range),
2305     MYSQL_SYSVAR(debug_optimizer_n_rows),
2306     MYSQL_SYSVAR(force_compute_memtable_stats),
2307     MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
2308     MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),
2309 
2310     MYSQL_SYSVAR(compact_cf),
2311     MYSQL_SYSVAR(delete_cf),
2312     MYSQL_SYSVAR(signal_drop_index_thread),
2313     MYSQL_SYSVAR(pause_background_work),
2314     MYSQL_SYSVAR(ignore_unknown_options),
2315     MYSQL_SYSVAR(strict_collation_check),
2316     MYSQL_SYSVAR(strict_collation_exceptions),
2317     MYSQL_SYSVAR(collect_sst_properties),
2318     MYSQL_SYSVAR(force_flush_memtable_now),
2319     MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
2320     MYSQL_SYSVAR(enable_native_partition),
2321     MYSQL_SYSVAR(enable_ttl),
2322     MYSQL_SYSVAR(enable_ttl_read_filtering),
2323     MYSQL_SYSVAR(debug_ttl_rec_ts),
2324     MYSQL_SYSVAR(debug_ttl_snapshot_ts),
2325     MYSQL_SYSVAR(debug_ttl_read_filter_ts),
2326     MYSQL_SYSVAR(debug_ttl_ignore_pk),
2327     MYSQL_SYSVAR(reset_stats),
2328     MYSQL_SYSVAR(seconds_between_stat_computes),
2329 
2330     MYSQL_SYSVAR(compaction_sequential_deletes),
2331     MYSQL_SYSVAR(compaction_sequential_deletes_window),
2332     MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
2333     MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
2334     MYSQL_SYSVAR(print_snapshot_conflict_queries),
2335 
2336     MYSQL_SYSVAR(datadir),
2337     MYSQL_SYSVAR(create_checkpoint),
2338 
2339     MYSQL_SYSVAR(checksums_pct),
2340     MYSQL_SYSVAR(store_row_debug_checksums),
2341     MYSQL_SYSVAR(verify_row_debug_checksums),
2342     MYSQL_SYSVAR(master_skip_tx_api),
2343 
2344 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
2345     MYSQL_SYSVAR(validate_tables),
2346 #endif  // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
2347         // ROCKSDB_INCLUDE_VALIDATE_TABLES
2348     MYSQL_SYSVAR(table_stats_sampling_pct),
2349     MYSQL_SYSVAR(table_stats_recalc_threshold_pct),
2350     MYSQL_SYSVAR(table_stats_recalc_threshold_count),
2351     MYSQL_SYSVAR(table_stats_max_num_rows_scanned),
2352     MYSQL_SYSVAR(table_stats_use_table_scan),
2353     MYSQL_SYSVAR(table_stats_background_thread_nice_value),
2354 
2355     MYSQL_SYSVAR(large_prefix),
2356     MYSQL_SYSVAR(allow_to_start_after_corruption),
2357     MYSQL_SYSVAR(error_on_suboptimal_collation),
2358     MYSQL_SYSVAR(no_create_column_family),
2359     MYSQL_SYSVAR(stats_recalc_rate),
2360     MYSQL_SYSVAR(debug_manual_compaction_delay),
2361     MYSQL_SYSVAR(max_manual_compactions),
2362     MYSQL_SYSVAR(manual_compaction_threads),
2363     MYSQL_SYSVAR(manual_compaction_bottommost_level),
2364     MYSQL_SYSVAR(rollback_on_timeout),
2365 
2366     MYSQL_SYSVAR(enable_insert_with_update_caching),
2367     MYSQL_SYSVAR(trace_block_cache_access),
2368     MYSQL_SYSVAR(trace_queries),
2369 
2370     MYSQL_SYSVAR(skip_locks_if_skip_unique_check),
2371     MYSQL_SYSVAR(alter_column_default_inplace),
2372     nullptr};
2373 
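// Build the per-statement rocksdb::WriteOptions from the global
// rocksdb_flush_log_at_trx_commit policy and the session's
// write_disable_wal / write_ignore_missing_column_families settings.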
2374 static rocksdb::WriteOptions rdb_get_rocksdb_write_options(
2375     my_core::THD *const thd) {
2376   rocksdb::WriteOptions opt;
2377 
2378   opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
2379   opt.disableWAL = THDVAR(thd, write_disable_wal);
2380   opt.ignore_missing_column_families =
2381       THDVAR(thd, write_ignore_missing_column_families);
2382 
2383   return opt;
2384 }
2385 
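/*
  Update handler for the rocksdb_compact_cf system variable: assigning a
  column family name schedules a manual compaction of that CF and waits for
  it (polling every 100ms) unless the client thread is killed. Illustrative
  usage:
    SET GLOBAL rocksdb_compact_cf = 'default';
*/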
2386 static int rocksdb_compact_column_family(THD *const thd,
2387                                          struct st_mysql_sys_var *const var,
2388                                          void *const var_ptr,
2389                                          struct st_mysql_value *const value) {
2390   char buff[STRING_BUFFER_USUAL_SIZE];
2391   int len = sizeof(buff);
2392 
2393   assert(value != nullptr);
2394 
2395   if (const char *const cf = value->val_str(value, buff, &len)) {
2396     DBUG_EXECUTE_IF("rocksdb_compact_column_family", {
2397       static constexpr char act[] =
2398           "now signal ready_to_mark_cf_dropped_in_compact_column_family "
2399           "wait_for mark_cf_dropped_done_in_compact_column_family";
2400       assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
2401     });
2402 
2403     std::string cf_name = std::string(cf);
2404     // use rocksdb_compact_cf="" or "default" to compact default CF
2405     if (cf_name.empty()) cf_name = DEFAULT_CF_NAME;
2406 
2407     auto cfh = cf_manager.get_cf(cf_name);
2408     if (cfh != nullptr && rdb != nullptr) {
2409       rocksdb::BottommostLevelCompaction bottommost_level_compaction =
2410           (rocksdb::BottommostLevelCompaction)THDVAR(
2411               thd, manual_compaction_bottommost_level);
2412 
2413       int mc_id = rdb_mc_thread.request_manual_compaction(
2414           cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads),
2415           bottommost_level_compaction);
2416       if (mc_id == -1) {
2417         my_error(ER_INTERNAL_ERROR, MYF(0),
2418                  "Can't schedule more manual compactions. "
2419                  "Increase rocksdb_max_manual_compactions or stop issuing "
2420                  "more manual compactions.");
2421         return HA_EXIT_FAILURE;
2422       } else if (mc_id < 0) {
2423         return HA_EXIT_FAILURE;
2424       }
2425       // NO_LINT_DEBUG
2426       sql_print_information("RocksDB: Manual compaction of column family: %s\n",
2427                             cf);
2428       // Check the thd state in short (100ms) cycles so that this function
2429       // can return without waiting for CompactRange to finish.
2430       do {
2431         my_sleep(100000);
2432       } while (!thd->killed &&
2433                !rdb_mc_thread.is_manual_compaction_finished(mc_id));
2434 
2435       if (thd->killed) {
2436         // This cancels if requested compaction state is INITED.
2437         // TODO(yoshinorim): Cancel running compaction as well once
2438         // it is supported in RocksDB.
2439         rdb_mc_thread.clear_manual_compaction_request(mc_id, true);
2440       }
2441     }
2442   }
2443   return HA_EXIT_SUCCESS;
2444 }
2445 
2446 /*
2447  * Serializes an xid to a string so that it can
2448  * be used as a rocksdb transaction name
2449  */
2450 static std::string rdb_xid_to_string(const XID &src) {
2451   assert(src.get_gtrid_length() >= 0);
2452   assert(src.get_gtrid_length() <= MAXGTRIDSIZE);
2453   assert(src.get_bqual_length() >= 0);
2454   assert(src.get_bqual_length() <= MAXBQUALSIZE);
2455 
2456   std::string buf;
2457   buf.reserve(RDB_XIDHDR_LEN + src.get_gtrid_length() + src.get_bqual_length());
2458 
2459   /*
2460    * expand formatID to fill 8 bytes if it doesn't already
2461    * then reinterpret bit pattern as unsigned and store in network order
2462    */
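  // Resulting buffer layout (see below): 8-byte big-endian format_id,
  // 1-byte gtrid length, 1-byte bqual length, then the raw gtrid+bqual bytes.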
2463   uchar fidbuf[RDB_FORMATID_SZ];
2464   int64 signed_fid8 = src.get_format_id();
2465   const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
2466   rdb_netbuf_store_uint64(fidbuf, raw_fid8);
2467   buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);
2468 
2469   buf.push_back(src.get_gtrid_length());
2470   buf.push_back(src.get_bqual_length());
2471   buf.append(src.get_data(),
2472              (src.get_gtrid_length()) + (src.get_bqual_length()));
2473   return buf;
2474 }
2475 
2476 ///////////////////////////////////////////////////////////////////////////////////////////
2477 
2478 /*
2479   Drop index thread's control
2480 */
2481 
2482 static void rocksdb_drop_index_wakeup_thread(
2483     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
2484     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
2485     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
2486   if (*static_cast<const bool *>(save)) {
2487     rdb_drop_idx_thread.signal();
2488   }
2489 }
2490 
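// Resolve the effective RocksDB perf context level for this connection:
// prefer the session-level setting, fall back to the global default, and
// disable perf context tracking if neither is set to a valid level.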
2491 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
2492   assert(thd != nullptr);
2493 
2494   const int session_perf_context_level = THDVAR(thd, perf_context_level);
2495   if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2496     return session_perf_context_level;
2497   }
2498 
2499   /*
2500     Fall back to the global thdvar if the session-specific one was not set
2501     to a valid value.
2502   */
2503 
2504   const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
2505   if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2506     return global_perf_context_level;
2507   }
2508 
2509   return rocksdb::PerfLevel::kDisable;
2510 }
2511 
2512 /*
2513   Very short (functor-like) interface to be passed to
2514   Rdb_transaction::walk_tx_list()
2515 */
2516 
2517 interface Rdb_tx_list_walker {
2518   virtual ~Rdb_tx_list_walker() {}
2519   virtual void process_tran(const Rdb_transaction *const) = 0;
2520 };
2521 
2522 /*
2523   This is a helper class that is passed to RocksDB to get notifications when
2524   a snapshot gets created.
2525 */
2526 
2527 class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
2528   Rdb_transaction *m_owning_tx;
2529 
2530   void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;
2531 
2532  public:
2533   Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
2534   Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;
2535 
2536   explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
2537       : m_owning_tx(owning_tx) {}
2538 
2539   // If the owning Rdb_transaction gets destroyed, we must stop
2540   // referencing it.
2541   void detach() { m_owning_tx = nullptr; }
2542 };
2543 
2544 /* This is the base class for transactions when interacting with rocksdb.
2545  */
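/* Concrete implementations defined further below: Rdb_transaction_impl
   (rocksdb::Transaction based, takes row locks) and Rdb_writebatch_impl
   (plain WriteBatchWithIndex, no locking). */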
2546 class Rdb_transaction {
2547  protected:
2548   ulonglong m_write_count = 0;
2549   // per row data
2550   ulonglong m_row_lock_count = 0;
2551   std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;
2552 
2553   bool m_is_delayed_snapshot = false;
2554 
2555   std::unordered_set<Rdb_tbl_def*> modified_tables;
2556 
2557  private:
2558   /*
2559     Number of write operations this transaction had when we took the last
2560     savepoint (the idea is not to take another savepoint if we haven't made
2561     any changes)
2562   */
2563   ulonglong m_writes_at_last_savepoint;
2564 
2565  protected:
2566   THD *m_thd = nullptr;
2567 
2568   rocksdb::ReadOptions m_read_opts;
2569 
2570   static std::multiset<Rdb_transaction *> s_tx_list;
2571   static mysql_mutex_t s_tx_list_mutex;
2572 
2573   Rdb_io_perf *m_tbl_io_perf;
2574 
2575   bool m_tx_read_only = false;
2576 
2577   int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */
2578 
2579   /* Maximum number of locks the transaction can have */
2580   ulonglong m_max_row_locks;
2581 
2582   bool m_is_tx_failed = false;
2583   bool m_rollback_only = false;
2584 
2585   std::shared_ptr<Rdb_snapshot_notifier> m_notifier;
2586 
2587   // This should be used only when updating binlog information.
2588   virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
2589   virtual bool commit_no_binlog() = 0;
2590   virtual rocksdb::Iterator *get_iterator(
2591       const rocksdb::ReadOptions &options,
2592       rocksdb::ColumnFamilyHandle *column_family) = 0;
2593 
2594   /*
2595     @detail
2596       This function takes in the WriteBatch of the transaction to add
2597       all the AUTO_INCREMENT merges. It does so by iterating through
2598       m_auto_incr_map and then constructing key/value pairs to call merge upon.
2599 
2600     @param wb
2601    */
2602   rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) {
2603     DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", return rocksdb::Status::OK(););
2604 
2605     // Iterate through the merge map merging all keys into data dictionary.
2606     rocksdb::Status s;
2607     for (auto &it : m_auto_incr_map) {
2608       s = dict_manager.put_auto_incr_val(wb, it.first, it.second);
2609       if (!s.ok()) {
2610         return s;
2611       }
2612     }
2613     m_auto_incr_map.clear();
2614     return s;
2615   }
2616 
2617  protected:
2618   /*
2619     The following are helper functions to be overridden by child classes.
2620     They should provide RocksDB's savepoint semantics.
2621   */
2622   virtual void do_set_savepoint() = 0;
2623   virtual rocksdb::Status do_pop_savepoint() = 0;
2624   virtual void do_rollback_to_savepoint() = 0;
2625 
2626  public:
2627   int64_t m_snapshot_timestamp = 0;
2628   bool m_ddl_transaction;
2629 
2630   /*
2631     Tracks the number of tables in use through external_lock.
2632     This should not be reset during start_tx().
2633   */
2634   int64_t m_n_mysql_tables_in_use = 0;
2635 
2636   /*
2637     for distinction between rdb_transaction_impl and rdb_writebatch_impl
2638     when using walk tx list
2639   */
2640   virtual bool is_writebatch_trx() const = 0;
2641 
2642   static void init_mutex() {
2643     mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
2644   }
2645 
2646   static void term_mutex() {
2647     assert(s_tx_list.size() == 0);
2648     mysql_mutex_destroy(&s_tx_list_mutex);
2649   }
2650 
2651   static void walk_tx_list(Rdb_tx_list_walker *walker) {
2652     assert(walker != nullptr);
2653 
2654     RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
2655 
2656     for (auto it : s_tx_list) walker->process_tran(it);
2657 
2658     RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
2659   }
2660 
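  // Map a non-OK RocksDB status to a handler error code, bumping the
  // matching counters and, for lock wait timeouts / deadlocks, marking the
  // statement or transaction for rollback.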
2661   int set_status_error(THD *const thd, const rocksdb::Status &s,
2662                        const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def) {
2663     assert(!s.ok());
2664     assert(tbl_def != nullptr);
2665 
2666     if (s.IsTimedOut()) {
2667       /*
2668         The SQL layer has odd expectations. If we return an error when
2669         doing a read in DELETE IGNORE, it will ignore the error ("because it's
2670         an IGNORE command!") but then fail an assert, because "an error code
2671         was returned, but no error happened".  Do what InnoDB's
2672         convert_error_code_to_mysql() does: force a statement
2673         rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
2674         */
2675       thd->mark_transaction_to_rollback(
2676           static_cast<bool>(rocksdb_rollback_on_timeout));
2677 
2678       rocksdb_row_lock_wait_timeouts++;
2679 
2680       return HA_ERR_LOCK_WAIT_TIMEOUT;
2681     }
2682 
2683     if (s.IsDeadlock()) {
2684       thd->mark_transaction_to_rollback(true /* whole transaction */);
2685       rocksdb_row_lock_deadlocks++;
2686       return HA_ERR_LOCK_DEADLOCK;
2687     } else if (s.IsBusy()) {
2688       rocksdb_snapshot_conflict_errors++;
2689       if (rocksdb_print_snapshot_conflict_queries) {
2690         char user_host_buff[MAX_USER_HOST_SIZE + 1];
2691         make_user_name(thd->security_context(), user_host_buff);
2692         // NO_LINT_DEBUG
2693         sql_print_warning("Got snapshot conflict errors: User: %s Query: %.*s",
2694                           user_host_buff, static_cast<int>(thd->query().length),
2695                           thd->query().str);
2696       }
2697       return HA_ERR_ROCKSDB_STATUS_BUSY;
2698     }
2699 
2700     if (s.IsIOError() || s.IsCorruption()) {
2701       rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
2702     }
2703 
2704     return ha_rocksdb::rdb_error_to_mysql(s);
2705   }
2706 
2707   THD *get_thd() const { return m_thd; }
2708 
2709   /* Used for tracking io_perf counters */
2710   void io_perf_start(Rdb_io_perf *const io_perf) {
2711     /*
2712       Since perf_context is tracked per thread, it is difficult and expensive
2713       to maintain perf_context on a per table basis. Therefore, roll all
2714       perf_context data into the first table used in a query. This works well
2715       for single table queries and is probably good enough for queries that hit
2716       multiple tables.
2717 
2718       perf_context stats gathering is started when the table lock is acquired
2719       or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
2720       are recorded when the table lock is released, or when commit/rollback
2721       is called on the transaction, whichever comes first. Table lock release
2722       and commit/rollback can happen in different orders. In the case where
2723       the lock is released before commit/rollback is called, an extra step to
2724       gather stats during commit/rollback is needed.
2725     */
2726     if (m_tbl_io_perf == nullptr &&
2727         io_perf->start(rocksdb_perf_context_level(m_thd))) {
2728       m_tbl_io_perf = io_perf;
2729     }
2730   }
2731 
2732   void io_perf_end_and_record(void) {
2733     if (m_tbl_io_perf != nullptr) {
2734       m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
2735       m_tbl_io_perf = nullptr;
2736     }
2737   }
2738 
2739   void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
2740     if (m_tbl_io_perf == io_perf) {
2741       io_perf_end_and_record();
2742     }
2743   }
2744 
2745   void set_params(int timeout_sec_arg, int max_row_locks_arg) {
2746     m_timeout_sec = timeout_sec_arg;
2747     m_max_row_locks = max_row_locks_arg;
2748     set_lock_timeout(timeout_sec_arg);
2749   }
2750 
2751   virtual void set_lock_timeout(int timeout_sec_arg) = 0;
2752 
2753   ulonglong get_write_count() const { return m_write_count; }
2754 
2755   ulonglong get_row_lock_count() const { return m_row_lock_count; }
2756 
2757   void incr_row_lock_count() { ++m_row_lock_count; }
2758 
2759   ulonglong get_max_row_lock_count() const { return m_max_row_locks; }
2760 
2761   int get_timeout_sec() const { return m_timeout_sec; }
2762 
2763   virtual void set_sync(bool sync) = 0;
2764 
2765   virtual void release_lock(const Rdb_key_def &key_descr,
2766                             const std::string &rowkey) = 0;
2767 
2768   virtual bool prepare() = 0;
2769 
2770   bool commit_or_rollback() {
2771     bool res;
2772     if (m_is_tx_failed) {
2773       rollback();
2774       res = false;
2775     } else {
2776       res = commit();
2777     }
2778     return res;
2779   }
2780 
2781   bool commit() {
2782     if (get_write_count() == 0) {
2783       rollback();
2784       return false;
2785     } else if (m_rollback_only) {
2786       /*
2787         Transactions marked as rollback_only are expected to be rolled back at
2788         prepare(). But there are exceptions, such as the following, where
2789         prepare() is never called and commit() is called instead:
2790          1. Binlog is disabled
2791          2. No modification exists in the binlog cache for the transaction (#195)
2792         In both cases, rolling back the transaction is safe. Nothing has been
2793         written to the binlog.
2794        */
2795       my_error(ER_ROLLBACK_ONLY, MYF(0));
2796       rollback();
2797       return true;
2798     } else {
2799       return commit_no_binlog();
2800     }
2801   }
2802 
2803   virtual void rollback() = 0;
2804 
2805   void snapshot_created(const rocksdb::Snapshot *const snapshot) {
2806     assert(snapshot != nullptr);
2807 
2808     m_read_opts.snapshot = snapshot;
2809     rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
2810     m_is_delayed_snapshot = false;
2811   }
2812 
2813   virtual void acquire_snapshot(bool acquire_now) = 0;
2814   virtual void release_snapshot() = 0;
2815 
2816   bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
2817 
2818  private:
2819   // The Rdb_sst_info structures we are currently loading.  In a partitioned
2820   // table this can have more than one entry
2821   std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
2822   std::string m_curr_bulk_load_tablename;
2823 
2824   /* External merge sorts for bulk load: key ID -> merge sort instance */
2825   std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;
2826 
2827  public:
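  // Return the external merge-sort instance for the given index, lazily
  // creating it on first use with the session's merge_buf_size,
  // merge_combine_read_size and merge_tmp_file_removal_delay_ms settings.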
2828   int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf,
2829                     Rdb_index_merge **key_merge) {
2830     int res;
2831     auto it = m_key_merge.find(kd_gl_id);
2832     if (it == m_key_merge.end()) {
2833       m_key_merge.emplace(
2834           std::piecewise_construct, std::make_tuple(kd_gl_id),
2835           std::make_tuple(
2836               get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size),
2837               THDVAR(get_thd(), merge_combine_read_size),
2838               THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf));
2839       it = m_key_merge.find(kd_gl_id);
2840       if ((res = it->second.init()) != 0) {
2841         return res;
2842       }
2843     }
2844     *key_merge = &it->second;
2845     return HA_EXIT_SUCCESS;
2846   }
2847 
2848   /* Finish bulk loading for all table handlers belonging to one connection */
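  /*
    This proceeds in phases (see the inline comments below): PREPARE finishes
    every in-flight Rdb_sst_info, MERGING flushes the index_merge sort buffers
    into SST files, INGEST hands all SST files to RocksDB grouped by column
    family, and COMMIT marks the files as completed so they are not deleted.
  */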
2849   int finish_bulk_load(bool *is_critical_error = nullptr,
2850                        int print_client_error = true) {
2851     Ensure_cleanup cleanup([&]() {
2852       // Always clear everything regardless of success/failure
2853       m_curr_bulk_load.clear();
2854       m_curr_bulk_load_tablename.clear();
2855       m_key_merge.clear();
2856     });
2857 
2858     int rc = 0;
2859     if (is_critical_error) {
2860       *is_critical_error = true;
2861     }
2862 
2863     // PREPARE phase: finish all on-going bulk loading Rdb_sst_info and
2864     // collect all Rdb_sst_commit_info containing (SST files, cf)
2865     int rc2 = 0;
2866     std::vector<Rdb_sst_info::Rdb_sst_commit_info> sst_commit_list;
2867     sst_commit_list.reserve(m_curr_bulk_load.size());
2868 
2869     for (auto &sst_info : m_curr_bulk_load) {
2870       Rdb_sst_info::Rdb_sst_commit_info commit_info;
2871 
2872       // Commit the list of SST files and move it to the end of
2873       // sst_commit_list, effectively transferring ownership over it.
2874       rc2 = sst_info->finish(&commit_info, print_client_error);
2875       if (rc2 && rc == 0) {
2876         // Don't return yet - make sure we finish all the SST infos
2877         rc = rc2;
2878       }
2879 
2880       // Make sure we have work to do - we might be losing the race
2881       if (rc2 == 0 && commit_info.has_work()) {
2882         sst_commit_list.emplace_back(std::move(commit_info));
2883         assert(!commit_info.has_work());
2884       }
2885     }
2886 
2887     if (rc) {
2888       return rc;
2889     }
2890 
2891     // MERGING Phase: Flush the index_merge sort buffers into SST files in
2892     // Rdb_sst_info and collect all Rdb_sst_commit_info containing
2893     // (SST files, cf)
2894     if (!m_key_merge.empty()) {
2895       Ensure_cleanup malloc_cleanup([]() {
2896         /*
2897           Explicitly tell jemalloc to clean up any unused dirty pages at this
2898           point.
2899           See https://reviews.facebook.net/D63723 for more details.
2900         */
2901         purge_all_jemalloc_arenas();
2902       });
2903 
2904       rocksdb::Slice merge_key;
2905       rocksdb::Slice merge_val;
2906       for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
2907         GL_INDEX_ID index_id = it->first;
2908         std::shared_ptr<const Rdb_key_def> keydef =
2909             ddl_manager.safe_find(index_id);
2910         std::string table_name = ddl_manager.safe_get_table_name(index_id);
2911 
2912         // Unable to find key definition or table name since the
2913         // table could have been dropped.
2914         // TODO(herman): there is a race here between dropping the table
2915         // and detecting a drop here. If the table is dropped while bulk
2916         // loading is finishing, these keys being added here may
2917         // be missed by the compaction filter and not be marked for
2918         // removal. It is unclear how to lock the sql table from the storage
2919         // engine to prevent modifications to it while bulk load is occurring.
2920         if (keydef == nullptr) {
2921           if (is_critical_error) {
2922             // We used to set an error here, but it was simply ignored.
2923             // This keeps the current behavior; we should revisit this later.
2924             *is_critical_error = false;
2925           }
2926           return HA_ERR_KEY_NOT_FOUND;
2927         } else if (table_name.empty()) {
2928           if (is_critical_error) {
2929             // We used to set an error here, but it was simply ignored.
2930             // This keeps the current behavior; we should revisit this later.
2931             *is_critical_error = false;
2932           }
2933           return HA_ERR_NO_SUCH_TABLE;
2934         }
2935         const std::string &index_name = keydef->get_name();
2936         Rdb_index_merge &rdb_merge = it->second;
2937 
2938         // Rdb_sst_info expects a denormalized table name in the form of
2939         // "./database/table"
2940         std::replace(table_name.begin(), table_name.end(), '.', '/');
2941         table_name = "./" + table_name;
2942         auto sst_info = std::make_shared<Rdb_sst_info>(
2943             rdb, table_name, index_name, rdb_merge.get_cf(),
2944             *rocksdb_db_options, THDVAR(get_thd(), trace_sst_api));
2945 
2946         while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
2947           if ((rc2 = sst_info->put(merge_key, merge_val)) != 0) {
2948             rc = rc2;
2949 
2950             // Don't return yet - make sure we finish the sst_info
2951             break;
2952           }
2953         }
2954 
2955         // -1 => no more items; record any other error from next() unless one was already set
2956         if (rc2 != -1 && rc == 0) {
2957           rc = rc2;
2958         }
2959 
2960         Rdb_sst_info::Rdb_sst_commit_info commit_info;
2961         rc2 = sst_info->finish(&commit_info, print_client_error);
2962         if (rc2 != 0 && rc == 0) {
2963           // Only set the error from sst_info->finish if finish failed and we
2964           // didn't fail before. In other words, we don't have finish's
2965           // success mask earlier failures
2966           rc = rc2;
2967         }
2968 
2969         if (rc) {
2970           return rc;
2971         }
2972 
2973         if (commit_info.has_work()) {
2974           sst_commit_list.emplace_back(std::move(commit_info));
2975           assert(!commit_info.has_work());
2976         }
2977       }
2978     }
2979 
2980     // Early return in case we lost the race completely and end up with no
2981     // work at all
2982     if (sst_commit_list.size() == 0) {
2983       return rc;
2984     }
2985 
2986     // INGEST phase: Group all Rdb_sst_commit_info by cf (as they might
2987     // have the same cf across different indexes) and call out to RocksDB
2988     // to ingest all SST files in one atomic operation
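    // move_files avoids copying the SST files into the DB; global seqno
    // assignment and blocking flushes are disallowed because bulk-loaded
    // key ranges are expected not to overlap existing data.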
2989     rocksdb::IngestExternalFileOptions options;
2990     options.move_files = true;
2991     options.snapshot_consistency = false;
2992     options.allow_global_seqno = false;
2993     options.allow_blocking_flush = false;
2994 
2995     std::map<rocksdb::ColumnFamilyHandle *, rocksdb::IngestExternalFileArg>
2996         arg_map;
2997 
2998     // Group by column_family
2999     for (auto &commit_info : sst_commit_list) {
3000       if (arg_map.find(commit_info.get_cf()) == arg_map.end()) {
3001         rocksdb::IngestExternalFileArg arg;
3002         arg.column_family = commit_info.get_cf(),
3003         arg.external_files = commit_info.get_committed_files(),
3004         arg.options = options;
3005 
3006         arg_map.emplace(commit_info.get_cf(), arg);
3007       } else {
3008         auto &files = arg_map[commit_info.get_cf()].external_files;
3009         files.insert(files.end(), commit_info.get_committed_files().begin(),
3010                      commit_info.get_committed_files().end());
3011       }
3012     }
3013 
3014     std::vector<rocksdb::IngestExternalFileArg> args;
3015     size_t file_count = 0;
3016     for (auto &cf_files_pair : arg_map) {
3017       args.push_back(cf_files_pair.second);
3018       file_count += cf_files_pair.second.external_files.size();
3019     }
3020 
3021     const rocksdb::Status s = rdb->IngestExternalFiles(args);
3022     if (THDVAR(m_thd, trace_sst_api)) {
3023       // NO_LINT_DEBUG
3024       sql_print_information(
3025           "SST Tracing: IngestExternalFile '%zu' files returned %s", file_count,
3026           s.ok() ? "ok" : "not ok");
3027     }
3028 
3029     if (!s.ok()) {
3030       if (print_client_error) {
3031         Rdb_sst_info::report_error_msg(s, nullptr);
3032       }
3033       return HA_ERR_ROCKSDB_BULK_LOAD;
3034     }
3035 
3036     // COMMIT phase: mark everything as completed. This avoids SST file
3037     // deletion kicking in. Otherwise SST files would get deleted if this
3038     // entire operation is aborted
3039     for (auto &commit_info : sst_commit_list) {
3040       commit_info.commit();
3041     }
3042 
3043     return rc;
3044   }
3045 
3046   int start_bulk_load(ha_rocksdb *const bulk_load,
3047                       std::shared_ptr<Rdb_sst_info> sst_info) {
3048     /*
3049      If we already have an open bulk load of a table and the name doesn't
3050      match the current one, close out the currently running one.  This allows
3051      multiple bulk loads to occur on a partitioned table, but then closes
3052      them all out when we switch to another table.
3053     */
3054     assert(bulk_load != nullptr);
3055 
3056     if (!m_curr_bulk_load.empty() &&
3057         bulk_load->get_table_basename() != m_curr_bulk_load_tablename) {
3058       const auto res = finish_bulk_load();
3059       if (res != HA_EXIT_SUCCESS) {
3060         return res;
3061       }
3062     }
3063 
3064     /*
3065      This used to track ha_rocksdb handler objects, but those can be
3066      freed by the table cache while this was referencing them. Instead
3067      of tracking ha_rocksdb handler objects, this now tracks the
3068      Rdb_sst_info allocated, and the ha_rocksdb handler and the
3069      Rdb_transaction both hold shared pointers to them.
3070 
3071      On transaction complete, it will commit each Rdb_sst_info structure found.
3072      If the ha_rocksdb object is freed, etc., it will also commit
3073      the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent.
3074     */
3075     m_curr_bulk_load.push_back(sst_info);
3076     m_curr_bulk_load_tablename = bulk_load->get_table_basename();
3077     return HA_EXIT_SUCCESS;
3078   }
3079 
3080   int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
3081 
3082   const char *get_rocksdb_tmpdir() const {
3083     const char *tmp_dir = THDVAR(get_thd(), tmpdir);
3084 
3085     /*
3086       We want to treat an empty string as nullptr; in that case DDL operations
3087       will use the default --tmpdir passed to mysql instead.
3088     */
3089     if (tmp_dir != nullptr && *tmp_dir == '\0') {
3090       tmp_dir = nullptr;
3091     }
3092     return (tmp_dir);
3093   }
3094 
3095   /*
3096     Flush the data accumulated so far. This assumes we're doing a bulk insert.
3097 
3098     @detail
3099       This should work like transaction commit, except that we don't
3100       synchronize with the binlog (there is no API that would allow the
3101       binlog to flush the changes accumulated so far and return its current
3102       position)
3103 
3104     @todo
3105       Add test coverage for what happens when somebody attempts to do bulk
3106       inserts while inside a multi-statement transaction.
3107   */
3108   bool flush_batch() {
3109     if (get_write_count() == 0) return false;
3110 
3111     /* Commit the current transaction */
3112     if (commit_no_binlog()) return true;
3113 
3114     /* Start another one */
3115     start_tx();
3116     return false;
3117   }
3118 
3119   void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) {
3120     m_auto_incr_map[gl_index_id] =
3121         std::max(m_auto_incr_map[gl_index_id], curr_id);
3122   }
3123 
3124 #ifndef NDEBUG
3125   ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) {
3126     if (m_auto_incr_map.count(gl_index_id) > 0) {
3127       return m_auto_incr_map[gl_index_id];
3128     }
3129     return 0;
3130   }
3131 #endif
3132 
3133   virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
3134                               const rocksdb::Slice &key,
3135                               const rocksdb::Slice &value,
3136                               const bool assume_tracked) = 0;
3137   virtual rocksdb::Status delete_key(
3138       rocksdb::ColumnFamilyHandle *const column_family,
3139       const rocksdb::Slice &key, const bool assume_tracked) = 0;
3140   virtual rocksdb::Status single_delete(
3141       rocksdb::ColumnFamilyHandle *const column_family,
3142       const rocksdb::Slice &key, const bool assume_tracked) = 0;
3143 
3144   virtual bool has_modifications() const = 0;
3145 
3146   virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
3147   /*
3148     Return a WriteBatch that one can write to. The writes will skip any
3149     transaction locking. The writes will NOT be visible to the transaction.
3150   */
3151   rocksdb::WriteBatchBase *get_blind_write_batch() {
3152     return get_indexed_write_batch()->GetWriteBatch();
3153   }
3154 
3155   virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
3156                               const rocksdb::Slice &key,
3157                               rocksdb::PinnableSlice *const value) const = 0;
3158   virtual rocksdb::Status get_for_update(const Rdb_key_def &key_descr,
3159                                          const rocksdb::Slice &key,
3160                                          rocksdb::PinnableSlice *const value,
3161                                          bool exclusive,
3162                                          const bool do_validate) = 0;
3163 
3164   rocksdb::Iterator *get_iterator(
3165       rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
3166       bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound,
3167       const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
3168       bool create_snapshot = true) {
3169     // Make sure we are not doing both read_current (which implies we don't
3170     // want a snapshot) and create_snapshot which makes sure we create
3171     // a snapshot
3172     assert(column_family != nullptr);
3173     assert(!read_current || !create_snapshot);
3174 
3175     if (create_snapshot) acquire_snapshot(true);
3176 
3177     rocksdb::ReadOptions options = m_read_opts;
3178 
3179     if (skip_bloom_filter) {
3180       const bool enable_iterate_bounds =
3181           THDVAR(get_thd(), enable_iterate_bounds);
3182       options.total_order_seek = true;
3183       options.iterate_lower_bound =
3184           enable_iterate_bounds ? &eq_cond_lower_bound : nullptr;
3185       options.iterate_upper_bound =
3186           enable_iterate_bounds ? &eq_cond_upper_bound : nullptr;
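      // Note: ReadOptions only stores pointers to the bound slices, so the
      // caller must keep eq_cond_lower_bound/eq_cond_upper_bound alive for
      // the lifetime of the returned iterator.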
3187     } else {
3188       // With this option, Iterator::Valid() returns false if key
3189       // is outside of the prefix bloom filter range set at Seek().
3190       // Must not be set to true if not using bloom filter.
3191       options.prefix_same_as_start = true;
3192     }
3193     options.fill_cache = fill_cache;
3194     if (read_current) {
3195       options.snapshot = nullptr;
3196     }
3197     return get_iterator(options, column_family);
3198   }
3199 
3200   virtual bool is_tx_started() const = 0;
3201   virtual void start_tx() = 0;
3202   virtual void start_stmt() = 0;
3203   virtual void set_name() = 0;
3204 
3205  protected:
3206   // Non-virtual functions with actions to be done on transaction start and
3207   // commit.
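  // on_commit() stamps every table modified in this transaction with the
  // commit time so that its update_time metadata stays accurate;
  // on_rollback() simply forgets the modified-table set.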
3208   void on_commit() {
3209     time_t tm;
3210     tm = time(nullptr);
3211     for (auto &it : modified_tables) {
3212       it->m_update_time = tm;
3213     }
3214     modified_tables.clear();
3215   }
3216   void on_rollback() {
3217     modified_tables.clear();
3218   }
3219  public:
3220   void log_table_write_op(Rdb_tbl_def *tbl) {
3221     modified_tables.insert(tbl);
3222   }
3223 
3224   void set_initial_savepoint() {
3225     /*
3226       Set the initial savepoint. If the first statement in the transaction
3227       fails, we need something to roll back to, without rolling back the
3228       entire transaction.
3229     */
3230     do_set_savepoint();
3231     m_writes_at_last_savepoint = m_write_count;
3232   }
3233 
3234   /*
3235     Called when a "top-level" statement inside a transaction completes
3236     successfully and its changes become part of the transaction's changes.
3237   */
3238   int make_stmt_savepoint_permanent() {
3239     // Take another RocksDB savepoint only if we had changes since the last
3240     // one. This is very important for long transactions doing lots of
3241     // SELECTs.
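    // Pop every outstanding savepoint first (do_pop_savepoint() returns
    // NotFound once the stack is empty), then set a single fresh one below.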
3242     if (m_writes_at_last_savepoint != m_write_count) {
3243       rocksdb::Status status = rocksdb::Status::NotFound();
3244       while ((status = do_pop_savepoint()) == rocksdb::Status::OK()) {
3245       }
3246 
3247       if (status != rocksdb::Status::NotFound()) {
3248         return HA_EXIT_FAILURE;
3249       }
3250 
3251       do_set_savepoint();
3252       m_writes_at_last_savepoint = m_write_count;
3253     }
3254 
3255     return HA_EXIT_SUCCESS;
3256   }
3257 
3258   /*
3259     Rollback to the savepoint we've set before the last statement
3260   */
3261   void rollback_to_stmt_savepoint() {
3262     if (m_writes_at_last_savepoint != m_write_count) {
3263       do_rollback_to_savepoint();
3264       /*
3265         RollbackToSavePoint "removes the most recent SetSavePoint()", so
3266         we need to set it again so that next statement can roll back to this
3267         stage.
3268         It's ok to do it here at statement end (instead of doing it at next
3269         statement start) because setting a savepoint is cheap.
3270       */
3271       do_set_savepoint();
3272       m_write_count = m_writes_at_last_savepoint;
3273     }
3274   }
3275 
3276   virtual void rollback_stmt() = 0;
3277 
3278   void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
3279 
3280   bool can_prepare() const {
3281     if (m_rollback_only) {
3282       my_error(ER_ROLLBACK_ONLY, MYF(0));
3283       return false;
3284     }
3285     return true;
3286   }
3287 
3288   int rollback_to_savepoint(void *const savepoint) {
3289     if (has_modifications()) {
3290       my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0));
3291       m_rollback_only = true;
3292       return HA_EXIT_FAILURE;
3293     }
3294     return HA_EXIT_SUCCESS;
3295   }
3296 
3297   /*
3298     This is used by transactions started with "START TRANSACTION WITH "
3299     "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
3300     snapshot has to be created via DB::GetSnapshot(), not via Transaction
3301     API.
3302   */
3303   bool is_tx_read_only() const { return m_tx_read_only; }
3304 
3305   void set_tx_read_only(bool val) { m_tx_read_only = val; }
3306 
3307   explicit Rdb_transaction(THD *const thd)
3308       : m_thd(thd), m_tbl_io_perf(nullptr) {
3309     RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
3310     s_tx_list.insert(this);
3311     RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
3312   }
3313 
3314   virtual ~Rdb_transaction() {
3315     RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
3316     s_tx_list.erase(this);
3317     RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
3318   }
3319 };
3320 
3321 #ifndef NDEBUG
3322 // simulate that RocksDB has reported corrupted data
3323 static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
3324   *status = rocksdb::Status::Corruption();
3325 }
3326 static void dbug_change_status_to_io_error(rocksdb::Status *status) {
3327   *status = rocksdb::Status::IOError();
3328 }
3329 static void dbug_change_status_to_incomplete(rocksdb::Status *status) {
3330   *status = rocksdb::Status::Incomplete();
3331 }
3332 #endif
3333 
3334 /*
3335   This is a rocksdb transaction. Its members represent the current transaction,
3336   which consists of:
3337   - the snapshot
3338   - the changes we've made but are not seeing yet.
3339 
3340   The changes are made to individual tables, which store them here and then
3341   this object commits them on commit.
3342 */
3343 class Rdb_transaction_impl : public Rdb_transaction {
3344   rocksdb::Transaction *m_rocksdb_tx = nullptr;
3345   rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;
3346 
3347  public:
3348   void set_lock_timeout(int timeout_sec_arg) override {
3349     if (m_rocksdb_tx) {
3350       m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
3351     }
3352   }
3353 
3354   void set_sync(bool sync) override {
3355     m_rocksdb_tx->GetWriteOptions()->sync = sync;
3356   }
3357 
3358   void release_lock(const Rdb_key_def &key_descr,
3359                     const std::string &rowkey) override {
3360     if (!THDVAR(m_thd, lock_scanned_rows)) {
3361       m_rocksdb_tx->UndoGetForUpdate(key_descr.get_cf(),
3362                                      rocksdb::Slice(rowkey));
3363       // m_row_lock_count tracks rows (i.e. primary keys), not individual keys
3364       assert(!key_descr.is_primary_key() ||
3365              (key_descr.is_primary_key() && m_row_lock_count > 0));
3366       // m_row_lock_count tracks per row data instead of per key data
3367       if (key_descr.is_primary_key() && m_row_lock_count > 0) {
3368         m_row_lock_count--;
3369       }
3370     }
3371   }
3372 
3373   virtual bool is_writebatch_trx() const override { return false; }
3374 
3375  private:
3376   void release_tx(void) {
3377     // We are done with the current active transaction object.  Preserve it
3378     // for later reuse.
3379     assert(m_rocksdb_reuse_tx == nullptr);
3380     m_rocksdb_reuse_tx = m_rocksdb_tx;
3381     m_rocksdb_tx = nullptr;
3382   }
3383 
3384   bool prepare() override {
3385     rocksdb::Status s;
3386 
3387     s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
3388 #ifndef NDEBUG
3389     DBUG_EXECUTE_IF("myrocks_prepare_io_error",
3390                     dbug_change_status_to_io_error(&s););
3391     DBUG_EXECUTE_IF("myrocks_prepare_incomplete",
3392                     dbug_change_status_to_incomplete(&s););
3393 #endif
3394     if (!s.ok()) {
3395       std::string msg =
3396           "RocksDB error on COMMIT (Prepare/merge): " + s.ToString();
3397       my_error(ER_INTERNAL_ERROR, MYF(0), msg.c_str());
3398       return false;
3399     }
3400 
3401     s = m_rocksdb_tx->Prepare();
3402     if (!s.ok()) {
3403       std::string msg = "RocksDB error on COMMIT (Prepare): " + s.ToString();
3404       my_error(ER_INTERNAL_ERROR, MYF(0), msg.c_str());
3405       return false;
3406     }
3407     return true;
3408   }
3409 
3410   bool commit_no_binlog() override {
3411     bool res = false;
3412     rocksdb::Status s;
3413 
3414     s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
3415 #ifndef NDEBUG
3416     DBUG_EXECUTE_IF("myrocks_commit_merge_io_error",
3417                     dbug_change_status_to_io_error(&s););
3418     DBUG_EXECUTE_IF("myrocks_commit_merge_incomplete",
3419                     dbug_change_status_to_incomplete(&s););
3420 #endif
3421     if (!s.ok()) {
3422       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
3423       res = true;
3424       goto error;
3425     }
3426 
3427     release_snapshot();
3428     s = m_rocksdb_tx->Commit();
3429 #ifndef NDEBUG
3430     DBUG_EXECUTE_IF("myrocks_commit_io_error",
3431                     dbug_change_status_to_io_error(&s););
3432     DBUG_EXECUTE_IF("myrocks_commit_incomplete",
3433                     dbug_change_status_to_incomplete(&s););
3434 #endif
3435     if (!s.ok()) {
3436       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
3437       res = true;
3438       goto error;
3439     }
3440 
3441     on_commit();
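    // Intentional fall-through into the error label: on success,
    // on_rollback() below only clears modified_tables, which on_commit()
    // has already emptied.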
3442   error:
3443     on_rollback();
3444     /* Save the transaction object to be reused */
3445     release_tx();
3446 
3447     m_write_count = 0;
3448     m_row_lock_count = 0;
3449     set_tx_read_only(false);
3450     m_rollback_only = false;
3451     return res;
3452   }
3453 
3454  public:
3455   void rollback() override {
3456     on_rollback();
3457     m_write_count = 0;
3458     m_row_lock_count = 0;
3459     m_auto_incr_map.clear();
3460     m_ddl_transaction = false;
3461     if (m_rocksdb_tx) {
3462       release_snapshot();
3463       /* This will also release all of the locks: */
3464       m_rocksdb_tx->Rollback();
3465 
3466       /* Save the transaction object to be reused */
3467       release_tx();
3468 
3469       set_tx_read_only(false);
3470       m_rollback_only = false;
3471     }
3472   }
3473 
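  // Acquire a read snapshot if one is not already set: read-only transactions
  // take a plain DB snapshot, while read-write transactions either take the
  // transaction snapshot now or defer it to the next operation via the
  // notifier.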
3474   void acquire_snapshot(bool acquire_now) override {
3475     if (m_read_opts.snapshot == nullptr) {
3476       if (is_tx_read_only()) {
3477         snapshot_created(rdb->GetSnapshot());
3478       } else if (acquire_now) {
3479         m_rocksdb_tx->SetSnapshot();
3480         snapshot_created(m_rocksdb_tx->GetSnapshot());
3481       } else if (!m_is_delayed_snapshot) {
3482         m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
3483         m_is_delayed_snapshot = true;
3484       }
3485     }
3486   }
3487 
3488   void release_snapshot() override {
3489     bool need_clear = m_is_delayed_snapshot;
3490 
3491     if (m_read_opts.snapshot != nullptr) {
3492       m_snapshot_timestamp = 0;
3493       if (is_tx_read_only()) {
3494         rdb->ReleaseSnapshot(m_read_opts.snapshot);
3495         need_clear = false;
3496       } else {
3497         need_clear = true;
3498       }
3499       m_read_opts.snapshot = nullptr;
3500     }
3501 
3502     if (need_clear && m_rocksdb_tx != nullptr) m_rocksdb_tx->ClearSnapshot();
3503   }
3504 
3505   bool has_snapshot() { return m_read_opts.snapshot != nullptr; }
3506 
3507   rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
3508                       const rocksdb::Slice &key, const rocksdb::Slice &value,
3509                       const bool assume_tracked) override {
3510     ++m_write_count;
3511     return m_rocksdb_tx->Put(column_family, key, value, assume_tracked);
3512   }
3513 
3514   rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
3515                              const rocksdb::Slice &key,
3516                              const bool assume_tracked) override {
3517     ++m_write_count;
3518     return m_rocksdb_tx->Delete(column_family, key, assume_tracked);
3519   }
3520 
3521   rocksdb::Status single_delete(
3522       rocksdb::ColumnFamilyHandle *const column_family,
3523       const rocksdb::Slice &key, const bool assume_tracked) override {
3524     ++m_write_count;
3525     return m_rocksdb_tx->SingleDelete(column_family, key, assume_tracked);
3526   }
3527 
3528   bool has_modifications() const override {
3529     return m_rocksdb_tx->GetWriteBatch() &&
3530            m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
3531            m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
3532   }
3533 
3534   rocksdb::WriteBatchBase *get_write_batch() override {
3535     return m_rocksdb_tx->GetCommitTimeWriteBatch();
3536   }
3537 
3538   /*
3539     Return a WriteBatch that one can write to. The writes will skip any
3540     transaction locking. The writes WILL be visible to the transaction.
3541   */
3542   rocksdb::WriteBatchBase *get_indexed_write_batch() override {
3543     ++m_write_count;
3544     return m_rocksdb_tx->GetWriteBatch();
3545   }
3546 
3547   rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
3548                       const rocksdb::Slice &key,
3549                       rocksdb::PinnableSlice *const value) const override {
3550     // Clean the PinnableSlice right before Get() to support multiple gets
3551     // per statement; the resources held after the last Get in a statement
3552     // are cleared in the handler::reset call.
3553     value->Reset();
3554     global_stats.queries[QUERIES_POINT].inc();
3555     return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
3556   }
3557 
3558   rocksdb::Status get_for_update(const Rdb_key_def &key_descr,
3559                                  const rocksdb::Slice &key,
3560                                  rocksdb::PinnableSlice *const value,
3561                                  bool exclusive,
3562                                  const bool do_validate) override {
3563     rocksdb::ColumnFamilyHandle *const column_family = key_descr.get_cf();
3564     /* check row lock limit in a trx */
3565     if (get_row_lock_count() >= get_max_row_lock_count()) {
3566       return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
3567     }
3568 
3569     if (value != nullptr) {
3570       value->Reset();
3571     }
3572     rocksdb::Status s;
3573     // If snapshot is null, pass it to GetForUpdate and snapshot is
3574     // initialized there. Snapshot validation is skipped in that case.
3575     if (m_read_opts.snapshot == nullptr || do_validate) {
3576       s = m_rocksdb_tx->GetForUpdate(
3577           m_read_opts, column_family, key, value, exclusive,
3578           m_read_opts.snapshot ? do_validate : false);
3579     } else {
3580       // If snapshot is set, and if skipping validation,
3581       // call GetForUpdate without validation and set back old snapshot
3582       auto saved_snapshot = m_read_opts.snapshot;
3583       m_read_opts.snapshot = nullptr;
3584       s = m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
3585                                      exclusive, false);
3586       m_read_opts.snapshot = saved_snapshot;
3587     }
3588     // row_lock_count is to track per row instead of per key
3589     if (key_descr.is_primary_key()) incr_row_lock_count();
3590     return s;
3591   }
3592 
3593   rocksdb::Iterator *get_iterator(
3594       const rocksdb::ReadOptions &options,
3595       rocksdb::ColumnFamilyHandle *const column_family) override {
3596     global_stats.queries[QUERIES_RANGE].inc();
3597     return m_rocksdb_tx->GetIterator(options, column_family);
3598   }
3599 
3600   const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
3601 
3602   bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }
3603 
3604   void start_tx() override {
3605     rocksdb::TransactionOptions tx_opts;
3606     rocksdb::WriteOptions write_opts;
3607     tx_opts.set_snapshot = false;
3608     tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
3609     tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
3610     tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
3611     // If this variable is set, this will write commit time write batch
3612     // information on recovery or memtable flush.
3613     tx_opts.use_only_the_last_commit_time_batch_for_recovery =
3614         THDVAR(m_thd, commit_time_batch_for_recovery);
3615     tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);
3616     tx_opts.write_batch_flush_threshold =
3617         THDVAR(m_thd, write_batch_flush_threshold);
3618 
3619     write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3620     write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
3621     write_opts.ignore_missing_column_families =
3622         THDVAR(m_thd, write_ignore_missing_column_families);
3623 
3624     /*
3625       If m_rocksdb_reuse_tx is null this will create a new transaction object.
3626       Otherwise it will reuse the existing one.
3627     */
3628     m_rocksdb_tx =
3629         rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
3630     m_rocksdb_reuse_tx = nullptr;
3631 
3632     m_read_opts = rocksdb::ReadOptions();
3633 
3634     set_initial_savepoint();
3635 
3636     m_ddl_transaction = false;
3637   }
3638 
3639   void set_name() override {
3640     XID xid;
3641     thd_get_xid(m_thd, reinterpret_cast<MYSQL_XID *>(&xid));
3642     auto name = m_rocksdb_tx->GetName();
3643     if (!name.empty()) {
3644       assert(name == rdb_xid_to_string(xid));
3645       return;
3646     }
3647     rocksdb::Status s = m_rocksdb_tx->SetName(rdb_xid_to_string(xid));
3648     assert(s.ok());
3649     if (!s.ok()) {
3650       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
3651     }
3652   }
3653 
3654   /* Implementations of do_*savepoint based on rocksdb::Transaction savepoints
3655    */
3656   void do_set_savepoint() override { m_rocksdb_tx->SetSavePoint(); }
3657   rocksdb::Status do_pop_savepoint() override {
3658     return m_rocksdb_tx->PopSavePoint();
3659   }
3660 
3661   void do_rollback_to_savepoint() override {
3662     m_rocksdb_tx->RollbackToSavePoint();
3663   }
3664 
3665   /*
3666     Start a statement inside a multi-statement transaction.
3667 
3668     @todo: are we sure this is called once (and not several times) per
3669     statement start?
3670 
3671     For hooking to start of statement that is its own transaction, see
3672     ha_rocksdb::external_lock().
3673   */
3674   void start_stmt() override {
3675     // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
3676     acquire_snapshot(false);
3677   }
3678 
3679   /*
3680     This must be called when last statement is rolled back, but the transaction
3681     continues
3682   */
3683   void rollback_stmt() override {
3684     /* TODO: here we must release the locks taken since the start_stmt() call */
3685     if (m_rocksdb_tx) {
3686       const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
3687       rollback_to_stmt_savepoint();
3688 
3689       const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
3690       if (org_snapshot != cur_snapshot) {
3691         if (org_snapshot != nullptr) m_snapshot_timestamp = 0;
3692 
3693         m_read_opts.snapshot = cur_snapshot;
3694         if (cur_snapshot != nullptr) {
3695           rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
3696         } else {
3697           m_is_delayed_snapshot = true;
3698         }
3699       }
3700     }
3701   }
3702 
3703   explicit Rdb_transaction_impl(THD *const thd)
3704       : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
3705     // Create a notifier that can be called when a snapshot gets generated.
3706     m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
3707   }
3708 
3709   virtual ~Rdb_transaction_impl() override {
3710     rollback();
3711 
3712     // Theoretically the notifier could outlive the Rdb_transaction_impl
3713     // (because of the shared_ptr), so let it know it can't reference
3714     // the transaction anymore.
3715     m_notifier->detach();
3716 
3717     // Free any transaction memory that is still hanging around.
3718     delete m_rocksdb_reuse_tx;
3719     assert(m_rocksdb_tx == nullptr);
3720   }
3721 };
3722 
3723 /* This is a rocksdb write batch. This class doesn't hold or wait on any
3724    transaction locks (skips rocksdb transaction API) thus giving better
3725    performance.
3726 
3727    Currently this is only used for replication threads, which are guaranteed
3728    to be non-conflicting. Any further usage of this class should be thought
3729    through thoroughly.
3730 */
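// Note: get_or_create_tx() below selects this class instead of
// Rdb_transaction_impl only when rpl_skip_tx_api / master_skip_tx_api is
// enabled for the session; regular connections use the transaction API.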
3731 class Rdb_writebatch_impl : public Rdb_transaction {
3732   rocksdb::WriteBatchWithIndex *m_batch;
3733   rocksdb::WriteOptions write_opts;
3734   // Called after commit/rollback.
3735   void reset() {
3736     m_batch->Clear();
3737     m_read_opts = rocksdb::ReadOptions();
3738     m_ddl_transaction = false;
3739   }
3740 
3741  private:
3742   bool prepare() override { return true; }
3743 
3744   bool commit_no_binlog() override {
3745     bool res = false;
3746     rocksdb::Status s;
3747     rocksdb::TransactionDBWriteOptimizations optimize;
3748     optimize.skip_concurrency_control = true;
3749 
3750     s = merge_auto_incr_map(m_batch->GetWriteBatch());
3751     if (!s.ok()) {
3752       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
3753       res = true;
3754       goto error;
3755     }
3756 
3757     release_snapshot();
3758 
3759     s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch());
3760     if (!s.ok()) {
3761       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
3762       res = true;
3763       goto error;
3764     }
3765     on_commit();
3766   error:
3767     on_rollback();
3768     reset();
3769 
3770     m_write_count = 0;
3771     set_tx_read_only(false);
3772     m_rollback_only = false;
3773     return res;
3774   }
3775 
3776   /* Implementations of do_*savepoint based on rocksdb::WriteBatch savepoints */
3777   void do_set_savepoint() override { m_batch->SetSavePoint(); }
3778   rocksdb::Status do_pop_savepoint() override {
3779     return m_batch->PopSavePoint();
3780   }
3781 
3782   void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); }
3783 
3784  public:
3785   bool is_writebatch_trx() const override { return true; }
3786 
3787   void set_lock_timeout(int timeout_sec_arg) override {
3788     // Nothing to do here.
3789   }
3790 
3791   void set_sync(bool sync) override { write_opts.sync = sync; }
3792 
3793   void release_lock(const Rdb_key_def &key_descr,
3794                     const std::string &rowkey) override {
3795     // Nothing to do here since we don't hold any row locks.
3796   }
3797 
3798   void rollback() override {
3799     on_rollback();
3800     m_write_count = 0;
3801     m_row_lock_count = 0;
3802     release_snapshot();
3803 
3804     reset();
3805     set_tx_read_only(false);
3806     m_rollback_only = false;
3807   }
3808 
3809   void acquire_snapshot(bool acquire_now) override {
3810     if (m_read_opts.snapshot == nullptr) snapshot_created(rdb->GetSnapshot());
3811   }
3812 
3813   void release_snapshot() override {
3814     if (m_read_opts.snapshot != nullptr) {
3815       rdb->ReleaseSnapshot(m_read_opts.snapshot);
3816       m_read_opts.snapshot = nullptr;
3817     }
3818   }
3819 
3820   rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
3821                       const rocksdb::Slice &key, const rocksdb::Slice &value,
3822                       const bool assume_tracked) override {
3823     ++m_write_count;
3824     m_batch->Put(column_family, key, value);
3825     // Note: Put/Delete on a write batch don't return an error code, so we
3826     // simply return OK here.
3827     return rocksdb::Status::OK();
3828   }
3829 
3830   rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
3831                              const rocksdb::Slice &key,
3832                              const bool assume_tracked) override {
3833     ++m_write_count;
3834     m_batch->Delete(column_family, key);
3835     return rocksdb::Status::OK();
3836   }
3837 
3838   rocksdb::Status single_delete(
3839       rocksdb::ColumnFamilyHandle *const column_family,
3840       const rocksdb::Slice &key, const bool assume_tracked) override {
3841     ++m_write_count;
3842     m_batch->SingleDelete(column_family, key);
3843     return rocksdb::Status::OK();
3844   }
3845 
3846   bool has_modifications() const override {
3847     return m_batch->GetWriteBatch()->Count() > 0;
3848   }
3849 
3850   rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }
3851 
3852   rocksdb::WriteBatchBase *get_indexed_write_batch() override {
3853     ++m_write_count;
3854     return m_batch;
3855   }
3856 
3857   rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
3858                       const rocksdb::Slice &key,
3859                       rocksdb::PinnableSlice *const value) const override {
3860     value->Reset();
3861     return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
3862                                       value);
3863   }
3864 
3865   rocksdb::Status get_for_update(const Rdb_key_def &key_descr,
3866                                  const rocksdb::Slice &key,
3867                                  rocksdb::PinnableSlice *const value,
3868                                  bool /* exclusive */,
3869                                  const bool /* do_validate */) override {
3870     rocksdb::ColumnFamilyHandle *const column_family = key_descr.get_cf();
3871     if (value == nullptr) {
3872       rocksdb::PinnableSlice pin_val;
3873       rocksdb::Status s = get(column_family, key, &pin_val);
3874       pin_val.Reset();
3875       return s;
3876     }
3877 
3878     return get(column_family, key, value);
3879   }
3880 
3881   rocksdb::Iterator *get_iterator(
3882       const rocksdb::ReadOptions &options,
3883       rocksdb::ColumnFamilyHandle *const column_family) override {
3884     const auto it = rdb->NewIterator(options);
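    // NewIteratorWithBase() layers the updates buffered in m_batch on top of
    // the base database iterator, so writes pending in this batch stay visible.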
3885     return m_batch->NewIteratorWithBase(it);
3886   }
3887 
3888   bool is_tx_started() const override { return (m_batch != nullptr); }
3889 
3890   void start_tx() override {
3891     reset();
3892     write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3893     write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
3894     write_opts.ignore_missing_column_families =
3895         THDVAR(m_thd, write_ignore_missing_column_families);
3896 
3897     set_initial_savepoint();
3898   }
3899 
3900   void set_name() override {}
3901 
3902   void start_stmt() override {}
3903 
3904   void rollback_stmt() override {
3905     if (m_batch) rollback_to_stmt_savepoint();
3906   }
3907 
3908   explicit Rdb_writebatch_impl(THD *const thd)
3909       : Rdb_transaction(thd), m_batch(nullptr) {
3910     m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
3911                                                true);
3912   }
3913 
3914   virtual ~Rdb_writebatch_impl() override {
3915     rollback();
3916     delete m_batch;
3917   }
3918 };
3919 
3920 void Rdb_snapshot_notifier::SnapshotCreated(
3921     const rocksdb::Snapshot *const snapshot) {
3922   if (m_owning_tx != nullptr) {
3923     m_owning_tx->snapshot_created(snapshot);
3924   }
3925 }
3926 
3927 std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
3928 mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3929 
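// get_tx_from_thd() returns a reference to this connection's transaction slot
// inside thd_ha_data, so assigning through the returned reference stores the
// transaction object directly in the THD.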
3930 static Rdb_transaction *&get_tx_from_thd(THD *const thd) {
3931   return *reinterpret_cast<Rdb_transaction **>(
3932       my_core::thd_ha_data(thd, rocksdb_hton));
3933 }
3934 
3935 class Rdb_perf_context_guard {
3936   Rdb_io_perf m_io_perf;
3937   Rdb_io_perf *m_io_perf_ptr;
3938   Rdb_transaction *m_tx;
3939   uint m_level;
3940 
3941  public:
3942   Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
3943   Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;
3944 
3945   explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
3946       : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
3947     m_io_perf_ptr->start(m_level);
3948   }
3949 
3950   explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
3951       : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
3952     /*
3953       if perf_context information is already being recorded, this becomes a
3954       no-op
3955     */
3956     if (tx != nullptr) {
3957       tx->io_perf_start(&m_io_perf);
3958     }
3959   }
3960 
3961   ~Rdb_perf_context_guard() {
3962     if (m_tx != nullptr) {
3963       m_tx->io_perf_end_and_record();
3964     } else if (m_io_perf_ptr != nullptr) {
3965       m_io_perf_ptr->end_and_record(m_level);
3966     }
3967   }
3968 };
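// Typical usage: construct an Rdb_perf_context_guard on the stack around an
// operation (see rocksdb_commit() / rocksdb_rollback() below); its destructor
// records the collected perf context counters when the guard goes out of scope.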
3969 
3970 /*
3971   TODO: maybe, call this in external_lock() and store in ha_rocksdb..
3972 */
3973 
3974 static Rdb_transaction *get_or_create_tx(THD *const thd) {
3975   Rdb_transaction *&tx = get_tx_from_thd(thd);
3976   // TODO: this is called too many times.. O(#rows)
3977   if (tx == nullptr) {
3978     if ((rpl_skip_tx_api_var && thd->rli_slave) ||
3979         (THDVAR(thd, master_skip_tx_api) && !thd->rli_slave)) {
3980       tx = new Rdb_writebatch_impl(thd);
3981     } else {
3982       tx = new Rdb_transaction_impl(thd);
3983     }
3984     tx->set_params(THDVAR(thd, lock_wait_timeout), rocksdb_max_row_locks);
3985     tx->start_tx();
3986   } else {
3987     tx->set_params(THDVAR(thd, lock_wait_timeout), rocksdb_max_row_locks);
3988     if (!tx->is_tx_started()) {
3989       tx->start_tx();
3990     }
3991   }
3992 
3993   return tx;
3994 }
3995 
3996 static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
3997   Rdb_transaction *&tx = get_tx_from_thd(thd);
3998   if (tx != nullptr) {
3999     bool is_critical_error;
4000     int rc = tx->finish_bulk_load(&is_critical_error, false);
4001     if (rc != 0 && is_critical_error) {
4002       // NO_LINT_DEBUG
4003       sql_print_error(
4004           "RocksDB: Error %d finalizing last SST file while "
4005           "disconnecting",
4006           rc);
4007     }
4008 
4009     delete tx;
4010     tx = nullptr;
4011   }
4012   return HA_EXIT_SUCCESS;
4013 }
4014 
4015 /**
4016   Called by hton->flush_logs after MySQL group commit prepares a set of
4017   transactions.
4018 */
4019 static bool rocksdb_flush_wal(handlerton *const hton MY_ATTRIBUTE((__unused__)),
4020                               bool binlog_group_flush) {
4021   DBUG_ENTER("rocksdb_flush_wal");
4022   assert(rdb != nullptr);
4023 
4024   /**
4025     If !binlog_group_flush, we got invoked by FLUSH LOGS or similar.
4026     Else, we got invoked by binlog group commit during flush stage.
4027   */
4028 
4029   if (binlog_group_flush &&
4030       rocksdb_flush_log_at_trx_commit == FLUSH_LOG_NEVER) {
4031     /**
4032       rocksdb_flush_log_at_trx_commit=0
4033       (write and sync based on timer in Rdb_background_thread).
4034       Do not flush the redo log during binlog group commit.
4035     */
4036     DBUG_RETURN(false);
4037   }
4038 
4039   if (!binlog_group_flush || !rocksdb_db_options->allow_mmap_writes ||
4040       rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
4041     /**
4042       Sync the WAL if we are in FLUSH LOGS, or if
4043       rocksdb_flush_log_at_trx_commit=1
4044       (write and sync at each commit).
4045     */
4046     rocksdb_wal_group_syncs++;
4047     const rocksdb::Status s =
4048         rdb->FlushWAL(rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
4049 
4050     if (!s.ok()) {
4051       rdb_log_status_error(s);
4052       DBUG_RETURN(true);
4053     }
4054   }
4055 
4056   DBUG_RETURN(false);
4057 }
4058 
4059 /**
4060   For a slave, prepare() updates the slave_gtid_info table which tracks the
4061   replication progress.
4062 */
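// prepare_tx == true means the whole transaction is being prepared (e.g. the
// prepare phase of two-phase commit); otherwise only the current statement is
// ending, and its savepoint is made permanent below.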
4063 static int rocksdb_prepare(handlerton *const hton, THD *const thd,
4064                            bool prepare_tx) {
4065   Rdb_transaction *&tx = get_tx_from_thd(thd);
4066   if (!tx->is_tx_started()) {
4067     // nothing to prepare
4068     return HA_EXIT_SUCCESS;
4069   }
4070   if (!tx->can_prepare()) {
4071     return HA_EXIT_FAILURE;
4072   }
4073   if (prepare_tx ||
4074       (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
4075     if (thd->durability_property == HA_IGNORE_DURABILITY) {
4076       tx->set_sync(false);
4077     }
4078     if (rocksdb_write_policy != rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED) {
4079       tx->set_name();
4080     }
4081     if (!tx->prepare()) {
4082       return HA_EXIT_FAILURE;
4083     }
4084 
4085     DEBUG_SYNC(thd, "rocksdb.prepared");
4086   } else
4087     tx->make_stmt_savepoint_permanent();
4088 
4089   return HA_EXIT_SUCCESS;
4090 }
4091 
4092 /**
4093  Commit a previously prepared transaction, looked up by its XID.
4094  This is needed for two-phase commit and XA recovery to work.
4095 */
4096 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
4097   DBUG_ENTER_FUNC();
4098 
4099   assert(hton != nullptr);
4100   assert(xid != nullptr);
4101   assert(commit_latency_stats != nullptr);
4102 
4103   auto clock = rocksdb::Env::Default()->GetSystemClock().get();
4104   rocksdb::StopWatchNano timer(clock, true);
4105 
4106   const auto name = rdb_xid_to_string(*xid);
4107   assert(!name.empty());
4108 
4109   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4110 
4111   if (trx == nullptr) {
4112     DBUG_RETURN(HA_EXIT_FAILURE);
4113   }
4114 
4115   const rocksdb::Status s = trx->Commit();
4116 
4117   if (!s.ok()) {
4118     rdb_log_status_error(s);
4119     DBUG_RETURN(HA_EXIT_FAILURE);
4120   }
4121 
4122   delete trx;
4123 
4124   // `Add()` is implemented in a thread-safe manner.
4125   commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
4126 
4127   DBUG_RETURN(HA_EXIT_SUCCESS);
4128 }
4129 
4130 static int rocksdb_rollback_by_xid(
4131     handlerton *const hton MY_ATTRIBUTE((__unused__)), XID *const xid) {
4132   DBUG_ENTER_FUNC();
4133 
4134   assert(hton != nullptr);
4135   assert(xid != nullptr);
4136   assert(rdb != nullptr);
4137 
4138   const auto name = rdb_xid_to_string(*xid);
4139 
4140   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4141 
4142   if (trx == nullptr) {
4143     DBUG_RETURN(HA_EXIT_FAILURE);
4144   }
4145 
4146   const rocksdb::Status s = trx->Rollback();
4147 
4148   if (!s.ok()) {
4149     rdb_log_status_error(s);
4150     DBUG_RETURN(HA_EXIT_FAILURE);
4151   }
4152 
4153   delete trx;
4154 
4155   DBUG_RETURN(HA_EXIT_SUCCESS);
4156 }
4157 
4158 /**
4159   Rebuilds an XID from a serialized version stored in a string.
4160 */
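// Serialized layout expected here, mirroring the reads below (a sketch that
// assumes RDB_FORMATID_SZ / RDB_GTRID_SZ / RDB_BQUAL_SZ are 8 / 1 / 1 bytes):
//   [format_id: 8 bytes][gtrid_length: 1 byte][bqual_length: 1 byte]
//   [gtrid bytes][bqual bytes]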
4161 static void rdb_xid_from_string(const std::string &src, XID *const dst) {
4162   assert(dst != nullptr);
4163   uint offset = 0;
4164   uint64 raw_fid8 =
4165       rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
4166   const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
4167   dst->set_format_id(signed_fid8);
4168   offset += RDB_FORMATID_SZ;
4169   dst->set_gtrid_length(src.at(offset));
4170   offset += RDB_GTRID_SZ;
4171   dst->set_bqual_length(src.at(offset));
4172   offset += RDB_BQUAL_SZ;
4173 
4174   assert(dst->get_gtrid_length() >= 0);
4175   assert(dst->get_gtrid_length() <= MAXGTRIDSIZE);
4176   assert(dst->get_bqual_length() >= 0);
4177   assert(dst->get_bqual_length() <= MAXBQUALSIZE);
4178 
4179   const std::string &tmp_data = src.substr(
4180       RDB_XIDHDR_LEN, (dst->get_gtrid_length()) + (dst->get_bqual_length()));
4181   dst->set_data(tmp_data.data(), tmp_data.length());
4182 }
4183 
4184 /**
4185   Reads the last committed binary log info from the RocksDB system row.
4186   The info is needed for crash-safe slave/master to work.
4187 */
4188 static int rocksdb_recover(handlerton *const hton, XID *const xid_list,
4189                            uint len) {
4190   if (len == 0 || xid_list == nullptr) {
4191     return HA_EXIT_SUCCESS;
4192   }
4193 
4194   std::vector<rocksdb::Transaction *> trans_list;
4195   rdb->GetAllPreparedTransactions(&trans_list);
4196 
4197   uint count = 0;
4198   for (auto &trans : trans_list) {
4199     if (count >= len) {
4200       break;
4201     }
4202     auto name = trans->GetName();
4203     rdb_xid_from_string(name, &xid_list[count]);
4204     count++;
4205   }
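  // Return the number of XID slots filled in; the server will later resolve
  // each returned XID via commit_by_xid() or rollback_by_xid().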
4206   return count;
4207 }
4208 
4209 static int rocksdb_commit(handlerton *const hton, THD *const thd,
4210                           bool commit_tx) {
4211   DBUG_ENTER_FUNC();
4212 
4213   assert(hton != nullptr);
4214   assert(thd != nullptr);
4215   assert(commit_latency_stats != nullptr);
4216 
4217   auto clock = rocksdb::Env::Default()->GetSystemClock().get();
4218   rocksdb::StopWatchNano timer(clock, true);
4219 
4220   /* note: h->external_lock(F_UNLCK) is called after this function is called */
4221   Rdb_transaction *&tx = get_tx_from_thd(thd);
4222 
4223   /* this will trigger saving of perf_context information */
4224   Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4225 
4226   if (tx != nullptr) {
4227     if (commit_tx || (!my_core::thd_test_options(
4228                          thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
4229       /*
4230         We get here
4231          - For a COMMIT statement that finishes a multi-statement transaction
4232          - For a statement that has its own transaction
4233       */
4234       if (tx->commit()) {
4235         DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
4236       }
4237     } else {
4238       /*
4239         We get here when committing a statement within a transaction.
4240       */
4241       tx->set_tx_failed(false);
4242       tx->make_stmt_savepoint_permanent();
4243     }
4244 
4245     if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4246       // For READ_COMMITTED, we release any existing snapshot so that we will
4247       // see any changes that occurred since the last statement.
4248       tx->release_snapshot();
4249     }
4250   }
4251 
4252   // `Add()` is implemented in a thread-safe manner.
4253   commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
4254 
4255   DBUG_RETURN(HA_EXIT_SUCCESS);
4256 }
4257 
4258 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
4259                             bool rollback_tx) {
4260   Rdb_transaction *&tx = get_tx_from_thd(thd);
4261   Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4262 
4263   if (tx != nullptr) {
4264     if (rollback_tx) {
4265       /*
4266         We get here, when
4267         - ROLLBACK statement is issued.
4268 
4269         Discard the changes made by the transaction
4270       */
4271       tx->rollback();
4272     } else {
4273       /*
4274         We get here when
4275         - a statement with AUTOCOMMIT=1 is being rolled back (because of some
4276           error)
4277         - a statement inside a transaction is rolled back
4278       */
4279 
4280       tx->rollback_stmt();
4281       tx->set_tx_failed(true);
4282     }
4283 
4284     if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4285       // For READ_COMMITTED, we release any existing snapshot so that we will
4286       // see any changes that occurred since the last statement.
4287       tx->release_snapshot();
4288     }
4289   }
4290   return HA_EXIT_SUCCESS;
4291 }
4292 
4293 static bool print_stats(THD *const thd, std::string const &type,
4294                         std::string const &name, std::string const &status,
4295                         stat_print_fn *stat_print) {
4296   return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
4297                     status.c_str(), status.size());
4298 }
4299 
4300 static std::string format_string(const char *const format, ...) {
4301   std::string res;
4302   va_list args;
4303   va_list args_copy;
4304   char static_buff[256];
4305 
4306   assert(format != nullptr);
4307 
4308   va_start(args, format);
4309   va_copy(args_copy, args);
4310 
4311   // Calculate how much space we will need
4312   int len = vsnprintf(nullptr, 0, format, args);
4313   va_end(args);
4314 
4315   if (len < 0) {
4316     res = std::string("<format error>");
4317   } else if (len == 0) {
4318     // Shortcut for an empty string
4319     res = std::string("");
4320   } else {
4321     // For short enough output use a static buffer
4322     char *buff = static_buff;
4323     std::unique_ptr<char[]> dynamic_buff = nullptr;
4324 
4325     len++;  // Add one for null terminator
4326 
4327     // for longer output use an allocated buffer
4328     if (static_cast<uint>(len) > sizeof(static_buff)) {
4329       dynamic_buff.reset(new char[len]);
4330       buff = dynamic_buff.get();
4331     }
4332 
4333     // Now re-do the vsnprintf with the buffer which is now large enough
4334     (void)vsnprintf(buff, len, format, args_copy);
4335 
4336     // Convert to a std::string.  Note we could have created a std::string
4337     // large enough and then converted the buffer to a 'char*' and created
4338     // the output in place.  This would probably work but feels like a hack.
4339     // Since this isn't code that needs to be super-performant we are going
4340     // with this 'safer' method.
4341     res = std::string(buff);
4342   }
4343 
4344   va_end(args_copy);
4345 
4346   return res;
4347 }
4348 
4349 class Rdb_snapshot_status : public Rdb_tx_list_walker {
4350  private:
4351   std::string m_data;
4352 
4353   static std::string current_timestamp(void) {
4354     static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
4355     time_t currtime;
4356     struct tm currtm;
4357 
4358     time(&currtime);
4359 
4360     localtime_r(&currtime, &currtm);
4361 
4362     return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
4363                          currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
4364                          currtm.tm_sec);
4365   }
4366 
4367   static std::string get_header(void) {
4368     return "\n============================================================\n" +
4369            current_timestamp() +
4370            " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4371            "============================================================\n"
4372            "---------\n"
4373            "SNAPSHOTS\n"
4374            "---------\n"
4375            "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
4376   }
4377 
4378   static std::string get_footer(void) {
4379     return "-----------------------------------------\n"
4380            "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4381            "=========================================\n";
4382   }
4383 
4384   static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
4385       const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
4386     Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
4387 
4388     txn_data.trx_id = txn.m_txn_id;
4389 
4390     txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id);
4391     if (txn_data.table_name.empty()) {
4392       txn_data.table_name =
4393           "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4394     }
4395 
4396     const auto &kd = ddl_manager.safe_find(gl_index_id);
4397     txn_data.index_name =
4398         (kd) ? kd->get_name()
4399              : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4400 
4401     std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
4402         cf_manager.get_cf(txn.m_cf_id);
4403 
4404     // Retrieve CF name from CF handle object, and it is safe if the CF is
4405     // removed from cf_manager at this point.
4406     txn_data.cf_name = (cfh)
4407                            ? cfh->GetName()
4408                            : "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id);
4409 
4410     txn_data.waiting_key =
4411         rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
4412 
4413     txn_data.exclusive_lock = txn.m_exclusive;
4414 
4415     return txn_data;
4416   }
4417 
4418   static Rdb_deadlock_info get_dl_path_trx_info(
4419       const rocksdb::DeadlockPath &path_entry) {
4420     Rdb_deadlock_info deadlock_info;
4421     deadlock_info.path.reserve(path_entry.path.size());
4422 
4423     for (const auto &txn : path_entry.path) {
4424       const GL_INDEX_ID gl_index_id = {
4425           txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
4426                            txn.m_waiting_key.c_str()))};
4427       deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
4428     }
4429     assert_IFF(path_entry.limit_exceeded, path_entry.path.empty());
4430     /* print the first txn in the path to display the full deadlock cycle */
4431     if (!path_entry.path.empty() && !path_entry.limit_exceeded) {
4432       const auto &deadlocking_txn = *(path_entry.path.end() - 1);
4433       deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id;
4434       deadlock_info.deadlock_time = path_entry.deadlock_time;
4435     }
4436     return deadlock_info;
4437   }
4438 
4439  public:
4440   Rdb_snapshot_status() : m_data(get_header()) {}
4441 
4442   std::string getResult() { return m_data + get_footer(); }
4443 
4444   /* Implement Rdb_tx_list_walker interface */
4445   /* Create one row in the snapshot status table */
4446   void process_tran(const Rdb_transaction *const tx) override {
4447     assert(tx != nullptr);
4448 
4449     /* Calculate the duration the snapshot has existed */
4450     int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
4451     if (snapshot_timestamp != 0) {
4452       int64_t curr_time;
4453       rdb->GetEnv()->GetCurrentTime(&curr_time);
4454 
4455       THD *thd = tx->get_thd();
4456       char buffer[1024];
4457       thd_security_context(thd, buffer, sizeof buffer, 0);
4458       m_data += format_string(
4459           "---SNAPSHOT, ACTIVE %lld sec\n"
4460           "%s\n"
4461           "lock count %llu, write count %llu\n",
4462           curr_time - snapshot_timestamp, buffer, tx->get_row_lock_count(),
4463           tx->get_write_count());
4464     }
4465   }
4466 
4467   std::vector<Rdb_deadlock_info> get_deadlock_info() {
4468     std::vector<Rdb_deadlock_info> deadlock_info;
4469     const auto &dlock_buffer = rdb->GetDeadlockInfoBuffer();
4470     for (const auto &path_entry : dlock_buffer) {
4471       if (!path_entry.limit_exceeded) {
4472         deadlock_info.push_back(get_dl_path_trx_info(path_entry));
4473       }
4474     }
4475     return deadlock_info;
4476   }
4477 };
4478 
4479 /**
4480  * @brief
4481  * walks through all non-replication transactions and copies
4482  * out relevant information for information_schema.rocksdb_trx
4483  */
4484 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
4485  private:
4486   std::vector<Rdb_trx_info> *m_trx_info;
4487 
4488  public:
4489   explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
4490       : m_trx_info(trx_info) {}
4491 
4492   void process_tran(const Rdb_transaction *const tx) override {
4493     static const std::map<int, std::string> state_map = {
4494         {rocksdb::Transaction::STARTED, "STARTED"},
4495         {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
4496         {rocksdb::Transaction::PREPARED, "PREPARED"},
4497         {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
4498         {rocksdb::Transaction::COMMITED, "COMMITED"},
4499         {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
4500         {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
4501     };
4502     static const size_t trx_query_max_len = 1024;  // length stolen from InnoDB
4503 
4504     assert(tx != nullptr);
4505 
4506     THD *const thd = tx->get_thd();
4507     const my_thread_id thread_id = thd->thread_id();
4508 
4509     if (tx->is_writebatch_trx()) {
4510       const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
4511       assert(wb_impl);
4512       m_trx_info->push_back(
4513           {"",                            /* name */
4514            0,                             /* trx_id */
4515            wb_impl->get_write_count(), 0, /* lock_count */
4516            0,                             /* timeout_sec */
4517            "",                            /* state */
4518            "",                            /* waiting_key */
4519            0,                             /* waiting_cf_id */
4520            1,                             /*is_replication */
4521            1,                             /* skip_trx_api */
4522            wb_impl->is_tx_read_only(), 0, /* deadlock detection */
4523            wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
4524     } else {
4525       const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
4526       assert(tx_impl);
4527       const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
4528 
4529       if (rdb_trx == nullptr) {
4530         return;
4531       }
4532 
4533       std::string query_str;
4534       query_str.resize(trx_query_max_len + 1);  // size (not just reserve) the buffer thd_query_safe() writes into
4535       size_t query_len = thd_query_safe(thd, &query_str[0], trx_query_max_len);
4536       query_str.resize(query_len);
4537 
4538       const auto state_it = state_map.find(rdb_trx->GetState());
4539       assert(state_it != state_map.end());
4540       const int is_replication = (thd->rli_slave != nullptr);
4541       uint32_t waiting_cf_id;
4542       std::string waiting_key;
4543       rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key);
4544 
4545       m_trx_info->push_back(
4546           {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
4547            tx_impl->get_row_lock_count(), tx_impl->get_timeout_sec(),
4548            state_it->second, waiting_key, waiting_cf_id, is_replication,
4549            0, /* skip_trx_api */
4550            tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
4551            tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
4552     }
4553   }
4554 };
4555 
4556 /*
4557   returns a vector of info for all non-replication threads
4558   for use by information_schema.rocksdb_trx
4559 */
4560 std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
4561   std::vector<Rdb_trx_info> trx_info;
4562   Rdb_trx_info_aggregator trx_info_agg(&trx_info);
4563   Rdb_transaction::walk_tx_list(&trx_info_agg);
4564   return trx_info;
4565 }
4566 
4567 /*
4568   returns a vector of info of recent deadlocks
4569   for use by information_schema.rocksdb_deadlock
4570 */
4571 std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() {
4572   Rdb_snapshot_status showStatus;
4573   Rdb_transaction::walk_tx_list(&showStatus);
4574   return showStatus.get_deadlock_info();
4575 }
4576 
4577 /*
4578   This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc.
4579 
4580   For now, produce info about live files (which gives an imprecise idea about
4581   what column families are there).
4582 */
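// For example, SHOW ENGINE ROCKSDB STATUS returns the STATISTICS, DBSTATS,
// CF_COMPACTION, MEMORY_STATS and BG_THREADS sections assembled below.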
4583 static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
4584                                 stat_print_fn *const stat_print,
4585                                 enum ha_stat_type stat_type) {
4586   assert(hton != nullptr);
4587   assert(thd != nullptr);
4588   assert(stat_print != nullptr);
4589 
4590   bool res = false;
4591   char buf[100] = {'\0'};
4592 
4593   if (stat_type == HA_ENGINE_STATUS) {
4594     assert(rdb != nullptr);
4595 
4596     std::string str;
4597 
4598     /* Global DB Statistics */
4599     if (rocksdb_stats) {
4600       str = rocksdb_stats->ToString();
4601 
4602       // Use the same format as internal RocksDB statistics entries to make
4603       // sure that output will look unified.
4604       assert(commit_latency_stats != nullptr);
4605 
4606       snprintf(buf, sizeof(buf),
4607                "rocksdb.commit_latency statistics "
4608                "Percentiles :=> 50 : %.2f 95 : %.2f "
4609                "99 : %.2f 100 : %.2f\n",
4610                commit_latency_stats->Percentile(50),
4611                commit_latency_stats->Percentile(95),
4612                commit_latency_stats->Percentile(99),
4613                commit_latency_stats->Percentile(100));
4614       str.append(buf);
4615 
4616       uint64_t v = 0;
4617 
4618       // Retrieve additional stalling related numbers from RocksDB and append
4619       // them to the buffer meant for displaying detailed statistics. The intent
4620       // here is to avoid adding another row to the query output because of
4621       // just two numbers.
4622       //
4623       // NB! We're replacing hyphens with underscores in output to better match
4624       // the existing naming convention.
4625       if (rdb->GetIntProperty("rocksdb.is-write-stopped", &v)) {
4626         snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %lu\n", v);
4627         str.append(buf);
4628       }
4629 
4630       if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) {
4631         snprintf(buf, sizeof(buf),
4632                  "rocksdb.actual_delayed_write_rate "
4633                  "COUNT : %lu\n",
4634                  v);
4635         str.append(buf);
4636       }
4637 
4638       res |= print_stats(thd, "STATISTICS", "rocksdb", str, stat_print);
4639     }
4640 
4641     /* Per DB stats */
4642     if (rdb->GetProperty("rocksdb.dbstats", &str)) {
4643       res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
4644     }
4645 
4646     /* Per column family stats */
4647     for (const auto &cf_name : cf_manager.get_cf_names()) {
4648       std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
4649           cf_manager.get_cf(cf_name);
4650       if (!cfh) {
4651         continue;
4652       }
4653 
4654       // Retrieve information from CF handle object.
4655       // Even if the CF is removed from CF_manager, the handle object
4656       // is valid.
4657       if (!rdb->GetProperty(cfh.get(), "rocksdb.cfstats", &str)) {
4658         continue;
4659       }
4660 
4661       res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
4662     }
4663 
4664     /* Memory Statistics */
4665     std::vector<rocksdb::DB *> dbs;
4666     std::unordered_set<const rocksdb::Cache *> cache_set;
4667     size_t internal_cache_count = 0;
4668     size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
4669 
4670     dbs.push_back(rdb);
4671     cache_set.insert(rocksdb_tbl_options->block_cache.get());
4672 
4673     for (const auto &cf_handle : cf_manager.get_all_cf()) {
4674       // It is safe if the CF handle is removed from cf_manager
4675       // at this point.
4676       rocksdb::ColumnFamilyDescriptor cf_desc;
4677       cf_handle->GetDescriptor(&cf_desc);
4678       auto *const table_factory = cf_desc.options.table_factory.get();
4679 
4680       if (table_factory != nullptr) {
4681         std::string tf_name = table_factory->Name();
4682 
4683         if (tf_name.find("BlockBasedTable") != std::string::npos) {
4684           const auto bbt_opt =
4685               table_factory->GetOptions<rocksdb::BlockBasedTableOptions>();
4686 
4687           if (bbt_opt != nullptr) {
4688             if (bbt_opt->block_cache.get() != nullptr) {
4689               cache_set.insert(bbt_opt->block_cache.get());
4690             } else {
4691               internal_cache_count++;
4692             }
4693             cache_set.insert(bbt_opt->block_cache_compressed.get());
4694           }
4695         }
4696       }
4697     }
4698 
4699     std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
4700     str.clear();
4701     rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
4702                                                          &temp_usage_by_type);
4703 
4704     snprintf(buf, sizeof(buf), "\nMemTable Total: %lu",
4705              temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
4706     str.append(buf);
4707     snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %lu",
4708              temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
4709     str.append(buf);
4710     snprintf(buf, sizeof(buf), "\nTable Readers Total: %lu",
4711              temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
4712     str.append(buf);
4713     snprintf(buf, sizeof(buf), "\nCache Total: %lu",
4714              temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
4715     str.append(buf);
4716     snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %lu",
4717              internal_cache_count * kDefaultInternalCacheSize);
4718     str.append(buf);
4719     snprintf(buf, sizeof(buf), "\nCache Capacity: %lu",
4720              (uint64_t)rocksdb_block_cache_size);
4721     str.append(buf);
4722     res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print);
4723 
4724     /* Show the background thread status */
4725     std::vector<rocksdb::ThreadStatus> thread_list;
4726     rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list);
4727 
4728     if (!s.ok()) {
4729       // NO_LINT_DEBUG
4730       sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n",
4731                       s.ToString().c_str());
4732       res |= true;
4733     } else {
4734       /* For each background thread retrieved, print out its information */
4735       for (auto &it : thread_list) {
4736         /* Only look at background threads. Ignore user threads, if any. */
4737         if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) {
4738           continue;
4739         }
4740 
4741         str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) +
4742               "\ncf_name: " + it.cf_name +
4743               "\noperation_type: " + it.GetOperationName(it.operation_type) +
4744               "\noperation_stage: " +
4745               it.GetOperationStageName(it.operation_stage) +
4746               "\nelapsed_time_ms: " + it.MicrosToString(it.op_elapsed_micros);
4747 
4748         for (auto &it_props : it.InterpretOperationProperties(
4749                  it.operation_type, it.op_properties)) {
4750           str += "\n" + it_props.first + ": " + std::to_string(it_props.second);
4751         }
4752 
4753         str += "\nstate_type: " + it.GetStateName(it.state_type);
4754 
4755         res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), str,
4756                            stat_print);
4757       }
4758     }
4759   }
4760 
4761   return res;
4762 }
4763 
4764 static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
4765                                        Rdb_transaction *const tx) {
4766   assert(tx != nullptr);
4767 
4768   trans_register_ha(thd, false, rocksdb_hton, nullptr);
4769   if (rocksdb_write_policy == rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED) {
4770     // Some internal operations will call trans_register_ha, but they do not
4771     // go through 2pc. In this case, the xid is set with query_id == 0, which
4772     // means that rocksdb will receive transactions with duplicate names.
4773     //
4774     // Skip setting name in these cases.
4775     if (thd->query_id != 0) {
4776       tx->set_name();
4777     }
4778   }
4779   if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
4780     tx->start_stmt();
4781     trans_register_ha(thd, true, rocksdb_hton, nullptr);
4782   }
4783 }
4784 
4785 /*
4786     Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
4787 
4788     - START TRANSACTION WITH CONSISTENT SNAPSHOT
4789     takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
4790     participate in transaction. When executing COMMIT, both InnoDB and
4791     RocksDB modifications are committed. Remember that XA is not supported yet,
4792     so mixing engines is not recommended anyway.
4793 */
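// A minimal client-side sketch of the statement handled here:
//
//   SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
//   START TRANSACTION WITH CONSISTENT SNAPSHOT;  -- snapshot acquired here
//   SELECT ... ;                                 -- reads from that snapshot
//   COMMIT;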
4794 static int rocksdb_start_tx_and_assign_read_view(
4795     handlerton *const hton, /*!< in: RocksDB handlerton */
4796     THD *const thd)         /*!< in: MySQL thread handle of the
4797                             user for whom the transaction should
4798                             be committed */
4799 {
4800   ulong const tx_isolation = my_core::thd_tx_isolation(thd);
4801 
4802   Rdb_transaction *tx = get_or_create_tx(thd);
4803   Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4804 
4805   assert(!tx->has_snapshot());
4806   tx->set_tx_read_only(true);
4807   rocksdb_register_tx(hton, thd, tx);
4808 
4809   if (tx_isolation == ISO_REPEATABLE_READ) {
4810     tx->acquire_snapshot(true);
4811   } else {
4812     push_warning_printf(thd, Sql_condition::SL_WARNING, HA_ERR_UNSUPPORTED,
4813                         "RocksDB: Only REPEATABLE READ isolation level is "
4814                         "supported for START TRANSACTION WITH CONSISTENT "
4815                         "SNAPSHOT in RocksDB Storage Engine. Snapshot has not "
4816                         "been taken.");
4817   }
4818   return HA_EXIT_SUCCESS;
4819 }
4820 
4821 /* Dummy SAVEPOINT support. This is needed for long running transactions
4822  * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
4823  * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
4824  * errors. This needs to be addressed in future versions (Issue#96).
4825  */
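// In other words: SAVEPOINT succeeds unconditionally, and ROLLBACK TO SAVEPOINT
// is not guaranteed to undo the changes made after it; the stubs mainly keep
// long-running clients that issue savepoints (such as mysqldump) working.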
4826 static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
4827                              void *const savepoint) {
4828   return HA_EXIT_SUCCESS;
4829 }
4830 
4831 static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
4832                                          void *const savepoint) {
4833   Rdb_transaction *&tx = get_tx_from_thd(thd);
4834   return tx->rollback_to_savepoint(savepoint);
4835 }
4836 
4837 static bool rocksdb_rollback_to_savepoint_can_release_mdl(
4838     handlerton *const hton, THD *const thd) {
4839   return true;
4840 }
4841 
4842 static rocksdb::Status check_rocksdb_options_compatibility(
4843     const char *const dbpath, const rocksdb::Options &main_opts,
4844     const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
4845   assert(rocksdb_datadir != nullptr);
4846 
4847   rocksdb::DBOptions loaded_db_opt;
4848   std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
4849   rocksdb::Status status =
4850       LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt,
4851                         &loaded_cf_descs, rocksdb_ignore_unknown_options);
4852 
4853   // If we're starting from scratch and there are no options saved yet then this
4854   // is a valid case. Therefore we can't compare the current set of options to
4855   // anything.
4856   if (status.IsNotFound()) {
4857     return rocksdb::Status::OK();
4858   }
4859 
4860   if (!status.ok()) {
4861     return status;
4862   }
4863 
4864   if (loaded_cf_descs.size() != cf_descr.size()) {
4865     return rocksdb::Status::NotSupported(
4866         "Mismatched size of column family "
4867         "descriptors.");
4868   }
4869 
4870   // Please see RocksDB documentation for more context about why we need to set
4871   // user-defined functions and pointer-typed options manually.
4872   for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
4873     loaded_cf_descs[i].options.compaction_filter =
4874         cf_descr[i].options.compaction_filter;
4875     loaded_cf_descs[i].options.compaction_filter_factory =
4876         cf_descr[i].options.compaction_filter_factory;
4877     loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
4878     loaded_cf_descs[i].options.memtable_factory =
4879         cf_descr[i].options.memtable_factory;
4880     loaded_cf_descs[i].options.merge_operator =
4881         cf_descr[i].options.merge_operator;
4882     loaded_cf_descs[i].options.prefix_extractor =
4883         cf_descr[i].options.prefix_extractor;
4884     loaded_cf_descs[i].options.table_factory =
4885         cf_descr[i].options.table_factory;
4886   }
4887 
4888   // This is the essence of the function - determine if it's safe to open the
4889   // database or not.
4890   status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
4891                                      loaded_cf_descs,
4892                                      rocksdb_ignore_unknown_options);
4893 
4894   return status;
4895 }
4896 
4897 static uint rocksdb_partition_flags() { return (HA_CANNOT_PARTITION_FK); }
4898 
4899 /* Clean up tables leftover from truncation */
4900 void rocksdb_truncation_table_cleanup(void) {
4901   /* Scan for tables that have the truncation prefix */
4902   struct Rdb_truncate_tbls : public Rdb_tables_scanner {
4903    public:
4904     std::vector<Rdb_tbl_def *> m_tbl_list;
4905     int add_table(Rdb_tbl_def *tdef) override {
4906       assert(tdef != nullptr);
4907       if (tdef->base_tablename().find(TRUNCATE_TABLE_PREFIX) !=
4908           std::string::npos) {
4909         m_tbl_list.push_back(tdef);
4910       }
4911       return HA_EXIT_SUCCESS;
4912     }
4913   } collector;
4914   ddl_manager.scan_for_tables(&collector);
4915 
4916   /*
4917     For now, delete any table found. It's possible to rename them back,
4918     but the rename could potentially lead to other inconsistencies.
4919     Removing the old table (which is being truncated anyway) seems to be the
4920     safest solution.
4921   */
4922   ha_rocksdb table(rocksdb_hton, nullptr);
4923   for (Rdb_tbl_def *tbl_def : collector.m_tbl_list) {
4924     // NO_LINT_DEBUG
4925     sql_print_warning("MyRocks: Removing truncated leftover table %s",
4926                       tbl_def->full_tablename().c_str());
4927     table.delete_table(tbl_def);
4928   }
4929 }
4930 
4931 /*
4932   Storage Engine initialization function, invoked when plugin is loaded.
4933 */
4934 
4935 static int rocksdb_init_func(void *const p) {
4936   DBUG_ENTER_FUNC();
4937 
4938   if (rdb_check_rocksdb_corruption()) {
4939     sql_print_error(
4940         "RocksDB: There was corruption detected in the RockDB data"
4941         "files. Check error log emitted earlier for more details.");
4942     if (rocksdb_allow_to_start_after_corruption) {
4943       sql_print_information(
4944           "RocksDB: Set rocksdb_allow_to_start_after_corruption=0 to prevent "
4945           "server from starting when RocksDB data corruption is detected.");
4946     } else {
4947       sql_print_error(
4948           "RocksDB: The server will exit normally and stop restart "
4949           "attempts. Remove %s file from data directory and "
4950           "start mysqld manually.",
4951           rdb_corruption_marker_file_name().c_str());
4952       exit(0);
4953     }
4954   }
4955 
4956   // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
4957   static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");
4958 
4959   // Lock the handlertons initialized status flag for writing
4960   Rdb_hton_init_state::Scoped_lock state_lock(*rdb_get_hton_init_state(), true);
4961   SHIP_ASSERT(!rdb_get_hton_init_state()->initialized());
4962 
4963   init_rocksdb_psi_keys();
4964 
4965   rocksdb_hton = (handlerton *)p;
4966 
4967   rdb_open_tables.init();
4968   Ensure_cleanup rdb_open_tables_cleanup([]() { rdb_open_tables.free(); });
4969 
4970 #ifdef HAVE_PSI_INTERFACE
4971   rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
4972   rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
4973                            rdb_signal_drop_idx_psi_cond_key);
4974   rdb_is_thread.init(rdb_signal_is_psi_mutex_key, rdb_signal_is_psi_cond_key);
4975   rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key);
4976 #else
4977   rdb_bg_thread.init();
4978   rdb_drop_idx_thread.init();
4979   rdb_is_thread.init();
4980   rdb_mc_thread.init();
4981 #endif
4982   mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
4983                    MY_MUTEX_INIT_FAST);
4984   mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
4985                    MY_MUTEX_INIT_FAST);
4986 
4987 #if defined(HAVE_PSI_INTERFACE)
4988   rdb_collation_exceptions = new Regex(key_rwlock_collation_exception_list);
4989 #else
4990   rdb_collation_exceptions = new Regex();
4991 #endif
4992 
4993   mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
4994                    MY_MUTEX_INIT_FAST);
4995   mysql_mutex_init(rdb_block_cache_resize_mutex_key,
4996                    &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST);
4997   mysql_mutex_init(rdb_bottom_pri_background_compactions_resize_mutex_key,
4998                    &rdb_bottom_pri_background_compactions_resize_mutex,
4999                    MY_MUTEX_INIT_FAST);
5000   Rdb_transaction::init_mutex();
5001 
5002   rocksdb_hton->state = SHOW_OPTION_YES;
5003   rocksdb_hton->create = rocksdb_create_handler;
5004   rocksdb_hton->close_connection = rocksdb_close_connection;
5005   rocksdb_hton->prepare = rocksdb_prepare;
5006   rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
5007   rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
5008   rocksdb_hton->recover = rocksdb_recover;
5009   rocksdb_hton->commit = rocksdb_commit;
5010   rocksdb_hton->rollback = rocksdb_rollback;
5011   rocksdb_hton->db_type = DB_TYPE_ROCKSDB;
5012   rocksdb_hton->show_status = rocksdb_show_status;
5013   rocksdb_hton->start_consistent_snapshot =
5014       rocksdb_start_tx_and_assign_read_view;
5015   rocksdb_hton->savepoint_set = rocksdb_savepoint;
5016   rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
5017   rocksdb_hton->savepoint_rollback_can_release_mdl =
5018       rocksdb_rollback_to_savepoint_can_release_mdl;
5019   rocksdb_hton->flush_logs = rocksdb_flush_wal;
5020 
5021   rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
5022                         HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;
5023 
5024   if (rocksdb_enable_native_partition)
5025     rocksdb_hton->partition_flags = rocksdb_partition_flags;
5026 
5027   assert(!mysqld_embedded);
5028 
5029   if (rocksdb_db_options->max_open_files > (long)open_files_limit) {
5030     sql_print_information(
5031         "RocksDB: rocksdb_max_open_files should not be "
5032         "greater than the open_files_limit, effective value "
5033         "of rocksdb_max_open_files is being set to "
5034         "open_files_limit / 2.");
5035     rocksdb_db_options->max_open_files = open_files_limit / 2;
5036   } else if (rocksdb_db_options->max_open_files == -2) {
5037     rocksdb_db_options->max_open_files = open_files_limit / 2;
5038   }
5039 
5040   rdb_read_free_regex_handler.compile(DEFAULT_READ_FREE_RPL_TABLES,
5041                                       get_regex_flags(), table_alias_charset);
5042 
5043   rocksdb_stats = rocksdb::CreateDBStatistics();
5044   rocksdb_stats->set_stats_level(
5045       static_cast<rocksdb::StatsLevel>(rocksdb_stats_level));
5046   rocksdb_stats_level = rocksdb_stats->get_stats_level();
5047   rocksdb_db_options->statistics = rocksdb_stats;
5048 
5049   if (rocksdb_rate_limiter_bytes_per_sec != 0) {
5050     rocksdb_rate_limiter.reset(
5051         rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
5052     rocksdb_db_options->rate_limiter = rocksdb_rate_limiter;
5053   }
5054 
5055   rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate;
5056 
5057   std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
5058   rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
5059       rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log);
5060   if (s.ok()) {
5061     myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log);
5062   }
5063 
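  // Install the MyRocks logger as RocksDB's info log. It wraps the file-based
  // logger created above (when that succeeded), so RocksDB log messages can
  // also be routed through MyRocks.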
5064   rocksdb_db_options->info_log = myrocks_logger;
5065   myrocks_logger->SetInfoLogLevel(
5066       static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
5067   rocksdb_db_options->wal_dir = rocksdb_wal_dir;
5068 
5069   rocksdb_db_options->wal_recovery_mode =
5070       static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);
5071 
5072   rocksdb_db_options->track_and_verify_wals_in_manifest =
5073       rocksdb_track_and_verify_wals_in_manifest;
5074 
5075   rocksdb_db_options->access_hint_on_compaction_start =
5076       static_cast<rocksdb::Options::AccessHint>(
5077           rocksdb_access_hint_on_compaction_start);
5078 
5079   if (rocksdb_db_options->allow_mmap_reads &&
5080       rocksdb_db_options->use_direct_reads) {
5081     // allow_mmap_reads implies !use_direct_reads; RocksDB will refuse to open
5082     // if both options are enabled.  (NO_LINT_DEBUG)
5083     sql_print_error(
5084         "RocksDB: Can't enable both use_direct_reads "
5085         "and allow_mmap_reads\n");
5086     DBUG_RETURN(HA_EXIT_FAILURE);
5087   }
5088 
5089   // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT
5090   if (rocksdb_db_options->use_direct_reads ||
5091       rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5092     rocksdb::EnvOptions soptions;
5093     rocksdb::Status check_status;
5094     rocksdb::Env *const env = rocksdb_db_options->env;
5095 
5096     std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir);
5097     if (env->FileExists(fname).ok()) {
5098       std::unique_ptr<rocksdb::SequentialFile> file;
5099       soptions.use_direct_reads = true;
5100       check_status = env->NewSequentialFile(fname, &file, soptions);
5101     } else {
5102       std::unique_ptr<rocksdb::WritableFile> file;
5103       soptions.use_direct_writes = true;
5104       check_status = env->ReopenWritableFile(fname, &file, soptions);
5105       if (file != nullptr) {
5106         file->Close();
5107       }
5108       env->DeleteFile(fname);
5109     }
5110 
5111     if (!check_status.ok()) {
5112       sql_print_error(
5113           "RocksDB: Unable to use direct io in rocksdb-datadir:"
5114           "(%s)",
5115           check_status.getState());
5116       DBUG_RETURN(HA_EXIT_FAILURE);
5117     }
5118   }
5119 
5120   if (rocksdb_db_options->allow_mmap_writes &&
5121       rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5122     // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
5123     sql_print_error(
5124         "RocksDB: Can't enable both "
5125         "use_direct_io_for_flush_and_compaction and "
5126         "allow_mmap_writes\n");
5127     DBUG_RETURN(HA_EXIT_FAILURE);
5128   }
5129 
5130   if (rocksdb_db_options->allow_mmap_writes &&
5131       rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
5132     // NO_LINT_DEBUG
5133     sql_print_error(
5134         "RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 "
5135         "to use allow_mmap_writes");
5136     DBUG_RETURN(HA_EXIT_FAILURE);
5137   }
5138 
5139   // sst_file_manager will move deleted rocksdb sst files to trash_dir
5140   // to be deleted in a background thread.
5141   std::string trash_dir = std::string(rocksdb_datadir) + "/trash";
5142   rocksdb_db_options->sst_file_manager.reset(NewSstFileManager(
5143       rocksdb_db_options->env, myrocks_logger, trash_dir,
5144       rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */));
5145 
5146   std::vector<std::string> cf_names;
5147   rocksdb::Status status;
5148   status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir,
5149                                            &cf_names);
5150   if (!status.ok()) {
5151     /*
5152       When we start on an empty datadir, ListColumnFamilies returns IOError,
5153       and RocksDB doesn't provide any way to check what kind of error it was.
5154       Checking system errno happens to work right now.
5155     */
5156     if (status.IsIOError() && errno == ENOENT) {
5157       // NO_LINT_DEBUG
5158       sql_print_information("RocksDB: Got ENOENT when listing column families");
5159 
5160       // NO_LINT_DEBUG
5161       sql_print_information(
5162           "RocksDB:   assuming that we're creating a new database");
5163     } else {
5164       rdb_log_status_error(status, "Error listing column families");
5165       DBUG_RETURN(HA_EXIT_FAILURE);
5166     }
5167   } else {
5168     // NO_LINT_DEBUG
5169     sql_print_information("RocksDB: %ld column families found",
5170                           cf_names.size());
5171   }
5172 
5173   std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
5174   std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;
5175 
5176   rocksdb_tbl_options->index_type =
5177       (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;
5178 
5179   if (!rocksdb_tbl_options->no_block_cache) {
5180     std::shared_ptr<rocksdb::MemoryAllocator> memory_allocator;
5181     if (!rocksdb_cache_dump) {
5182 #ifdef HAVE_JEMALLOC
5183       size_t block_size = rocksdb_tbl_options->block_size;
5184       rocksdb::JemallocAllocatorOptions alloc_opt;
5185       // Limit jemalloc tcache memory usage. The range
5186       // [block_size/4, block_size] should be enough to cover most of
5187       // block cache allocation sizes.
5188       alloc_opt.limit_tcache_size = true;
5189       alloc_opt.tcache_size_lower_bound = block_size / 4;
5190       alloc_opt.tcache_size_upper_bound = block_size;
5191       rocksdb::Status new_alloc_status =
5192           rocksdb::NewJemallocNodumpAllocator(alloc_opt, &memory_allocator);
5193       if (!new_alloc_status.ok()) {
5194         // The block cache could not be excluded from core dumps; fail startup.
5195         rdb_log_status_error(new_alloc_status,
5196                              "Error excluding block cache from core dump");
5197         memory_allocator = nullptr;
5198         DBUG_RETURN(HA_EXIT_FAILURE);
5199       }
5200 #else
5201       // NO_LINT_DEBUG
5202       sql_print_warning(
5203           "Ignoring rocksdb_cache_dump because jemalloc is missing.");
5204 #endif  // HAVE_JEMALLOC
5205     }
5206     std::shared_ptr<rocksdb::Cache> block_cache = rocksdb::NewLRUCache(
5207         rocksdb_block_cache_size, -1 /*num_shard_bits*/,
5208         false /*strict_capacity_limit*/, rocksdb_cache_high_pri_pool_ratio,
5209         memory_allocator);
5210     if (rocksdb_sim_cache_size > 0) {
5211       // Simulated cache enabled
5212       // Wrap block cache inside a simulated cache and pass it to RocksDB
5213       rocksdb_tbl_options->block_cache =
5214           rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6);
5215     } else {
5216       // Pass block cache to RocksDB
5217       rocksdb_tbl_options->block_cache = block_cache;
5218     }
5219   }
5220 
5221   if (rocksdb_collect_sst_properties) {
5222     properties_collector_factory =
5223         std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);
5224 
5225     rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);
5226 
5227     RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
5228 
5229     assert(rocksdb_table_stats_sampling_pct <= RDB_TBL_STATS_SAMPLE_PCT_MAX);
5231     properties_collector_factory->SetTableStatsSamplingPct(
5232         rocksdb_table_stats_sampling_pct);
5233 
5234     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
5235   }
5236 
5237   if (rocksdb_persistent_cache_size_mb > 0) {
5238     std::shared_ptr<rocksdb::PersistentCache> pcache;
5239     uint64_t cache_size_bytes = rocksdb_persistent_cache_size_mb * 1024 * 1024;
5240     status = rocksdb::NewPersistentCache(
5241         rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
5242         cache_size_bytes, myrocks_logger, true, &pcache);
5243     if (!status.ok()) {
5244       // NO_LINT_DEBUG
5245       sql_print_error("RocksDB: Persistent cache returned error: (%s)",
5246                       status.getState());
5247       DBUG_RETURN(HA_EXIT_FAILURE);
5248     }
5249     rocksdb_tbl_options->persistent_cache = pcache;
5250   } else if (strlen(rocksdb_persistent_cache_path)) {
5251     // NO_LINT_DEBUG
5252     sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb");
5253     DBUG_RETURN(HA_EXIT_FAILURE);
5254   }
5255 
5256   std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options());
5257   if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory,
5258                             rocksdb_default_cf_options,
5259                             rocksdb_override_cf_options)) {
5260     // NO_LINT_DEBUG
5261     sql_print_error("RocksDB: Failed to initialize CF options map.");
5262     DBUG_RETURN(HA_EXIT_FAILURE);
5263   }
5264 
5265   /*
5266     If there are no column families, we're creating the new database.
5267     Create one column family named "default".
5268   */
5269   if (cf_names.size() == 0) cf_names.push_back(DEFAULT_CF_NAME);
5270 
5271   std::vector<int> compaction_enabled_cf_indices;
5272 
5273   // NO_LINT_DEBUG
5274   sql_print_information("RocksDB: Column Families at start:");
5275   for (size_t i = 0; i < cf_names.size(); ++i) {
5276     rocksdb::ColumnFamilyOptions opts;
5277     cf_options_map->get_cf_options(cf_names[i], &opts);
5278 
5279     // NO_LINT_DEBUG
5280     sql_print_information("  cf=%s", cf_names[i].c_str());
5281 
5282     // NO_LINT_DEBUG
5283     sql_print_information("    write_buffer_size=%ld", opts.write_buffer_size);
5284 
5285     // NO_LINT_DEBUG
5286     sql_print_information("    target_file_size_base=%" PRIu64,
5287                           opts.target_file_size_base);
5288 
5289     /*
5290       Temporarily disable compactions to prevent a race condition where
5291       compaction starts before compaction filter is ready.
5292     */
5293     if (!opts.disable_auto_compactions) {
5294       compaction_enabled_cf_indices.push_back(i);
5295       opts.disable_auto_compactions = true;
5296     }
5297     cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
5298   }
5299 
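  // Combine the DB-wide options with the default column family options; this
  // merged Options object is used for the compatibility check and for Open().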
5300   rocksdb::Options main_opts(*rocksdb_db_options,
5301                              cf_options_map->get_defaults());
5302 
5303   rocksdb::TransactionDBOptions tx_db_options;
5304   tx_db_options.transaction_lock_timeout = 2000;  // 2 seconds
5305   tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
5306   tx_db_options.write_policy =
5307       static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
5308 
5309   status =
5310       check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
5311 
5312   // We won't start if we determine that there's a chance of data corruption
5313   // because of incompatible options.
5314   if (!status.ok()) {
5315     rdb_log_status_error(
5316         status, "Compatibility check against existing database options failed");
5317     DBUG_RETURN(HA_EXIT_FAILURE);
5318   }
5319 
5320   // NO_LINT_DEBUG
5321   sql_print_information("RocksDB: Opening TransactionDB...");
5322 
5323   status = rocksdb::TransactionDB::Open(
5324       main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);
5325 
5326   if (!status.ok()) {
5327     rdb_log_status_error(status, "Error opening instance");
5328     DBUG_RETURN(HA_EXIT_FAILURE);
5329   }
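  // Hand the CF options map and the column family handles returned by Open()
  // over to the column family manager.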
5330   cf_manager.init(std::move(cf_options_map), &cf_handles);
5331 
5332   // NO_LINT_DEBUG
5333   sql_print_information("RocksDB: Initializing data dictionary...");
5334 
5335   if (st_rdb_exec_time.exec("Rdb_dict_manager::init", [&]() {
5336         return dict_manager.init(rdb, &cf_manager,
5337                                  rocksdb_enable_remove_orphaned_dropped_cfs);
5338       })) {
5339     // NO_LINT_DEBUG
5340     sql_print_error("RocksDB: Failed to initialize data dictionary.");
5341     DBUG_RETURN(HA_EXIT_FAILURE);
5342   }
5343 
5344   sql_print_information("RocksDB: Initializing DDL Manager...");
5345 
5346   if (st_rdb_exec_time.exec("Rdb_ddl_manager::init", [&]() {
5347 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
5348         return ddl_manager.init(&dict_manager, &cf_manager,
5349                                 rocksdb_validate_tables);
5350 #else
5351         return ddl_manager.init(&dict_manager, &cf_manager);
5352 #endif // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
5353       })) {
5354     // NO_LINT_DEBUG
5355     sql_print_error("RocksDB: Failed to initialize DDL manager.");
5356     DBUG_RETURN(HA_EXIT_FAILURE);
5357   }
5358 
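  // Make sure every column family has flags recorded in the data dictionary;
  // any CF without stored flags gets a default value of 0.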
5359   for (const auto &cf_handle : cf_manager.get_all_cf()) {
5360     uint flags;
5361     if (!dict_manager.get_cf_flags(cf_handle->GetID(), &flags)) {
5362       const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
5363       rocksdb::WriteBatch *const batch = wb.get();
5364       dict_manager.add_cf_flags(batch, cf_handle->GetID(), 0);
5365       dict_manager.commit(batch);
5366     }
5367   }
5368 
5369   Rdb_sst_info::init(rdb);
5370 
5371   /*
5372     Enable auto compaction now that everything the compaction filter needs
5373     has finished initializing.
5374   */
5375   std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
5376   compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
5377   for (const auto &index : compaction_enabled_cf_indices) {
5378     compaction_enabled_cf_handles.push_back(cf_handles[index]);
5379   }
5380 
5381   status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);
5382 
5383   if (!status.ok()) {
5384     rdb_log_status_error(status, "Error enabling compaction");
5385     DBUG_RETURN(HA_EXIT_FAILURE);
5386   }
5387 
5388   auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME
5389 #ifdef HAVE_PSI_INTERFACE
5390                                          ,
5391                                          rdb_background_psi_thread_key
5392 #endif
5393   );
5394   if (err != 0) {
5395     // NO_LINT_DEBUG
5396     sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
5397                     err);
5398     DBUG_RETURN(HA_EXIT_FAILURE);
5399   }
5400 
5401   err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME
5402 #ifdef HAVE_PSI_INTERFACE
5403                                           ,
5404                                           rdb_drop_idx_psi_thread_key
5405 #endif
5406   );
5407   if (err != 0) {
5408     sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
5409                     err);
5410     DBUG_RETURN(HA_EXIT_FAILURE);
5411   }
5412 
5413 #ifndef HAVE_PSI_INTERFACE
5414   err = rdb_is_thread.create_thread(INDEX_STATS_THREAD_NAME);
5415 #else
5416   err = rdb_is_thread.create_thread(INDEX_STATS_THREAD_NAME,
5417                                     rdb_is_psi_thread_key);
5418 #endif
5419   if (err != 0) {
5420     // NO_LINT_DEBUG
5421     sql_print_error(
5422         "RocksDB: Couldn't start the index stats calculation thread: "
5423         "(errno=%d)",
5424         err);
5425     DBUG_RETURN(HA_EXIT_FAILURE);
5426   }
5427 
5428   err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME
5429 #ifdef HAVE_PSI_INTERFACE
5430                                     ,
5431                                     rdb_mc_psi_thread_key
5432 #endif
5433   );
5434   if (err != 0) {
5435     // NO_LINT_DEBUG
5436     sql_print_error(
5437         "RocksDB: Couldn't start the manual compaction thread: (errno=%d)",
5438         err);
5439     DBUG_RETURN(HA_EXIT_FAILURE);
5440   }
5441 
5442   rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);
5443 
5444   if (rocksdb_pause_background_work) {
5445     rdb->PauseBackgroundWork();
5446   }
5447 
5448   err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST,
5449                           HA_ERR_ROCKSDB_LAST);
5450   if (err != 0) {
5451     // NO_LINT_DEBUG
5452     sql_print_error("RocksDB: Couldn't initialize error messages");
5453     DBUG_RETURN(HA_EXIT_FAILURE);
5454   }
5455 
5456   // Creating an instance of HistogramImpl should only happen after RocksDB
5457   // has been successfully initialized.
5458   commit_latency_stats = new rocksdb::HistogramImpl();
5459 
5460   // succeeded, set the init status flag
5461   rdb_get_hton_init_state()->set_initialized(true);
5462 
5463   // Remove tables that may have been leftover during truncation
5464   rocksdb_truncation_table_cleanup();
5465 
5466   // NO_LINT_DEBUG
5467   sql_print_information(
5468       "MyRocks storage engine plugin has been successfully "
5469       "initialized.");
5470 
5471   st_rdb_exec_time.report();
5472 
5473   // Skip cleaning up rdb_open_tables as we've succeeded
5474   rdb_open_tables_cleanup.skip();
5475 
5476   rocksdb_set_max_bottom_pri_background_compactions_internal(
5477       rocksdb_max_bottom_pri_background_compactions);
5478 
5479   DBUG_RETURN(HA_EXIT_SUCCESS);
5480 }
5481 
5482 /*
5483   Storage Engine deinitialization function, invoked when plugin is unloaded.
5484 */
5485 
5486 static int rocksdb_done_func(void *const p) {
5487   DBUG_ENTER_FUNC();
5488 
5489   int error = 0;
5490 
5491   // If we finalize the storage engine plugin, it is no longer initialized.
5492   // Grab a writer lock for the duration of the call, so we can clear the flag
5493   // and destroy the handlerton and global state in isolation.
5494   Rdb_hton_init_state::Scoped_lock state_lock(*rdb_get_hton_init_state(), true);
5495   SHIP_ASSERT(rdb_get_hton_init_state()->initialized());
5496 
5497   // signal the drop index thread to stop
5498   rdb_drop_idx_thread.signal(true);
5499 
5500   // Flush all memtables so that no data is lost, even if the WAL is disabled.
5501   rocksdb_flush_all_memtables();
5502 
5503   // Stop all rocksdb background work
5504   CancelAllBackgroundWork(rdb->GetBaseDB(), true);
5505 
5506   // Signal the background thread to stop and to persist all stats collected
5507   // from background flushes and compactions. This will add more keys to a new
5508   // memtable, but since the memtables were just flushed, it should not trigger
5509   // a flush that can stall due to background threads being stopped. As long
5510   // as these keys are stored in a WAL file, they can be retrieved on restart.
5511   rdb_bg_thread.signal(true);
5512 
5513   // signal the index stats calculation thread to stop
5514   rdb_is_thread.signal(true);
5515 
5516   // signal the manual compaction thread to stop
5517   rdb_mc_thread.signal(true);
5518 
5519   // Wait for the background thread to finish.
5520   auto err = rdb_bg_thread.join();
5521   if (err != 0) {
5522     // Log the error and continue: we are shutting down anyway, so carrying on
5523     // with the cleanup is the best we can do.
5524     // NO_LINT_DEBUG
5525     sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
5526                     err);
5527   }
5528 
5529   // Wait for the drop index thread to finish.
5530   err = rdb_drop_idx_thread.join();
5531   if (err != 0) {
5532     // NO_LINT_DEBUG
5533     sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
5534   }
5535 
5536   // Wait for the index stats calculation thread to finish.
5537   err = rdb_is_thread.join();
5538   if (err != 0) {
5539     // NO_LINT_DEBUG
5540     sql_print_error(
5541         "RocksDB: Couldn't stop the index stats calculation thread: (errno=%d)",
5542         err);
5543   }
5544 
5545   // Wait for the manual compaction thread to finish.
5546   err = rdb_mc_thread.join();
5547   if (err != 0) {
5548     // NO_LINT_DEBUG
5549     sql_print_error(
5550         "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err);
5551   }
5552 
5553   if (rdb_open_tables.count()) {
5554     // Looks like we are getting unloaded and yet we have some open tables
5555     // left behind.
5556     error = 1;
5557   }
5558 
5559   rdb_open_tables.free();
5560   mysql_mutex_destroy(&rdb_sysvars_mutex);
5561   mysql_mutex_destroy(&rdb_block_cache_resize_mutex);
5562   mysql_mutex_destroy(&rdb_bottom_pri_background_compactions_resize_mutex);
5563 
5564   delete rdb_collation_exceptions;
5565   mysql_mutex_destroy(&rdb_collation_data_mutex);
5566   mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);
5567 
5568   Rdb_transaction::term_mutex();
5569 
5570   for (auto &it : rdb_collation_data) {
5571     delete it;
5572     it = nullptr;
5573   }
5574 
5575   ddl_manager.cleanup();
5576   dict_manager.cleanup();
5577   cf_manager.cleanup();
5578 
5579   delete rdb;
5580   rdb = nullptr;
5581 
5582   delete commit_latency_stats;
5583   commit_latency_stats = nullptr;
5584 
5585 // Disown the cache data since we're shutting down.
5586 // This leaks the cache memory but improves shutdown time.
5587 // Don't disown when running under valgrind or ASAN.
5588 #if !defined(HAVE_VALGRIND) && !defined(HAVE_ASAN)
5589   if (rocksdb_tbl_options->block_cache) {
5590     rocksdb_tbl_options->block_cache->DisownData();
5591   }
5592 #endif  // !defined(HAVE_VALGRIND) && !defined(HAVE_ASAN)
5593 
5594   rocksdb_db_options = nullptr;
5595   rocksdb_tbl_options = nullptr;
5596   rocksdb_stats = nullptr;
5597 
5598   my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);
5599 
5600   // clear the initialized flag and unlock
5601   rdb_get_hton_init_state()->set_initialized(false);
5602 
5603   DBUG_RETURN(error);
5604 }
5605 
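// Direction-aware iterator helpers: forward scans use Seek()/Next(), while
// reverse scans use SeekForPrev()/Prev().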
5606 static inline void rocksdb_smart_seek(bool seek_backward,
5607                                       rocksdb::Iterator *const iter,
5608                                       const rocksdb::Slice &key_slice) {
5609   if (seek_backward) {
5610     iter->SeekForPrev(key_slice);
5611   } else {
5612     iter->Seek(key_slice);
5613   }
5614 }
5615 
5616 static inline void rocksdb_smart_next(bool seek_backward,
5617                                       rocksdb::Iterator *const iter) {
5618   if (seek_backward) {
5619     iter->Prev();
5620   } else {
5621     iter->Next();
5622   }
5623 }
5624 
5625 // If the iterator is not valid it might be because of EOF, but it might also
5626 // be due to an IOError or corruption. Good practice is to always check it.
5627 // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
5628 static inline bool is_valid(rocksdb::Iterator *scan_it) {
5629   if (scan_it->Valid()) {
5630     return true;
5631   } else {
5632     rocksdb::Status s = scan_it->status();
5633     DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
5634                     dbug_change_status_to_corrupted(&s););
5635     if (s.IsIOError() || s.IsCorruption()) {
5636       if (s.IsCorruption()) {
5637         rdb_persist_corruption_marker();
5638       }
5639       rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
5640     }
5641     return false;
5642   }
5643 }
5644 
5645 /**
5646   @brief
5647   Simple lock controls. The "table_handler" this function returns (creating
5648   it on first use) is shared by every ha_rocksdb instance that opens the same
5649   table; it holds the THR_LOCK structure and per-table I/O statistics needed
5650   for table locking.
5651 */
5652 
5653 Rdb_table_handler *Rdb_open_tables_map::get_table_handler(
5654     const char *const table_name) {
5655   assert(table_name != nullptr);
5656 
5657   Rdb_table_handler *table_handler;
5658 
5659   const std::string table_name_str(table_name);
5660 
5661   // First, look up the table in the hash map.
5662   RDB_MUTEX_LOCK_CHECK(m_mutex);
5663   const auto &it = m_table_map.find(table_name_str);
5664   if (it != m_table_map.end()) {
5665     // Found it
5666     table_handler = it->second;
5667   } else {
5668     char *tmp_name;
5669 
5670     // Since we did not find it in the hash map, attempt to create and add it
5671     // to the hash map.
5672 #ifdef HAVE_PSI_INTERFACE
5673     if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(
5674               my_multi_malloc(rdb_handler_memory_key, MYF(MY_WME | MY_ZEROFILL),
5675                               &table_handler, sizeof(*table_handler), &tmp_name,
5676                               table_name_str.length() + 1, NullS)))) {
5677 #else
5678     if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(
5679               my_multi_malloc(PSI_NOT_INSTRUMENTED, MYF(MY_WME | MY_ZEROFILL),
5680                               &table_handler, sizeof(*table_handler), &tmp_name,
5681                               table_name_str.length() + 1, NullS)))) {
5682 #endif
5683       // Allocating a new Rdb_table_handler and a new table name failed.
5684       RDB_MUTEX_UNLOCK_CHECK(m_mutex);
5685       return nullptr;
5686     }
5687 
5688     table_handler->m_ref_count = 0;
5689     table_handler->m_table_name_length = table_name_str.length();
5690     table_handler->m_table_name = tmp_name;
5691     my_stpmov(table_handler->m_table_name, table_name_str.c_str());
5692 
5693     m_table_map.emplace(table_name_str, table_handler);
5694 
5695     thr_lock_init(&table_handler->m_thr_lock);
5696     table_handler->m_io_perf_read.init();
5697   }
5698   assert(table_handler->m_ref_count >= 0);
5699   table_handler->m_ref_count++;
5700 
5701   RDB_MUTEX_UNLOCK_CHECK(m_mutex);
5702 
5703   return table_handler;
5704 }
5705 
5706 std::vector<std::string> rdb_get_open_table_names(void) {
5707   return rdb_open_tables.get_table_names();
5708 }
5709 
5710 std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
5711   const Rdb_table_handler *table_handler;
5712   std::vector<std::string> names;
5713 
5714   RDB_MUTEX_LOCK_CHECK(m_mutex);
5715   for (const auto &kv : m_table_map) {
5716     table_handler = kv.second;
5717     assert(table_handler != nullptr);
5718     names.push_back(table_handler->m_table_name);
5719   }
5720   RDB_MUTEX_UNLOCK_CHECK(m_mutex);
5721 
5722   return names;
5723 }
5724 
5725 /*
5726   Inspired by innobase_get_int_col_max_value from InnoDB. This returns the
5727   maximum value a type can take on.
5728 */
5729 static ulonglong rdb_get_int_col_max_value(const Field *field) {
5730   ulonglong max_value = 0;
5731   switch (field->key_type()) {
5732     case HA_KEYTYPE_BINARY:
5733       max_value = 0xFFULL;
5734       break;
5735     case HA_KEYTYPE_INT8:
5736       max_value = 0x7FULL;
5737       break;
5738     case HA_KEYTYPE_USHORT_INT:
5739       max_value = 0xFFFFULL;
5740       break;
5741     case HA_KEYTYPE_SHORT_INT:
5742       max_value = 0x7FFFULL;
5743       break;
5744     case HA_KEYTYPE_UINT24:
5745       max_value = 0xFFFFFFULL;
5746       break;
5747     case HA_KEYTYPE_INT24:
5748       max_value = 0x7FFFFFULL;
5749       break;
5750     case HA_KEYTYPE_ULONG_INT:
5751       max_value = 0xFFFFFFFFULL;
5752       break;
5753     case HA_KEYTYPE_LONG_INT:
5754       max_value = 0x7FFFFFFFULL;
5755       break;
5756     case HA_KEYTYPE_ULONGLONG:
5757       max_value = 0xFFFFFFFFFFFFFFFFULL;
5758       break;
5759     case HA_KEYTYPE_LONGLONG:
5760       max_value = 0x7FFFFFFFFFFFFFFFULL;
5761       break;
5762     case HA_KEYTYPE_FLOAT:
5763       max_value = 0x1000000ULL;
5764       break;
5765     case HA_KEYTYPE_DOUBLE:
5766       max_value = 0x20000000000000ULL;
5767       break;
5768     default:
5769       abort();
5770   }
5771 
5772   return max_value;
5773 }
5774 
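/*
  Load the initial auto_increment value for the table: prefer the value
  persisted in the data dictionary and fall back to scanning the index for the
  current maximum.
*/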
5775 void ha_rocksdb::load_auto_incr_value() {
5776   ulonglong auto_incr = 0;
5777   bool validate_last = false, use_datadic = true;
5778 #if !defined(NDEBUG)
5779   DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;);
5780   validate_last = true;
5781 #endif  // !defined(NDEBUG)
5782 
5783   if (use_datadic && dict_manager.get_auto_incr_val(
5784                          m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
5785     update_auto_incr_val(auto_incr);
5786   }
5787 
5788   // If we find nothing in the data dictionary, or if we are in debug mode,
5789   // then call index_last to get the last value.
5790   //
5791   // This is needed when upgrading from a server that did not support
5792   // persistent auto_increment, or if the table is empty.
5793   //
5794   // For debug mode, we are just verifying that the data dictionary value is
5795   // greater than or equal to the maximum value in the table.
5796   if (auto_incr == 0 || validate_last) {
5797     auto_incr = load_auto_incr_value_from_index();
5798     update_auto_incr_val(auto_incr);
5799   }
5800 
5801   // If we failed to find anything from the data dictionary and index, then
5802   // initialize auto_increment to 1.
5803   if (m_tbl_def->m_auto_incr_val == 0) {
5804     update_auto_incr_val(1);
5805   }
5806 }
5807 
5808 ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
5809   const int save_active_index = active_index;
5810   active_index = table->s->next_number_index;
5811   const uint8 save_table_status = table->status;
5812   ulonglong last_val = 0;
5813 
5814   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
5815   const bool is_new_snapshot = !tx->has_snapshot();
5816   if (is_new_snapshot) {
5817     tx->acquire_snapshot(true);
5818   }
5819 
5820   // Do a lookup. We only need index column, so it should be index-only.
5821   // (another reason to make it index-only is that table->read_set is not set
5822   // appropriately and non-index-only lookup will not read the value)
5823   const bool save_keyread_only = m_keyread_only;
5824   m_keyread_only = true;
5825   m_converter->set_is_key_requested(true);
5826 
5827   if (!index_last(table->record[0])) {
5828     Field *field =
5829         table->key_info[table->s->next_number_index].key_part[0].field;
5830     ulonglong max_val = rdb_get_int_col_max_value(field);
5831     my_bitmap_map *const old_map =
5832         dbug_tmp_use_all_columns(table, table->read_set);
5833     last_val = field->val_int();
5834     if (last_val != max_val) {
5835       last_val++;
5836     }
5837 #ifndef NDEBUG
5838     ulonglong dd_val;
5839     if (last_val <= max_val) {
5840       const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
5841       if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
5842           tx->get_auto_incr(gl_index_id) == 0) {
5843         assert(dd_val >= last_val);
5844       }
5845     }
5846 #endif
5847     dbug_tmp_restore_column_map(table->read_set, old_map);
5848   }
5849 
5850   m_keyread_only = save_keyread_only;
5851   if (is_new_snapshot) {
5852     tx->release_snapshot();
5853   }
5854 
5855   table->status = save_table_status;
5856   active_index = save_active_index;
5857 
5858   /*
5859     Do what ha_rocksdb::index_end() does.
5860     (Why don't we use index_init/index_end? class handler defines index_init
5861     as private, for some reason).
5862     */
5863   release_scan_iterator();
5864 
5865   return last_val;
5866 }
5867 
5868 void ha_rocksdb::update_auto_incr_val(ulonglong val) {
5869   ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
5870   while (
5871       auto_incr_val < val &&
5872       !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
5873     // Do nothing - just loop until auto_incr_val is >= val or we successfully
5874     // set it
5875   }
5876 }
5877 
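// Read the auto_increment column from the current row and, if needed, raise
// both the transaction-local counter and the in-memory value in m_tbl_def.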
5878 void ha_rocksdb::update_auto_incr_val_from_field() {
5879   Field *field;
5880   ulonglong new_val, max_val;
5881   field = table->key_info[table->s->next_number_index].key_part[0].field;
5882   max_val = rdb_get_int_col_max_value(field);
5883 
5884   my_bitmap_map *const old_map =
5885       dbug_tmp_use_all_columns(table, table->read_set);
5886   new_val = field->val_int();
5887   // don't increment if we would wrap around
5888   if (new_val != max_val) {
5889     new_val++;
5890   }
5891 
5892   dbug_tmp_restore_column_map(table->read_set, old_map);
5893 
5894   // Only update if positive value was set for auto_incr column.
5895   if (new_val <= max_val) {
5896     Rdb_transaction *const tx = get_or_create_tx(table->in_use);
5897     tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val);
5898 
5899     // Update the in memory auto_incr value in m_tbl_def.
5900     update_auto_incr_val(new_val);
5901   }
5902 }
5903 
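/*
  Seed m_tbl_def->m_hidden_pk_val from the largest hidden PK currently stored
  in the table (plus one), so newly assigned hidden keys stay unique.
*/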
5904 int ha_rocksdb::load_hidden_pk_value() {
5905   const int save_active_index = active_index;
5906   active_index = m_tbl_def->m_key_count - 1;
5907   const uint8 save_table_status = table->status;
5908 
5909   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
5910   const bool is_new_snapshot = !tx->has_snapshot();
5911 
5912   longlong hidden_pk_id = 1;
5913   // Do a lookup.
5914   if (!index_last(table->record[0])) {
5915     /*
5916       Decode PK field from the key
5917     */
5918     auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
5919     if (err) {
5920       if (is_new_snapshot) {
5921         tx->release_snapshot();
5922       }
5923       return err;
5924     }
5925 
5926     hidden_pk_id++;
5927   }
5928 
5929   longlong old = m_tbl_def->m_hidden_pk_val;
5930   while (old < hidden_pk_id &&
5931          !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
5932   }
5933 
5934   if (is_new_snapshot) {
5935     tx->release_snapshot();
5936   }
5937 
5938   table->status = save_table_status;
5939   active_index = save_active_index;
5940 
5941   release_scan_iterator();
5942 
5943   return HA_EXIT_SUCCESS;
5944 }
5945 
5946 /* Allocate the next hidden PK value from m_tbl_def->m_hidden_pk_val. */
5947 longlong ha_rocksdb::update_hidden_pk_val() {
5948   assert(has_hidden_pk(table));
5949   const longlong new_val = m_tbl_def->m_hidden_pk_val++;
5950   return new_val;
5951 }
5952 
5953 /* Read the hidden pk id from m_last_rowkey */
5954 int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
5955   assert(hidden_pk_id != nullptr);
5956   assert(table != nullptr);
5957   assert(has_hidden_pk(table));
5958 
5959   rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
5960 
5961   // Get hidden primary key from old key slice
5962   Rdb_string_reader reader(&rowkey_slice);
5963   if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) {
5964     return HA_ERR_ROCKSDB_CORRUPT_DATA;
5965   }
5966 
5967   const int length = Field_longlong::PACK_LENGTH;
5968   const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
5969   if (from == nullptr) {
5970     /* Mem-comparable image doesn't have enough bytes */
5971     return HA_ERR_ROCKSDB_CORRUPT_DATA;
5972   }
5973 
5974   *hidden_pk_id = rdb_netbuf_read_uint64(&from);
5975   return HA_EXIT_SUCCESS;
5976 }
5977 
5978 /**
5979   @brief
5980   Free lock controls. We call this whenever we close a table. If this was the
5981   last reference to the table_handler, we free the memory associated with it.
5983 */
5984 
5985 void Rdb_open_tables_map::release_table_handler(
5986     Rdb_table_handler *const table_handler) {
5987   RDB_MUTEX_LOCK_CHECK(m_mutex);
5988 
5989   assert(table_handler != nullptr);
5990   assert(table_handler->m_ref_count > 0);
5991   if (!--table_handler->m_ref_count) {
5992     const auto ret MY_ATTRIBUTE((__unused__)) =
5993         m_table_map.erase(std::string(table_handler->m_table_name));
5994     assert(ret == 1);  // the hash entry must actually be found and deleted
5995     my_core::thr_lock_delete(&table_handler->m_thr_lock);
5996     my_free(table_handler);
5997   }
5998 
5999   RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6000 }
6001 
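// Handler factory: return a partition-aware ha_rockspart when native
// partitioning is enabled and the table is partitioned, otherwise a plain
// ha_rocksdb.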
6002 static handler *rocksdb_create_handler(my_core::handlerton *const hton,
6003                                        my_core::TABLE_SHARE *const table_arg,
6004                                        my_core::MEM_ROOT *const mem_root) {
6005   if (rocksdb_enable_native_partition && table_arg &&
6006       table_arg->db_type() == rocksdb_hton && table_arg->partition_info_str &&
6007       table_arg->partition_info_str_len) {
6008     ha_rockspart *file = new (mem_root) ha_rockspart(hton, table_arg);
6009     if (file && file->init_partitioning(mem_root)) {
6010       delete file;
6011       return nullptr;
6012     }
6013     return (file);
6014   }
6015 
6016   return new (mem_root) ha_rocksdb(hton, table_arg);
6017 }
6018 
6019 ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
6020                        my_core::TABLE_SHARE *const table_arg)
6021     : handler(hton, table_arg),
6022       m_table_handler(nullptr),
6023       m_scan_it(nullptr),
6024       m_scan_it_skips_bloom(false),
6025       m_scan_it_snapshot(nullptr),
6026       m_scan_it_lower_bound(nullptr),
6027       m_scan_it_upper_bound(nullptr),
6028       m_tbl_def(nullptr),
6029       m_pk_descr(nullptr),
6030       m_key_descr_arr(nullptr),
6031       m_pk_can_be_decoded(true),
6032       m_pk_tuple(nullptr),
6033       m_pk_packed_tuple(nullptr),
6034       m_sk_packed_tuple(nullptr),
6035       m_end_key_packed_tuple(nullptr),
6036       m_sk_match_prefix(nullptr),
6037       m_sk_match_prefix_buf(nullptr),
6038       m_sk_packed_tuple_old(nullptr),
6039       m_dup_sk_packed_tuple(nullptr),
6040       m_dup_sk_packed_tuple_old(nullptr),
6041       m_pack_buffer(nullptr),
6042       m_lock_rows(RDB_LOCK_NONE),
6043       m_keyread_only(false),
6044       m_insert_with_update(false),
6045       m_dup_key_found(false),
6046 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
6047       m_in_rpl_delete_rows(false),
6048       m_in_rpl_update_rows(false),
6049 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
6050       m_need_build_decoder(false) {
6051 }
6052 
6053 ha_rocksdb::~ha_rocksdb() {
6054   int err MY_ATTRIBUTE((__unused__));
6055   err = finalize_bulk_load(false);
6056   if (err != 0) {
6057     // NO_LINT_DEBUG
6058     sql_print_error(
6059         "RocksDB: Error %d finalizing bulk load while closing "
6060         "handler.",
6061         err);
6062   }
6063 }
6064 
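// MyRocks keeps all data inside RocksDB, so there are no per-table file
// extensions to report.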
6065 static const char *ha_rocksdb_exts[] = {NullS};
6066 
6067 const char **ha_rocksdb::bas_ext() const {
6068   DBUG_ENTER_FUNC();
6069 
6070   DBUG_RETURN(ha_rocksdb_exts);
6071 }
6072 
6073 const std::string &ha_rocksdb::get_table_basename() const {
6074   return m_tbl_def->base_tablename();
6075 }
6076 
6077 /**
6078   @return
6079     false  OK
6080     true   Error unpacking the data
6081 */
6082 bool ha_rocksdb::init_with_fields() {
6083   DBUG_ENTER_FUNC();
6084 
6085   const uint pk = table_share->primary_key;
6086   if (pk != MAX_KEY) {
6087     const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
6088     check_keyread_allowed(m_pk_can_be_decoded, table_share, pk /*PK*/,
6089                           key_parts - 1, true);
6090   } else {
6091     m_pk_can_be_decoded = false;
6092   }
6093   cached_table_flags = table_flags();
6094 
6095   DBUG_RETURN(false); /* Ok */
6096 }
6097 
6098 bool ha_rocksdb::rpl_can_handle_stm_event() const {
6099   return !(rpl_skip_tx_api_var && !super_read_only);
6100 }
6101 
6102 /*
6103   If the key is a TTL key, we may need to filter it out.
6104 
6105   The purpose of read filtering for tables with TTL is to ensure that
6106   during a transaction a key which has expired already but not removed by
6107   compaction yet is not returned to the user.
6108 
6109   Without this the user might be hit with problems such as disappearing
6110   rows within a transaction, etc, because the compaction filter ignores
6111   snapshots when filtering keys.
6112 */
6113 bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
6114                                      const rocksdb::Slice &ttl_rec_val,
6115                                      const int64_t curr_ts) {
6116   assert(kd.has_ttl());
6117   assert(kd.m_ttl_rec_offset != UINT_MAX);
6118 
6119   /*
6120     Curr_ts can only be 0 if there are no snapshots open.
6121     should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
6122     we are filtering on the write path (single INSERT/UPDATE) in which case
6123     we are passed in the current time as curr_ts.
6124 
6125     In the event curr_ts is 0, we always decide not to filter the record and
6126     we increment a diagnostic counter (ROWS_HIDDEN_NO_SNAPSHOT).
6127   */
6128   if (curr_ts == 0) {
6129     update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
6130     return false;
6131   }
6132 
6133   if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
6134     return false;
6135   }
6136 
6137   Rdb_string_reader reader(&ttl_rec_val);
6138 
6139   /*
6140     Find where the 8-byte ttl is for each record in this index.
6141   */
6142   uint64 ts;
6143   if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
6144     /*
6145       This condition should never be reached since all TTL records have an
6146       8 byte ttl field in front. Don't filter the record out, and log an error.
6147     */
6148     std::string buf;
6149     buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
6150                       RDB_MAX_HEXDUMP_LEN);
6151     const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
6152     // NO_LINT_DEBUG
6153     sql_print_error(
6154         "Decoding ttl from PK value failed, "
6155         "for index (%u,%u), val: %s",
6156         gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
6157     assert(0);
6158     return false;
6159   }
6160 
6161   /* Hide record if it has expired before the current snapshot time. */
6162   uint64 read_filter_ts = 0;
6163 #if !defined(NDEBUG)
6164   read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
6165 #endif  // !defined(NDEBUG)
6166   bool is_hide_ttl =
6167       ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
6168   if (is_hide_ttl) {
6169     update_row_stats(ROWS_FILTERED);
6170 
6171     /* increment examined row count when rows are skipped */
6172     THD *thd = ha_thd();
6173     thd->inc_examined_row_count(1);
6174     DEBUG_SYNC(thd, "rocksdb.ttl_rows_examined");
6175   }
6176   return is_hide_ttl;
6177 }
6178 
6179 int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
6180                                              rocksdb::Iterator *const iter,
6181                                              bool seek_backward) {
6182   if (kd.has_ttl()) {
6183     THD *thd = ha_thd();
6184     while (iter->Valid() &&
6185            should_hide_ttl_rec(
6186                kd, iter->value(),
6187                get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
6188       DEBUG_SYNC(thd, "rocksdb.check_flags_ser");
6189       if (thd && thd->killed) {
6190         return HA_ERR_QUERY_INTERRUPTED;
6191       }
6192       rocksdb_smart_next(seek_backward, iter);
6193     }
6194   }
6195   return HA_EXIT_SUCCESS;
6196 }
6197 
6198 #ifndef NDEBUG
6199 void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
6200   std::string str(on_disk_rec->data(), on_disk_rec->size());
6201   on_disk_rec->Reset();
6202   str.append("abc");
6203   on_disk_rec->PinSelf(rocksdb::Slice(str));
6204 }
6205 
6206 void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) {
6207   on_disk_rec->remove_suffix(on_disk_rec->size());
6208 }
6209 
6210 void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
6211   std::string res;
6212   // The record is NULL-byte followed by VARCHAR(10).
6213   // Put the NULL-byte
6214   res.append("\0", 1);
6215   // Then, add a valid VARCHAR(12) value.
6216   res.append("\xC", 1);
6217   res.append("123456789ab", 12);
6218 
6219   on_disk_rec->Reset();
6220   on_disk_rec->PinSelf(rocksdb::Slice(res));
6221 }
6222 
6223 void dbug_create_err_inplace_alter() {
6224   my_printf_error(ER_UNKNOWN_ERROR,
6225                   "Intentional failure in inplace alter occurred.", MYF(0));
6226 }
6227 #endif  // !defined(NDEBUG)
6228 
6229 int ha_rocksdb::convert_record_from_storage_format(
6230     const rocksdb::Slice *const key, uchar *const buf) {
6231   DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
6232                   dbug_append_garbage_at_end(&m_retrieved_record););
6233   DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
6234                   dbug_truncate_record(&m_retrieved_record););
6235   DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
6236                   dbug_modify_rec_varchar12(&m_retrieved_record););
6237 
6238   return convert_record_from_storage_format(key, &m_retrieved_record, buf);
6239 }
6240 
6241 /*
6242   @brief
6243   Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
6244   storage format into buf (which can be table->record[0] or table->record[1]).
6245 
6246   @param  key   Table record's key in mem-comparable form.
6247   @param  buf   Store record in table->record[0] format here
6248 
6249   @detail
6250     If the table has blobs, the unpacked data in buf may keep pointers to the
6251     data in this->m_retrieved_record.
6252 
6253     The key is only needed to check its checksum value (the checksum is in
6254     m_retrieved_record).
6255 
6256   @seealso
6257     rdb_converter::setup_read_decoders()  Sets up data structures which tell
6258   which columns to decode.
6259 
6260   @return
6261     0      OK
6262     other  Error unpacking the data
6263 */
6264 
6265 int ha_rocksdb::convert_record_from_storage_format(
6266     const rocksdb::Slice *const key, const rocksdb::Slice *const value,
6267     uchar *const buf) {
6268   assert(key != nullptr);
6269   assert(buf != nullptr);
6270 
6271   return m_converter->decode(m_pk_descr, buf, key, value);
6272 }
6273 
6274 int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
6275                                   const Rdb_tbl_def *const tbl_def_arg,
6276                                   bool alloc_alter_buffers) {
6277   DBUG_ENTER_FUNC();
6278 
6279   assert(m_pk_tuple == nullptr);
6280   assert(tbl_def_arg != nullptr);
6281 
6282   std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;
6283 
6284   uint key_len = 0;
6285   uint max_packed_sk_len = 0;
6286   uint pack_key_len = 0;
6287 
6288   m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
6289   if (has_hidden_pk(table_arg)) {
6290     m_pk_key_parts = 1;
6291   } else {
6292     m_pk_key_parts =
6293         table->key_info[table->s->primary_key].user_defined_key_parts;
6294     key_len = table->key_info[table->s->primary_key].key_length;
6295   }
6296 
6297   // move this into get_table_handler() ??
6298   m_pk_descr->setup(table_arg, tbl_def_arg);
6299 
6300 #ifdef HAVE_PSI_INTERFACE
6301   m_pk_tuple =
6302       static_cast<uchar *>(my_malloc(rdb_handler_memory_key, key_len, MYF(0)));
6303 #else
6304   m_pk_tuple =
6305       static_cast<uchar *>(my_malloc(PSI_NOT_INSTRUMENTED, key_len, MYF(0)));
6306 #endif
6307   if (m_pk_tuple == nullptr) {
6308     goto error;
6309   }
6310 
6311   pack_key_len = m_pk_descr->max_storage_fmt_length();
6312 #ifdef HAVE_PSI_INTERFACE
6313   m_pk_packed_tuple = static_cast<uchar *>(
6314       my_malloc(rdb_handler_memory_key, pack_key_len, MYF(0)));
6315 #else
6316   m_pk_packed_tuple = static_cast<uchar *>(
6317       my_malloc(PSI_NOT_INSTRUMENTED, pack_key_len, MYF(0)));
6318 #endif
6319   if (m_pk_packed_tuple == nullptr) {
6320     goto error;
6321   }
6322 
6323   /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
6324   max_packed_sk_len = pack_key_len;
6325   for (uint i = 0; i < table_arg->s->keys; i++) {
6326     /* Primary key was processed above */
6327     if (i == table_arg->s->primary_key) continue;
6328 
6329     // TODO: move this into get_table_handler() ??
6330     kd_arr[i]->setup(table_arg, tbl_def_arg);
6331 
6332     const uint packed_len = kd_arr[i]->max_storage_fmt_length();
6333     if (packed_len > max_packed_sk_len) {
6334       max_packed_sk_len = packed_len;
6335     }
6336   }
6337 
6338 #ifdef HAVE_PSI_INTERFACE
6339   if (!(m_sk_packed_tuple = static_cast<uchar *>(
6340             my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6341       !(m_sk_match_prefix_buf = static_cast<uchar *>(
6342             my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6343       !(m_sk_packed_tuple_old = static_cast<uchar *>(
6344             my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6345       !(m_end_key_packed_tuple = static_cast<uchar *>(
6346             my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6347       !(m_pack_buffer = static_cast<uchar *>(
6348             my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6349       !(m_scan_it_lower_bound = static_cast<uchar *>(
6350             my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6351       !(m_scan_it_upper_bound = static_cast<uchar *>(
6352             my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0))))) {
6353 #else
6354   if (!(m_sk_packed_tuple = static_cast<uchar *>(
6355             my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6356       !(m_sk_match_prefix_buf = static_cast<uchar *>(
6357             my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6358       !(m_sk_packed_tuple_old = static_cast<uchar *>(
6359             my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6360       !(m_end_key_packed_tuple = static_cast<uchar *>(
6361             my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6362       !(m_pack_buffer = static_cast<uchar *>(
6363             my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6364       !(m_scan_it_lower_bound = static_cast<uchar *>(
6365             my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6366       !(m_scan_it_upper_bound = static_cast<uchar *>(
6367             my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0))))) {
6368 #endif
6369     goto error;
6370   }
6371 
6372     /*
6373       If inplace alter is happening, allocate special buffers for unique
6374       secondary index duplicate checking.
6375     */
6376 #ifdef HAVE_PSI_INTERFACE
6377   if (alloc_alter_buffers &&
6378       (!(m_dup_sk_packed_tuple = static_cast<uchar *>(
6379              my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6380        !(m_dup_sk_packed_tuple_old = static_cast<uchar *>(
6381              my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))))) {
6382 #else
6383   if (alloc_alter_buffers &&
6384       (!(m_dup_sk_packed_tuple = static_cast<uchar *>(
6385              my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6386        !(m_dup_sk_packed_tuple_old = static_cast<uchar *>(
6387              my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))))) {
6388 #endif
6389     goto error;
6390   }
6391 
6392   DBUG_RETURN(HA_EXIT_SUCCESS);
6393 
6394 error:
6395   // If we're here then this means that at some point above an allocation may
6396   // have failed. To avoid any resource leaks and maintain a clear contract
6397   // we'll clean up before returning the error code.
6398   free_key_buffers();
6399 
6400   DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
6401 }
6402 
6403 void ha_rocksdb::free_key_buffers() {
6404   my_free(m_pk_tuple);
6405   m_pk_tuple = nullptr;
6406 
6407   my_free(m_pk_packed_tuple);
6408   m_pk_packed_tuple = nullptr;
6409 
6410   my_free(m_sk_packed_tuple);
6411   m_sk_packed_tuple = nullptr;
6412 
6413   my_free(m_sk_match_prefix_buf);
6414   m_sk_match_prefix_buf = nullptr;
6415 
6416   my_free(m_sk_packed_tuple_old);
6417   m_sk_packed_tuple_old = nullptr;
6418 
6419   my_free(m_end_key_packed_tuple);
6420   m_end_key_packed_tuple = nullptr;
6421 
6422   my_free(m_pack_buffer);
6423   m_pack_buffer = nullptr;
6424 
6425   my_free(m_dup_sk_packed_tuple);
6426   m_dup_sk_packed_tuple = nullptr;
6427 
6428   my_free(m_dup_sk_packed_tuple_old);
6429   m_dup_sk_packed_tuple_old = nullptr;
6430 
6431   my_free(m_scan_it_lower_bound);
6432   m_scan_it_lower_bound = nullptr;
6433 
6434   my_free(m_scan_it_upper_bound);
6435   m_scan_it_upper_bound = nullptr;
6436 }
6437 
6438 /**
6439   @return
6440     HA_EXIT_SUCCESS  OK
6441     other            HA_ERR error code (can be SE-specific)
6442 */
6443 int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
6444   DBUG_ENTER_FUNC();
6445 
6446   int err = close();
6447   if (err) {
6448     DBUG_RETURN(err);
6449   }
6450 
6451   m_table_handler = rdb_open_tables.get_table_handler(name);
6452 
6453   if (m_table_handler == nullptr) {
6454     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6455   }
6456 
6457   my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
6458                               nullptr);
6459   m_io_perf.init(&m_table_handler->m_table_perf_context,
6460                  &m_table_handler->m_io_perf_read, &stats);
6461 
6462   Rdb_perf_context_guard guard(&m_io_perf,
6463                                rocksdb_perf_context_level(ha_thd()));
6464 
6465   std::string fullname;
6466   err = rdb_normalize_tablename(name, &fullname);
6467   if (err != HA_EXIT_SUCCESS) {
6468     DBUG_RETURN(err);
6469   }
6470 
6471   m_tbl_def = ddl_manager.find(fullname);
6472   if (m_tbl_def == nullptr) {
6473     my_error(ER_INTERNAL_ERROR, MYF(0),
6474              "Attempt to open a table that is not present in RocksDB-SE data "
6475              "dictionary");
6476     DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6477   }
6478 
6479   m_lock_rows = RDB_LOCK_NONE;
6480   m_key_descr_arr = m_tbl_def->m_key_descr_arr;
6481 
6482   /*
6483     Full table scan actually uses primary key
6484     (UPDATE needs to know this, otherwise it will go into infinite loop on
6485     queries like "UPDATE tbl SET pk=pk+100")
6486   */
6487   key_used_on_scan = table->s->primary_key;
6488 
6489   // close() above has already called free_key_buffers(). No need to do it here.
6490   err = alloc_key_buffers(table, m_tbl_def);
6491 
6492   if (err) {
6493     DBUG_RETURN(err);
6494   }
6495 
6496   /*
6497     init_with_fields() is used to initialize table flags based on the field
6498     definitions in table->field[].
6499     It is called by open_binary_frm(), but that function calls the method for
6500     a temporary ha_rocksdb object which is later destroyed.
6501 
6502     If we are here in ::open(), then init_with_fields() has not been called
6503     for this object. Call it ourselves, we want all member variables to be
6504     properly initialized.
6505   */
6506   init_with_fields();
6507 
6508   /* Initialize decoder */
6509   m_converter.reset(new Rdb_converter(ha_thd(), m_tbl_def, table));
6510 
6511   /*
6512      Point m_ttl_bytes at the same buffer as Rdb_converter's m_ttl_bytes.
6513      Remove this code after moving convert_record_to_storage_format() into
6514      the Rdb_converter class.
6515   */
6516   m_ttl_bytes = m_converter->get_ttl_bytes_buffer();
6517 
6518   info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
6519 
6520   /*
6521     The following load_XXX code calls row decode functions, and they do
6522     that without having done ::external_lock() or index_init()/rnd_init().
6523     (Note: this also means we're doing a read when there was no
6524     rdb_converter::setup_field_encoders() call)
6525 
6526     Initialize the necessary variables for them:
6527   */
6528 
6529   /* Load auto_increment value only once on first use. */
6530   if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
6531     load_auto_incr_value();
6532   }
6533 
6534   /* Load hidden pk only once on first use. */
6535   if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
6536       (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
6537     free_key_buffers();
6538     DBUG_RETURN(err);
6539   }
6540 
6541   /* Index block size in MyRocks: used by MySQL in query optimization */
6542   stats.block_size = rocksdb_tbl_options->block_size;
6543 
6544   DBUG_RETURN(HA_EXIT_SUCCESS);
6545 }
6546 
6547 int ha_rocksdb::close(void) {
6548   DBUG_ENTER_FUNC();
6549 
6550   m_pk_descr = nullptr;
6551   m_key_descr_arr = nullptr;
6552   m_converter = nullptr;
6553   free_key_buffers();
6554 
6555   if (m_table_handler != nullptr) {
6556     rdb_open_tables.release_table_handler(m_table_handler);
6557     m_table_handler = nullptr;
6558   }
6559 
6560   // These are needed to suppress valgrind errors in rocksdb.partition
6561   m_last_rowkey.mem_free();
6562   m_sk_tails.free();
6563   m_sk_tails_old.free();
6564   m_pk_unpack_info.free();
6565 
6566   DBUG_RETURN(HA_EXIT_SUCCESS);
6567 }
6568 
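/*
  Human-readable messages for the MyRocks-specific handler error codes,
  listed in the same order as the HA_ERR_ROCKSDB_* codes they describe
  (the static_assert below verifies that the counts match).
*/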
6569 static const char *rdb_error_messages[] = {
6570     "Table must have a PRIMARY KEY.",
6571     "Specifying DATA DIRECTORY for an individual table is not supported.",
6572     "Specifying INDEX DIRECTORY for an individual table is not supported.",
6573     "RocksDB commit failed.",
6574     "Failure during bulk load operation.",
6575     "Found data corruption.",
6576     "CRC checksum mismatch.",
6577     "Invalid table.",
6578     "Could not access RocksDB properties.",
6579     "File I/O error during merge/sort operation.",
6580     "RocksDB status: not found.",
6581     "RocksDB status: corruption.",
6582     "RocksDB status: not supported.",
6583     "RocksDB status: invalid argument.",
6584     "RocksDB status: io error.",
6585     "RocksDB status: no space.",
6586     "RocksDB status: merge in progress.",
6587     "RocksDB status: incomplete.",
6588     "RocksDB status: shutdown in progress.",
6589     "RocksDB status: timed out.",
6590     "RocksDB status: aborted.",
6591     "RocksDB status: lock limit reached.",
6592     "RocksDB status: busy.",
6593     "RocksDB status: deadlock.",
6594     "RocksDB status: expired.",
6595     "RocksDB status: try again.",
6596 };
6597 
6598 static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
6599                   ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
6600               "Number of error messages doesn't match number of error codes");
6601 
6602 static const char *rdb_get_error_messages(int error) {
6603   if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
6604     return rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST];
6605   }
6606   return "";
6607 }
6608 
6609 bool ha_rocksdb::get_error_message(const int error, String *const buf) {
6610   DBUG_ENTER_FUNC();
6611 
6612   static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
6613                 "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
6614   static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
6615                 "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");
6616 
6617   assert(buf != nullptr);
6618 
6619   buf->append(rdb_get_error_messages(error));
6620 
6621   // We can be called with values that are < HA_ERR_FIRST because most
6622   // MySQL internal functions will just return HA_EXIT_FAILURE in case of
6623   // an error.
6624 
6625   DBUG_RETURN(false);
6626 }
6627 
6628 /*
6629   Generalized way to convert RocksDB status errors into a MySQL error code
6630   and print an error message.
6631 
6632   Each error code below maps to a RocksDB status code found in:
6633   rocksdb/include/rocksdb/status.h
6634 */
6635 int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s,
6636                                    const char *opt_msg) {
6637   assert(!s.ok());
6638 
6639   int err;
6640   switch (s.code()) {
6641     case rocksdb::Status::Code::kOk:
6642       err = HA_EXIT_SUCCESS;
6643       break;
6644     case rocksdb::Status::Code::kNotFound:
6645       err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND;
6646       break;
6647     case rocksdb::Status::Code::kCorruption:
6648       err = HA_ERR_ROCKSDB_STATUS_CORRUPTION;
6649       break;
6650     case rocksdb::Status::Code::kNotSupported:
6651       err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED;
6652       break;
6653     case rocksdb::Status::Code::kInvalidArgument:
6654       err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT;
6655       break;
6656     case rocksdb::Status::Code::kIOError:
6657       err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE
6658                             : HA_ERR_ROCKSDB_STATUS_IO_ERROR;
6659       break;
6660     case rocksdb::Status::Code::kMergeInProgress:
6661       err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS;
6662       break;
6663     case rocksdb::Status::Code::kIncomplete:
6664       err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE;
6665       break;
6666     case rocksdb::Status::Code::kShutdownInProgress:
6667       err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS;
6668       break;
6669     case rocksdb::Status::Code::kTimedOut:
6670       err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT;
6671       break;
6672     case rocksdb::Status::Code::kAborted:
6673       err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT
6674                               : HA_ERR_ROCKSDB_STATUS_ABORTED;
6675       break;
6676     case rocksdb::Status::Code::kBusy:
6677       err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK
6678                              : HA_ERR_ROCKSDB_STATUS_BUSY;
6679       break;
6680     case rocksdb::Status::Code::kExpired:
6681       err = HA_ERR_ROCKSDB_STATUS_EXPIRED;
6682       break;
6683     case rocksdb::Status::Code::kTryAgain:
6684       err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN;
6685       break;
6686     default:
6687       assert(0);
6688       return -1;
6689   }
6690 
6691   std::string errMsg;
6692   if (s.IsLockLimit()) {
6693     errMsg =
6694         "Operation aborted: Failed to acquire lock due to "
6695         "rocksdb_max_row_locks limit";
6696   } else {
6697     errMsg = s.ToString();
6698   }
6699 
6700   if (opt_msg) {
6701     std::string concatenated_error = errMsg + " (" + std::string(opt_msg) + ")";
6702     my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(),
6703              rocksdb_hton_name);
6704   } else {
6705     my_error(ER_GET_ERRMSG, MYF(0), s.code(), errMsg.c_str(),
6706              rocksdb_hton_name);
6707   }
6708 
6709   return err;
6710 }
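
/*
  Minimal usage sketch (illustrative only): callers convert a failed RocksDB
  operation into a handler error code and return it up the stack, e.g.

    const rocksdb::Status s = some_rocksdb_operation();  // placeholder call
    if (!s.ok()) {
      return rdb_error_to_mysql(s, "while applying the write batch");
    }
*/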
6711 
6712 /* MyRocks supports only the following collations for indexed columns */
6713 static const std::set<const my_core::CHARSET_INFO *> RDB_INDEX_COLLATIONS = {
6714     &my_charset_bin, &my_charset_utf8_bin, &my_charset_latin1_bin};
6715 
6716 static bool rdb_is_index_collation_supported(
6717     const my_core::Field *const field) {
6718   const my_core::enum_field_types type = field->real_type();
6719   /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
6720   if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
6721       type == MYSQL_TYPE_BLOB || type == MYSQL_TYPE_JSON) {
6722     return (RDB_INDEX_COLLATIONS.find(field->charset()) !=
6723             RDB_INDEX_COLLATIONS.end()) ||
6724            rdb_is_collation_supported(field->charset());
6725   }
6726   return true;
6727 }
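
/*
  For example, an indexed VARCHAR column using utf8_bin or latin1_bin passes
  this check directly; a collation outside RDB_INDEX_COLLATIONS (such as
  utf8_general_ci) is accepted only if rdb_is_collation_supported() allows it,
  otherwise create_cfs() below raises the collation warning/error (unless the
  table matches rdb_collation_exceptions).
*/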
6728 
6729 /*
6730   Create structures needed for storing data in rocksdb. This is called when the
6731   table is created. The structures will be shared by all TABLE* objects.
6732 
6733   @param
6734     table_arg        Table with definition
6735     tbl_def_arg      tbl_def whose key_descr is being created/populated
6736     old_table_arg    Old table definition (for use during inplace alter)
6737     old_tbl_def_arg  tbl_def from which existing keys are copied over
6738                      (for use during inplace alter)
6740 
6741   @return
6742     0      - Ok
6743     other  - error, either given table ddl is not supported by rocksdb or OOM.
6744 */
6745 int ha_rocksdb::create_key_defs(
6746     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
6747     const TABLE *const old_table_arg /* = nullptr */,
6748     const Rdb_tbl_def *const old_tbl_def_arg
6749     /* = nullptr */) const {
6750   DBUG_ENTER_FUNC();
6751 
6752   assert(table_arg != nullptr);
6753   assert(table_arg->s != nullptr);
6754 
6755   DBUG_EXECUTE_IF("rocksdb_truncate_failure", {
6756     my_error(ER_INTERNAL_ERROR, MYF(0), "Simulated truncation failure.");
6757     DBUG_RETURN(HA_EXIT_FAILURE);
6758   });
6759 
6760   DBUG_EXECUTE_IF("rocksdb_truncate_failure_crash", DBUG_SUICIDE(););
6761 
6762   /*
6763     These need to be one greater than MAX_INDEXES since the user can create
6764     MAX_INDEXES secondary keys and no primary key which would cause us
6765     to generate a hidden one.
6766   */
6767   std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;
6768 
6769   /*
6770     NOTE: All new column families must be created before new index numbers are
6771     allocated to each key definition. See below for more details.
6772     http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
6773   */
6774   if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
6775     DBUG_RETURN(HA_EXIT_FAILURE);
6776   }
6777 
6778   uint64 ttl_duration = 0;
6779   std::string ttl_column;
6780   uint ttl_field_offset;
6781 
6782   uint err;
6783   if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg,
6784                                                &ttl_duration))) {
6785     DBUG_RETURN(err);
6786   }
6787 
6788   if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column,
6789                                           &ttl_field_offset))) {
6790     DBUG_RETURN(err);
6791   }
6792 
6793   /* We don't currently support TTL on tables with hidden primary keys. */
6794   if (ttl_duration > 0 && has_hidden_pk(table_arg)) {
6795     my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0));
6796     DBUG_RETURN(HA_EXIT_FAILURE);
6797   }
6798 
6799   /*
6800     If TTL duration is not specified but TTL column was specified, throw an
6801     error because TTL column requires duration.
6802   */
6803   if (ttl_duration == 0 && !ttl_column.empty()) {
6804     my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str());
6805     DBUG_RETURN(HA_EXIT_FAILURE);
6806   }
6807 
6808   if (!old_tbl_def_arg) {
6809     /*
6810       old_tbl_def doesn't exist. this means we are in the process of creating
6811       a new table.
6812 
6813       Get the index numbers (this will update the next_index_number)
6814       and create Rdb_key_def structures.
6815     */
6816     for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
6817       if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], cfs[i],
6818                          ttl_duration, ttl_column)) {
6819         DBUG_RETURN(HA_EXIT_FAILURE);
6820       }
6821     }
6822   } else {
6823     /*
6824       old_tbl_def exists.  This means we are creating a new tbl_def as part of
6825       in-place alter table.  Copy over existing keys from the old_tbl_def and
6826       generate the necessary new key definitions if any.
6827     */
6828     if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
6829                                 old_tbl_def_arg, cfs, ttl_duration,
6830                                 ttl_column)) {
6831       DBUG_RETURN(HA_EXIT_FAILURE);
6832     }
6833   }
6834 
6835   DBUG_RETURN(HA_EXIT_SUCCESS);
6836 }
6837 
6838 /*
6839   Checks index parameters and creates column families needed for storing data
6840   in rocksdb if necessary.
6841 
6842   @param in
6843     table_arg     Table with definition
6844     tbl_def_arg   Table def structure being populated
6846 
6847   @param out
6848     cfs           CF info for each key definition in 'key_info' order
6849 
6850   @return
6851     0      - Ok
6852     other  - error
6853 */
6854 int ha_rocksdb::create_cfs(
6855     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
6856     std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
6857   DBUG_ENTER_FUNC();
6858 
6859   assert(table_arg != nullptr);
6860   assert(table_arg->s != nullptr);
6861   assert(tbl_def_arg != nullptr);
6862 
6863   char tablename_sys[NAME_LEN + 1];
6864 
6865   my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
6866                                  tablename_sys, sizeof(tablename_sys));
6867 
6868   uint primary_key_index = pk_index(table_arg, tbl_def_arg);
6869   /*
6870     The loop below checks the index parameters and creates
6871     column families if necessary.
6872   */
6873   THD *const thd = my_core::thd_get_current_thd();
6874   for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
6875     std::shared_ptr<rocksdb::ColumnFamilyHandle> cf_handle;
6876 
6877     /*
6878       Skip collation checks on truncation since we might be recreating the
6879       table that had unsupported collations and we don't want to fail the
6880       truncation.
6881     */
6882     if (rocksdb_strict_collation_check &&
6883         thd->lex->sql_command != SQLCOM_TRUNCATE &&
6884         !is_hidden_pk(i, table_arg, tbl_def_arg) &&
6885         tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
6886       for (uint part = 0; part < table_arg->key_info[i].actual_key_parts;
6887            part++) {
6888         if (!rdb_is_index_collation_supported(
6889                 table_arg->key_info[i].key_part[part].field) &&
6890             !rdb_collation_exceptions->match(tablename_sys)) {
6891           std::string collation_err;
6892           for (const auto &coll : RDB_INDEX_COLLATIONS) {
6893             if (!collation_err.empty()) {
6894               collation_err += ", ";
6895             }
6896             collation_err += coll->name;
6897           }
6898 
6899           if (rocksdb_error_on_suboptimal_collation) {
6900             my_error(ER_UNSUPPORTED_COLLATION, MYF(0),
6901                      tbl_def_arg->full_tablename().c_str(),
6902                      table_arg->key_info[i].key_part[part].field->field_name,
6903                      collation_err.c_str());
6904             DBUG_RETURN(HA_EXIT_FAILURE);
6905           } else {
6906             push_warning_printf(
6907                 ha_thd(), Sql_condition::SL_WARNING, HA_ERR_INTERNAL_ERROR,
6908                 "Indexed column %s.%s uses a collation that does not allow "
6909                 "index-only access in secondary key and has reduced disk space "
6910                 "efficiency in primary key.",
6911                 tbl_def_arg->full_tablename().c_str(),
6912                 table_arg->key_info[i].key_part[part].field->field_name);
6913           }
6914         }
6915       }
6916     }
6917 
6918     // Internal consistency check to make sure that data in TABLE and
6919     // Rdb_tbl_def structures matches. Either both are missing or both are
6920     // specified. Yes, this is critical enough to make it into SHIP_ASSERT.
6921     SHIP_ASSERT(!table_arg->part_info == tbl_def_arg->base_partition().empty());
6922 
6923     // Generate the name for the column family to use.
6924     bool per_part_match_found = false;
6925     std::string cf_name =
6926         generate_cf_name(i, table_arg, tbl_def_arg, &per_part_match_found);
6927 
6928     // Prevent create from using the system column family.
6929     if (cf_name == DEFAULT_SYSTEM_CF_NAME) {
6930       my_error(ER_WRONG_ARGUMENTS, MYF(0),
6931                "column family not valid for storing index data.");
6932       DBUG_RETURN(HA_EXIT_FAILURE);
6933     }
6934 
6935     DBUG_EXECUTE_IF("rocksdb_create_primary_cf", {
6936       if (cf_name == "cf_primary_key") {
6937         THD *const thd = my_core::thd_get_current_thd();
6938         static constexpr char act[] =
6939             "now signal ready_to_mark_cf_dropped_in_create_cfs "
6940             "wait_for mark_cf_dropped_done_in_create_cfs";
6941         assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
6942       }
6943     });
6944 
6945     DBUG_EXECUTE_IF("rocksdb_create_secondary_cf", {
6946       if (cf_name == "cf_secondary_key") {
6947         THD *const thd = my_core::thd_get_current_thd();
6948         static constexpr char act[] =
6949             "now signal ready_to_mark_cf_dropped_in_create_cfs "
6950             "wait_for mark_cf_dropped_done_in_create_cfs";
6951         assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
6952       }
6953     });
6954 
6955     // if not specified, use default CF name
6956     if (cf_name.empty()) {
6957       if (i != primary_key_index && rocksdb_use_default_sk_cf)
6958         cf_name = DEFAULT_SK_CF_NAME;
6959       else
6960         cf_name = DEFAULT_CF_NAME;
6961     }
6962 
6963     // Look up the column family for `cf_name`, creating it if it does not
6964     // exist yet (creation is gated by rocksdb_no_create_column_family in
6965     // the call below).
6966     {
6967       std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
6968       cf_handle = cf_manager.get_or_create_cf(rdb, cf_name, !rocksdb_no_create_column_family);
6969       if (!cf_handle) {
6970         DBUG_RETURN(HA_EXIT_FAILURE);
6971       }
6972 
6973       uint32 cf_id = cf_handle->GetID();
6974 
6975       // If the cf is already marked as dropped, fail here.
6976       // The cf can still be dropped after this point; in that case we
6977       // will check again when committing the metadata changes.
6978       if (dict_manager.get_dropped_cf(cf_id)) {
6979         my_error(ER_CF_DROPPED, MYF(0), cf_name.c_str());
6980         DBUG_RETURN(HA_EXIT_FAILURE);
6981       }
6982 
6983       if (cf_manager.create_cf_flags_if_needed(&dict_manager,
6984                                                cf_handle->GetID(), cf_name,
6985                                                per_part_match_found)) {
6986         DBUG_RETURN(HA_EXIT_FAILURE);
6987       }
6988     }
6989 
6990     // The CF can be dropped from cf_manager at this point. This is part of
6991     // create table or alter table. If the drop happens before metadata are
6992     // written, create table or alter table will fail.
6993     auto &cf = (*cfs)[i];
6994 
6995     cf.cf_handle = cf_handle;
6996     cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str());
6997     cf.is_per_partition_cf = per_part_match_found;
6998   }
6999 
7000   DBUG_RETURN(HA_EXIT_SUCCESS);
7001 }
7002 
7003 /*
7004   Create key definition needed for storing data in rocksdb during ADD index
7005   inplace operations.
7006 
7007   @param in
7008     table_arg         Table with definition
7009     tbl_def_arg       New table def structure being populated
7010     old_tbl_def_arg   Old(current) table def structure
7011     cfs               Struct array which contains column family information
7012 
7013   @return
7014     0      - Ok
7015     other  - error, either given table ddl is not supported by rocksdb or OOM.
7016 */
7017 int ha_rocksdb::create_inplace_key_defs(
7018     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
7019     const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
7020     const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs,
7021     uint64 ttl_duration, const std::string &ttl_column) const {
7022   DBUG_ENTER_FUNC();
7023 
7024   assert(table_arg != nullptr);
7025   assert(tbl_def_arg != nullptr);
7026   assert(old_tbl_def_arg != nullptr);
7027 
7028   std::shared_ptr<Rdb_key_def> *const old_key_descr =
7029       old_tbl_def_arg->m_key_descr_arr;
7030   std::shared_ptr<Rdb_key_def> *const new_key_descr =
7031       tbl_def_arg->m_key_descr_arr;
7032   const std::unordered_map<std::string, uint> old_key_pos =
7033       get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
7034                             old_tbl_def_arg);
7035 
7036   uint i;
7037   for (i = 0; i < tbl_def_arg->m_key_count; i++) {
7038     const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));
7039 
7040     if (it != old_key_pos.end()) {
7041       /*
7042         Found matching index in old table definition, so copy it over to the
7043         new one created.
7044       */
7045       const Rdb_key_def &okd = *old_key_descr[it->second];
7046 
7047       const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
7048       struct Rdb_index_info index_info;
7049       if (!dict_manager.get_index_info(gl_index_id, &index_info)) {
7050         // NO_LINT_DEBUG
7051         sql_print_error(
7052             "RocksDB: Could not get index information "
7053             "for Index Number (%u,%u), table %s",
7054             gl_index_id.cf_id, gl_index_id.index_id,
7055             old_tbl_def_arg->full_tablename().c_str());
7056         DBUG_RETURN(HA_EXIT_FAILURE);
7057       }
7058 
7059       uint32 ttl_rec_offset =
7060           Rdb_key_def::has_index_flag(index_info.m_index_flags,
7061                                       Rdb_key_def::TTL_FLAG)
7062               ? Rdb_key_def::calculate_index_flag_offset(
7063                     index_info.m_index_flags, Rdb_key_def::TTL_FLAG)
7064               : UINT_MAX;
7065 
7066       /*
7067         We can't use the copy constructor because we need to update the
7068         keynr within the pack_info for each field and the keyno of the keydef
7069         itself.
7070       */
7071       new_key_descr[i] = std::make_shared<Rdb_key_def>(
7072           okd.get_index_number(), i, okd.get_shared_cf(),
7073           index_info.m_index_dict_version, index_info.m_index_type,
7074           index_info.m_kv_version, okd.m_is_reverse_cf,
7075           okd.m_is_per_partition_cf, okd.m_name.c_str(),
7076           dict_manager.get_stats(gl_index_id), index_info.m_index_flags,
7077           ttl_rec_offset, index_info.m_ttl_duration);
7078     } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
7079                               cfs[i], ttl_duration, ttl_column)) {
7080       DBUG_RETURN(HA_EXIT_FAILURE);
7081     }
7082 
7083     assert(new_key_descr[i] != nullptr);
7084     new_key_descr[i]->setup(table_arg, tbl_def_arg);
7085   }
7086 
7087   tbl_def_arg->m_tbl_stats.set(new_key_descr[0]->m_stats.m_rows, 0, 0);
7088 
7089   DBUG_RETURN(HA_EXIT_SUCCESS);
7090 }
7091 
7092 std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
7093     const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
7094     const TABLE *const old_table_arg,
7095     const Rdb_tbl_def *const old_tbl_def_arg) const {
7096   DBUG_ENTER_FUNC();
7097 
7098   assert(table_arg != nullptr);
7099   assert(old_table_arg != nullptr);
7100   assert(tbl_def_arg != nullptr);
7101   assert(old_tbl_def_arg != nullptr);
7102 
7103   std::shared_ptr<Rdb_key_def> *const old_key_descr =
7104       old_tbl_def_arg->m_key_descr_arr;
7105   std::unordered_map<std::string, uint> old_key_pos;
7106   std::unordered_map<std::string, uint> new_key_pos;
7107   uint i;
7108 
7109   for (i = 0; i < tbl_def_arg->m_key_count; i++) {
7110     new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
7111   }
7112 
7113   for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
7114     if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
7115       old_key_pos[old_key_descr[i]->m_name] = i;
7116       continue;
7117     }
7118 
7119     /*
7120       In case of matching key name, need to check key parts of keys as well,
7121       in case a simultaneous drop + add is performed, where the key name is the
7122       same but the key parts are different.
7123 
7124       Example:
7125       CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
7126       ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
7127     */
7128     const KEY *const old_key = &old_table_arg->key_info[i];
7129     const auto &it = new_key_pos.find(old_key->name);
7130     if (it == new_key_pos.end()) {
7131       continue;
7132     }
7133 
7134     KEY *const new_key = &table_arg->key_info[it->second];
7135 
7136     /*
7137       Check that the key is identical between old and new tables.
7138       If not, we still need to create a new index.
7139 
7140       The exception is if there is an index changed from unique to non-unique,
7141       in these cases we don't need to rebuild as they are stored the same way in
7142       RocksDB.
7143     */
7144     bool unique_to_non_unique =
7145         ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
7146         (old_key->flags & HA_NOSAME);
7147 
7148     if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
7149       continue;
7150     }
7151 
7152     /* Check to make sure key parts match. */
7153     if (compare_key_parts(old_key, new_key)) {
7154       continue;
7155     }
7156 
7157     old_key_pos[old_key->name] = i;
7158   }
7159 
7160   DBUG_RETURN(old_key_pos);
7161 }
7162 
7163 /* Check to see if two keys are identical. */
7164 int ha_rocksdb::compare_keys(const KEY *const old_key,
7165                              const KEY *const new_key) const {
7166   DBUG_ENTER_FUNC();
7167 
7168   assert(old_key != nullptr);
7169   assert(new_key != nullptr);
7170 
7171   /* Check index name. */
7172   if (strcmp(old_key->name, new_key->name) != 0) {
7173     DBUG_RETURN(HA_EXIT_FAILURE);
7174   }
7175 
7176   /* If index algorithms are different then keys are different. */
7177   if (old_key->algorithm != new_key->algorithm) {
7178     DBUG_RETURN(HA_EXIT_FAILURE);
7179   }
7180 
7181   /* Check that the key is identical between old and new tables.  */
7182   if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) {
7183     DBUG_RETURN(HA_EXIT_FAILURE);
7184   }
7185 
7186   /* Check index comment. (for column family changes) */
7187   std::string old_comment(old_key->comment.str, old_key->comment.length);
7188   std::string new_comment(new_key->comment.str, new_key->comment.length);
7189   if (old_comment.compare(new_comment) != 0) {
7190     DBUG_RETURN(HA_EXIT_FAILURE);
7191   }
7192 
7193   DBUG_RETURN(HA_EXIT_SUCCESS);
7194 }
7195 
7196 /* Check two keys to ensure that key parts within keys match */
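/*
  Example: dropping KEY ka(a) and adding KEY ka(b) in one inplace ALTER keeps
  the key name but changes the key part, so this check fails and the index is
  rebuilt instead of being copied over (see get_old_key_positions()).
*/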
7197 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
7198                                   const KEY *const new_key) const {
7199   DBUG_ENTER_FUNC();
7200 
7201   assert(old_key != nullptr);
7202   assert(new_key != nullptr);
7203 
7204   /* Skip if key parts do not match, as it is a different key */
7205   if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
7206     DBUG_RETURN(HA_EXIT_FAILURE);
7207   }
7208 
7209   /* Check to see that key parts themselves match */
7210   for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
7211     if (strcmp(old_key->key_part[i].field->field_name,
7212                new_key->key_part[i].field->field_name) != 0) {
7213       DBUG_RETURN(HA_EXIT_FAILURE);
7214     }
7215 
7216     /* Check if prefix index key part length has changed */
7217     if (old_key->key_part[i].length != new_key->key_part[i].length) {
7218       DBUG_RETURN(HA_EXIT_FAILURE);
7219     }
7220   }
7221 
7222   DBUG_RETURN(HA_EXIT_SUCCESS);
7223 }
7224 
7225 /*
7226   Create key definition needed for storing data in rocksdb.
7227   This can be called either during CREATE table or doing ADD index operations.
7228 
7229   @param in
7230     table_arg     Table with definition
7231     i             Position of index being created inside table_arg->key_info
7232     tbl_def_arg   Table def structure being populated
7233     cf_info       Struct which contains column family information
7234 
7235   @param out
7236     new_key_def  Newly created index definition.
7237 
7238   @return
7239     0      - Ok
7240     other  - error, either given table ddl is not supported by rocksdb or OOM.
7241 */
7242 int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint i,
7243                                const Rdb_tbl_def *const tbl_def_arg,
7244                                std::shared_ptr<Rdb_key_def> *const new_key_def,
7245                                const struct key_def_cf_info &cf_info,
7246                                uint64 ttl_duration,
7247                                const std::string &ttl_column) const {
7248   DBUG_ENTER_FUNC();
7249 
7250   assert(new_key_def != nullptr);
7251   assert(*new_key_def == nullptr);
7252 
7253   const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
7254   const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
7255   uchar index_type;
7256   uint16_t kv_version;
7257 
7258   if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
7259     index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
7260     kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7261   } else if (i == table_arg->s->primary_key) {
7262     index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
7263     uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7264     kv_version = pk_latest_version;
7265   } else {
7266     index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
7267     uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
7268     kv_version = sk_latest_version;
7269   }
7270 
7271   // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as
7272   // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a
7273   // primary key or secondary key.
7274   DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT", {
7275     kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1;
7276   });
7277 
7278   DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", {
7279     if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) {
7280       kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2;
7281     }
7282   });
7283 
7284   uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0);
7285 
7286   uint32 ttl_rec_offset =
7287       Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG)
7288           ? Rdb_key_def::calculate_index_flag_offset(index_flags,
7289                                                      Rdb_key_def::TTL_FLAG)
7290           : UINT_MAX;
7291 
7292   const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
7293   *new_key_def = std::make_shared<Rdb_key_def>(
7294       index_id, i, cf_info.cf_handle, index_dict_version, index_type,
7295       kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name,
7296       Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration);
7297 
7298   if (!ttl_column.empty()) {
7299     (*new_key_def)->m_ttl_column = ttl_column;
7300   }
7301   // initialize key_def
7302   (*new_key_def)->setup(table_arg, tbl_def_arg);
7303   DBUG_RETURN(HA_EXIT_SUCCESS);
7304 }
7305 
7306 bool rdb_is_tablename_normalized(const std::string &tablename) {
7307   return tablename.size() < 2 || (tablename[0] != '.' && tablename[1] != '/');
7308 }
7309 
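/*
  Convert a filesystem-style table path into the "dbname.tablename" form used
  by the data dictionary. For example, "./test/t1" becomes "test.t1".
*/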
7310 int rdb_normalize_tablename(const std::string &tablename,
7311                             std::string *const strbuf) {
7312   assert(strbuf != nullptr);
7313 
7314   if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/') {
7315     assert(0);  // We were not passed table name?
7316     return HA_ERR_ROCKSDB_INVALID_TABLE;
7317   }
7318 
7319   size_t pos = tablename.find_first_of('/', 2);
7320   if (pos == std::string::npos) {
7321     assert(0);  // We were not passed table name?
7322     return HA_ERR_ROCKSDB_INVALID_TABLE;
7323   }
7324 
7325   *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
7326 
7327   return HA_EXIT_SUCCESS;
7328 }
7329 
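/*
  Inverse of rdb_normalize_tablename(): turn "dbname.tablename" back into the
  filesystem-style path. For example, "test.t1" becomes "./test/t1".
*/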
7330 int rdb_make_canonical_tablename(const std::string &tablename,
7331     std::string *const strbuf) {
7332   assert(strbuf != nullptr);
7333 
7334   size_t pos = tablename.find_first_of('.');
7335   if (pos == std::string::npos) {
7336     assert(0);
7337     return HA_ERR_ROCKSDB_INVALID_TABLE;
7338   }
7339 
7340   *strbuf = "./" + tablename.substr(0, pos) + "/" + tablename.substr(pos + 1);
7341 
7342   return HA_EXIT_SUCCESS;
7343 }
7344 
7345 /*
7346   Check to see if the user's original statement includes foreign key
7347   references
7348 */
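/*
  Example: for "CREATE TABLE t (a INT, FOREIGN KEY fk1 (a) REFERENCES p(id))"
  the scan below finds "FOREIGN", then "KEY", skips the optional constraint
  name "fk1", and then finds "(", so the function returns true.
*/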
7349 bool ha_rocksdb::contains_foreign_key(THD *const thd) {
7350   bool success;
7351   const char *str = thd->query().str;
7352 
7353   assert(str != nullptr);
7354 
7355   while (*str != '\0') {
7356     // Scan from our current pos looking for 'FOREIGN'
7357     str = rdb_find_in_string(str, "FOREIGN", &success);
7358     if (!success) {
7359       return false;
7360     }
7361 
7362     // Skip past the found "FOREIGN"
7363     str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
7364     assert(success);
7365 
7366     if (!my_isspace(&my_charset_bin, *str)) {
7367       return false;
7368     }
7369 
7370     // See if the next token is 'KEY'
7371     str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
7372     if (!success) {
7373       continue;
7374     }
7375 
7376     // See if the next token is '('
7377     str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7378     if (!success) {
7379       // There is an optional index id after 'FOREIGN KEY', skip it
7380       str = rdb_skip_id(&my_charset_bin, str);
7381 
7382       // Now check for '(' again
7383       str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7384     }
7385 
7386     // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
7387     // a foreign key clause.
7388     return success;
7389   }
7390 
7391   // We never found a valid foreign key clause
7392   return false;
7393 }
7394 
7395 /**
7396   @brief
7397   splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
7398   the <dbname>, <tablename> and <part_no> components.
7399 
7400   @param db returns database name/table_schema
7401   @param table returns table name
7402   @param partition returns partition suffix if there is one
7403   @return HA_EXIT_SUCCESS on success, non-zero on failure to split
7404 */
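/*
  Example: "test.t1#P#p0" yields db="test", table="t1", partition="p0", while
  "test.t1" yields db="test", table="t1" and leaves *partition unchanged.
*/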
7405 int rdb_split_normalized_tablename(const std::string &fullname,
7406                                    std::string *const db,
7407                                    std::string *const table,
7408                                    std::string *const partition) {
7409   assert(!fullname.empty());
7410 
7411 #define RDB_PARTITION_STR "#P#"
7412 
7413   /* Normalize returns dbname.tablename. */
7414   size_t dotpos = fullname.find('.');
7415 
7416   /* Invalid table name? */
7417   if (dotpos == std::string::npos) {
7418     return HA_ERR_ROCKSDB_INVALID_TABLE;
7419   }
7420 
7421   // Table must have a database name associated with it.
7422   assert(dotpos > 0);
7423 
7424   if (db != nullptr) {
7425     *db = fullname.substr(0, dotpos);
7426   }
7427 
7428   dotpos++;
7429 
7430   const size_t partpos =
7431       fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
7432 
7433   if (partpos != std::string::npos) {
7434     assert(partpos >= dotpos);
7435 
7436     if (table != nullptr) {
7437       *table = fullname.substr(dotpos, partpos - dotpos);
7438     }
7439 
7440     if (partition != nullptr) {
7441       *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
7442     }
7443   } else if (table != nullptr) {
7444     *table = fullname.substr(dotpos);
7445   }
7446 
7447   return HA_EXIT_SUCCESS;
7448 }
7449 
7450 /*
7451   Generates the normalized tablename using as many of the given arguments as
7452   possible. Any of the three components of <db>.<table>#P#<partition> can be
7453   null/empty, but the returned string will only ever be of the form
7454   <db>
7455   <db>.<table>
7456   <db>.<table>#P#<partition>
7457   <table>
7458   <table>#P#<partition>
7459 */
7460 void rdb_gen_normalized_tablename(const std::string *db,
7461                                   const std::string *table,
7462                                   const std::string *partition,
7463                                   std::string *fullname) {
7464   if (!fullname) return;
7465   fullname->clear();
7466   if (db && db->length() > 0) *fullname += *db;
7467 
7468   /* If table was not passed in, the partition will be ignored too */
7469   if (!table || table->length() == 0) return;
7470 
7471   if (fullname->length() > 0) *fullname += ".";
7472   *fullname += *table;
7473 
7474   if (partition && partition->length() > 0) {
7475     *fullname += std::string(RDB_PARTITION_STR) + *partition;
7476   }
7477 }
7478 
7479 /*
7480  Create a table's Rdb_tbl_def and its Rdb_key_defs and store table information
7481  into the MyRocks data dictionary.
7482  The method is called during create table/partition and truncate table/partition.
7483 
7484  @param table_name            IN      table's name formatted as
7485                                       'dbname.tablename'
7486  @param table_arg             IN      sql table
7487  @param auto_increment_value  IN      specified table's auto increment value
7488 
7489   @return
7490     HA_EXIT_SUCCESS  OK
7491     other            HA_ERR error code (can be SE-specific)
7492 */
7493 int ha_rocksdb::create_table(const std::string &table_name,
7494                              const TABLE *table_arg,
7495                              ulonglong auto_increment_value) {
7496   DBUG_ENTER_FUNC();
7497 
7498   int err;
7499 
7500   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
7501   rocksdb::WriteBatch *const batch = wb.get();
7502 
7503   /* Create table/key descriptions and put them into the data dictionary */
7504   m_tbl_def = new Rdb_tbl_def(table_name);
7505 
7506   uint n_keys = table_arg->s->keys;
7507 
7508   /*
7509     If no primary key found, create a hidden PK and place it inside table
7510     definition
7511   */
7512   if (has_hidden_pk(table_arg)) {
7513     n_keys += 1;
7514     // reset hidden pk id
7515     // the starting valid value for hidden pk is 1
7516     m_tbl_def->m_hidden_pk_val = 1;
7517   }
7518 
7519   m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
7520   m_tbl_def->m_key_count = n_keys;
7521   m_tbl_def->m_key_descr_arr = m_key_descr_arr;
7522 
7523   err = create_key_defs(table_arg, m_tbl_def);
7524   if (err != HA_EXIT_SUCCESS) {
7525     goto error;
7526   }
7527 
7528   m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
7529 
7530   if (auto_increment_value) {
7531     bool autoinc_upgrade_test = false;
7532     m_tbl_def->m_auto_incr_val = auto_increment_value;
7533     DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;);
7534     if (!autoinc_upgrade_test) {
7535       auto s = dict_manager.put_auto_incr_val(
7536           batch, m_tbl_def->get_autoincr_gl_index_id(),
7537           m_tbl_def->m_auto_incr_val);
7538       if (!s.ok()) {
7539         goto error;
7540       }
7541     }
7542   }
7543 
7544   DBUG_EXECUTE_IF("rocksdb_create_table", {
7545     THD *const thd = my_core::thd_get_current_thd();
7546     static constexpr char act[] =
7547         "now signal ready_to_mark_cf_dropped_in_create_table "
7548         "wait_for mark_cf_dropped_done_in_create_table";
7549     assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
7550   });
7551 
7552   {
7553     std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
7554     err = ddl_manager.put_and_write(m_tbl_def, batch);
7555     if (err != HA_EXIT_SUCCESS) {
7556       goto error;
7557     }
7558 
7559     err = dict_manager.commit(batch);
7560     if (err != HA_EXIT_SUCCESS) {
7561       goto error;
7562     }
7563   }
7564 
7565   DBUG_RETURN(HA_EXIT_SUCCESS);
7566 
7567 error:
7568   /* Delete what we have allocated so far */
7569   delete m_tbl_def;
7570   m_tbl_def = nullptr;
7571   m_key_descr_arr = nullptr;
7572 
7573   DBUG_RETURN(err);
7574 }
7575 
7576 /**
7577   @brief
7578   create() is called to create a table. The variable name will have the name
7579   of the table.
7580 
7581   @details
7582   When create() is called you do not need to worry about
7583   opening the table. Also, the .frm file will have already been
7584   created so adjusting create_info is not necessary. You can overwrite
7585   the .frm file at this point if you wish to change the table
7586   definition, but there are no methods currently provided for doing
7587   so.
7588 
7589   Called from handler.cc by ha_create_table().
7590 
7591   @return
7592     HA_EXIT_SUCCESS  OK
7593     other            HA_ERR error code (can be SE-specific)
7594 
7595   @see
7596   ha_create_table() in handler.cc
7597 */
7598 
7599 int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
7600                        HA_CREATE_INFO *const create_info) {
7601   DBUG_ENTER_FUNC();
7602 
7603   assert(table_arg != nullptr);
7604   assert(create_info != nullptr);
7605 
7606   if (unlikely(create_info->data_file_name)) {
7607     // DATA DIRECTORY is used to create tables under a specific location
7608     // outside the MySQL data directory. We don't support this for MyRocks.
7609     // The `rocksdb_datadir` setting should be used to configure RocksDB data
7610     // directory.
7611     DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED);
7612   }
7613 
7614   if (unlikely(create_info->index_file_name)) {
7615     // Similar check for INDEX DIRECTORY as well.
7616     DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED);
7617   }
7618 
7619   if (unlikely(create_info->encrypt_type.length)) {
7620     my_error(ER_NOT_SUPPORTED_YET, MYF(0),
7621              "ENCRYPTION for the RocksDB storage engine");
7622     DBUG_RETURN(HA_WRONG_CREATE_OPTION);
7623   }
7624 
7625   if (unlikely(create_info->tablespace)) {
7626     my_error(ER_NOT_SUPPORTED_YET, MYF(0),
7627              "TABLESPACEs for the RocksDB storage engine");
7628     DBUG_RETURN(HA_WRONG_CREATE_OPTION);
7629   }
7630 
7631   if (unlikely(create_info->compress.length)) {
7632     my_error(ER_NOT_SUPPORTED_YET, MYF(0),
7633              "InnoDB page COMPRESSION for the RocksDB storage engine");
7634     DBUG_RETURN(HA_WRONG_CREATE_OPTION);
7635   }
7636 
7637   int err;
7638   /*
7639     Construct dbname.tablename ourselves, because partitioning
7640     passes strings like "./test/t14#P#p0" for individual partitions,
7641     while table_arg->s->table_name has none of that.
7642   */
7643   std::string str;
7644   err = rdb_normalize_tablename(name, &str);
7645   if (err != HA_EXIT_SUCCESS) {
7646     DBUG_RETURN(err);
7647   }
7648 
7649   // FOREIGN KEY isn't supported yet
7650   THD *const thd = my_core::thd_get_current_thd();
7651   if (contains_foreign_key(thd)) {
7652     my_error(ER_NOT_SUPPORTED_YET, MYF(0),
7653              "FOREIGN KEY for the RocksDB storage engine");
7654     DBUG_RETURN(HA_ERR_UNSUPPORTED);
7655   }
7656 
7657   // Check whether the data dictionary already contains this table
7658   Rdb_tbl_def *old_tbl = ddl_manager.find(str);
7659   if (old_tbl != nullptr) {
7660     if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
7661       DBUG_RETURN(truncate_table(old_tbl, table_arg,
7662                                  create_info->auto_increment_value));
7663     } else {
7664       my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name);
7665       DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
7666     }
7667   }
7668 
7669   DBUG_RETURN(create_table(str, table_arg, create_info->auto_increment_value));
7670 }
7671 
7672 /*
7673   Fast-truncates a table by renaming the old table, creating a new one,
7674   and then deleting or restoring the old one depending on creation success.
7675 
7676   @param tbl_def               IN      MyRocks's table structure
7677   @param table_arg             IN      sql table
7678   @param auto_increment_value  IN      specified table's auto increment value
7679 
7680   @return
7681     HA_EXIT_SUCCESS  OK
7682     other            HA_ERR error code (can be SE-specific)
7683 */
7684 int ha_rocksdb::truncate_table(Rdb_tbl_def *tbl_def_arg, TABLE *table_arg,
7685                                ulonglong auto_increment_value) {
7686   DBUG_ENTER_FUNC();
7687 
7688   /*
7689     Fast table truncation involves deleting the table and then recreating
7690     it. However, it is possible recreating the table fails. In this case, a
7691     table inconsistency might result between SQL and MyRocks where MyRocks is
7692     missing a table. Since table creation involves modifying keys with the
7693     original table name, renaming the original table first, and then renaming
7694     it back in case of creation failure can help restore the pre-truncation
7695     state.
7696 
7697     If the server were to crash during truncation, the system will end up with
7698     an inconsistency. Future changes for atomic ddl will resolve this. For now,
7699     if there are any truncation renamed tables found during startup, MyRocks
7700     will automatically remove them.
7701   */
7702   std::string orig_tablename = tbl_def_arg->full_tablename();
7703   std::string dbname, tblname, partition;
7704 
7705   /*
7706     Rename the table in the data dictionary. Since this thread should be
7707     holding the MDL for this tablename, it is safe to perform the rename;
7708     no other thread can access this table through the SQL layer while the
7709     MDL is held.
7710   */
7711   int err = rdb_split_normalized_tablename(orig_tablename, &dbname, &tblname,
7712                                            &partition);
7713   assert(err == 0);
7714   if (err != HA_EXIT_SUCCESS) DBUG_RETURN(err);
7715   tblname = std::string(TRUNCATE_TABLE_PREFIX) + tblname;
7716 
7717   std::string tmp_tablename;
7718   rdb_gen_normalized_tablename(&dbname, &tblname, &partition, &tmp_tablename);
7719 
7720   err = rename_table(orig_tablename.c_str(), tmp_tablename.c_str());
7721   if (err != HA_EXIT_SUCCESS) DBUG_RETURN(err);
7722 
7723   /*
7724     Attempt to create the table. If this succeeds, then drop the old table.
7725     Otherwise, try to restore it.
7726   */
7727   err = create_table(orig_tablename, table_arg, auto_increment_value);
7728   bool should_remove_old_table = true;
7729 
7730   /* Restore the old table being truncated if creating the new table failed */
7731   if (err != HA_EXIT_SUCCESS) {
7732     int rename_err =
7733         rename_table(tmp_tablename.c_str(), orig_tablename.c_str());
7734 
7735     /*
7736       If the rename also fails, we are out of options, but at least try to drop
7737       the old table contents.
7738     */
7739     if (rename_err == HA_EXIT_SUCCESS) {
7740       should_remove_old_table = false;
7741     } else {
7742       // NO_LINT_DEBUG
7743       sql_print_error(
7744           "MyRocks: Failure during truncation of table %s "
7745           "being renamed from %s",
7746           orig_tablename.c_str(), tmp_tablename.c_str());
7747       err = rename_err;
7748     }
7749   }
7750 
7751   /*
7752     At this point the table was either successfully truncated or the name
7753     restore failed, so no error should be returned from trying to delete the
7754     old table. If delete_table() fails, log it instead.
7755   */
7756   Rdb_tbl_def *old_tbl_def = ddl_manager.find(tmp_tablename);
7757   if (should_remove_old_table && old_tbl_def) {
7758     m_tbl_def = old_tbl_def;
7759     if (delete_table(old_tbl_def) != HA_EXIT_SUCCESS) {
7760       // NO_LINT_DEBUG
7761       sql_print_error(
7762           "Failure when trying to drop table %s during "
7763           "truncation of table %s",
7764           tmp_tablename.c_str(), orig_tablename.c_str());
7765     }
7766   }
7767 
7768   /* Update the local m_tbl_def reference */
7769   m_tbl_def = ddl_manager.find(orig_tablename);
7770   m_converter.reset(new Rdb_converter(ha_thd(), m_tbl_def, table_arg));
7771   DBUG_RETURN(err);
7772 }
7773 
7774 /**
7775   @note
7776   This function is used only when the table has not yet been opened, and
7777   keyread_allowed bitmap doesn't have the correct values yet.
7778 
7779   See comment in ha_rocksdb::index_flags() for details.
7780 */
7781 
7782 bool ha_rocksdb::check_keyread_allowed(bool &pk_can_be_decoded,
7783                                        const TABLE_SHARE *table_share, uint inx,
7784                                        uint part, bool all_parts) {
7785   bool res = true;
7786   KEY *const key_info = &table_share->key_info[inx];
7787 
7788   Rdb_field_packing dummy1;
7789   res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
7790                      key_info->key_part[part].length);
7791 
7792   if (res && all_parts) {
7793     for (uint i = 0; i < part; i++) {
7794       Field *field;
7795       if ((field = key_info->key_part[i].field)) {
7796         Rdb_field_packing dummy;
7797         if (!dummy.setup(nullptr, field, inx, i,
7798                          key_info->key_part[i].length)) {
7799           /* Cannot do index-only reads for this column */
7800           res = false;
7801           break;
7802         }
7803       }
7804     }
7805   }
7806 
7807   const uint pk = table_share->primary_key;
7808   if (inx == pk && all_parts &&
7809       part + 1 == table_share->key_info[pk].user_defined_key_parts) {
7810     pk_can_be_decoded = res;
7811   }
7812 
7813   return res;
7814 }
7815 
7816 int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
7817                                rocksdb::Iterator *const iter,
7818                                const bool /* unused */,
7819                                const rocksdb::Slice &key_slice,
7820                                const int64_t ttl_filter_ts) {
7821   assert(iter != nullptr);
7822 
7823   THD *thd = ha_thd();
7824   /*
7825     We are looking for the first record such that
7826       index_tuple= lookup_tuple.
7827     lookup_tuple may be a prefix of the index.
7828   */
7829   rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);
7830 
7831   while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
7832     if (thd && thd->killed) {
7833       return HA_ERR_QUERY_INTERRUPTED;
7834     }
7835     /*
7836       If TTL is enabled we need to check if the given key has already expired
7837       from the POV of the current transaction.  If it has, try going to the next
7838       key.
7839     */
7840     if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
7841       rocksdb_smart_next(kd.m_is_reverse_cf, iter);
7842       continue;
7843     }
7844 
7845     return HA_EXIT_SUCCESS;
7846   }
7847 
7848   /*
7849     Got a record that is not equal to the lookup value, or even a record
7850     from another table or index.
7851   */
7852   return HA_ERR_KEY_NOT_FOUND;
7853 }
7854 
7855 int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
7856                                 const bool full_key_match,
7857                                 const rocksdb::Slice &key_slice,
7858                                 const int64_t ttl_filter_ts) {
7859   THD *thd = ha_thd();
7860   /*
7861     We are looking for record with the biggest t.key such that
7862     t.key < lookup_tuple.
7863   */
7864   rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);
7865 
7866   while (is_valid(m_scan_it)) {
7867     if (thd && thd->killed) {
7868       return HA_ERR_QUERY_INTERRUPTED;
7869     }
7870     /*
7871       We are using a full key and we've hit an exact match, or...
7872 
7873       If TTL is enabled we need to check if the given key has already expired
7874       from the POV of the current transaction.  If it has, try going to the next
7875       key.
7876     */
7877     if ((full_key_match &&
7878          kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
7879         (kd.has_ttl() &&
7880          should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
7881       rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
7882       continue;
7883     }
7884 
7885     return HA_EXIT_SUCCESS;
7886   }
7887 
7888   return HA_ERR_KEY_NOT_FOUND;
7889 }
7890 
7891 int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
7892                                const rocksdb::Slice &key_slice,
7893                                const int64_t ttl_filter_ts) {
7894   THD *thd = ha_thd();
7895   /*
7896     We are looking for the first record such that
7897 
7898       index_tuple $GT lookup_tuple
7899 
7900     with HA_READ_AFTER_KEY, $GT = '>',
7901     with HA_READ_KEY_OR_NEXT, $GT = '>='
7902   */
7903   rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
7904 
7905   /*
7906     If TTL is enabled we need to check if the given key has already expired
7907     from the POV of the current transaction.  If it has, try going to the next
7908     key.
7909   */
7910   while (is_valid(m_scan_it) && kd.has_ttl() &&
7911          should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
7912     if (thd && thd->killed) {
7913       return HA_ERR_QUERY_INTERRUPTED;
7914     }
7915     rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
7916   }
7917 
7918   return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
7919 }
7920 
7921 int ha_rocksdb::position_to_correct_key(
7922     const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
7923     const bool full_key_match, const uchar *const key,
7924     const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
7925     bool *const move_forward, const int64_t ttl_filter_ts) {
7926   int rc = 0;
7927 
7928   *move_forward = true;
7929 
7930   switch (find_flag) {
7931     case HA_READ_KEY_EXACT:
7932       rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice,
7933                           ttl_filter_ts);
7934       break;
7935     case HA_READ_BEFORE_KEY:
7936       *move_forward = false;
7937       rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
7938       if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
7939         /* The record we've got is not from this index */
7940         rc = HA_ERR_KEY_NOT_FOUND;
7941       }
7942       break;
7943     case HA_READ_AFTER_KEY:
7944     case HA_READ_KEY_OR_NEXT:
7945       rc = read_after_key(kd, key_slice, ttl_filter_ts);
7946       if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
7947         /* The record we've got is not from this index */
7948         rc = HA_ERR_KEY_NOT_FOUND;
7949       }
7950       break;
7951     case HA_READ_KEY_OR_PREV:
7952     case HA_READ_PREFIX:
7953       /* This flag is not used by the SQL layer, so we don't support it yet. */
7954       rc = HA_ERR_UNSUPPORTED;
7955       break;
7956     case HA_READ_PREFIX_LAST:
7957     case HA_READ_PREFIX_LAST_OR_PREV:
7958       *move_forward = false;
7959       /*
7960         Find the last record with the specified index prefix lookup.
7961         - HA_READ_PREFIX_LAST requires that the record has the
7962           prefix=lookup (if there are no such records,
7963           HA_ERR_KEY_NOT_FOUND should be returned).
7964         - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
7965           records with prefix=lookup, we should return the last record
7966           before that.
7967       */
7968       rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
7969       if (rc == 0) {
7970         const rocksdb::Slice &rkey = m_scan_it->key();
7971         if (!kd.covers_key(rkey)) {
7972           /* The record we've got is not from this index */
7973           rc = HA_ERR_KEY_NOT_FOUND;
7974         } else if (find_flag == HA_READ_PREFIX_LAST) {
7975           uint size = kd.pack_index_tuple(table, m_pack_buffer,
7976                                           m_sk_packed_tuple, key, keypart_map);
7977           rocksdb::Slice lookup_tuple(
7978               reinterpret_cast<char *>(m_sk_packed_tuple), size);
7979 
7980           // We need to compare the key we've got with the original search
7981           // prefix.
7982           if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
7983             rc = HA_ERR_KEY_NOT_FOUND;
7984           }
7985         }
7986       }
7987       break;
7988     default:
7989       assert(0);
7990       break;
7991   }
7992 
7993   return rc;
7994 }
7995 
7996 int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
7997                                  const enum ha_rkey_function &find_flag,
7998                                  const rocksdb::Slice &slice,
7999                                  const int bytes_changed_by_succ,
8000                                  const key_range *const end_key,
8001                                  uint *const end_key_packed_size) {
8002   if (find_flag == HA_READ_KEY_EXACT) return slice.size();
8003 
8004   if (find_flag == HA_READ_PREFIX_LAST) {
8005     /*
8006       We have made the kd.successor(m_sk_packed_tuple) call above.
8007 
8008       The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
8009     */
8010     return slice.size() - bytes_changed_by_succ;
8011   }
8012 
8013   if (end_key) {
8014     *end_key_packed_size =
8015         kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
8016                             end_key->key, end_key->keypart_map);
8017 
8018     /*
8019       Calculating length of the equal conditions here. 4 byte index id is
8020       included.
8021       Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8022        WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
8023        WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
8024       Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
8025        WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
8026     */
8027     rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
8028                              *end_key_packed_size);
8029     return slice.difference_offset(end_slice);
8030   }
8031 
8032   /*
8033     On range scan without any end key condition, there is no
8034     eq cond, and eq cond length is the same as index_id size (4 bytes).
8035     Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8036      WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
8037   */
8038   return Rdb_key_def::INDEX_NUMBER_SIZE;
8039 }
8040 
8041 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
8042   assert(buf != nullptr);
8043 
8044   int rc;
8045   const rocksdb::Slice &rkey = m_scan_it->key();
8046   const uint pk_size = rkey.size();
8047   const char *pk_data = rkey.data();
8048 
8049   memcpy(m_pk_packed_tuple, pk_data, pk_size);
8050   m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
8051 
8052   if (m_lock_rows != RDB_LOCK_NONE) {
8053     /* We need to put a lock and re-read */
8054     rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
8055   } else {
8056     /* Unpack from the row we've read */
8057     const rocksdb::Slice &value = m_scan_it->value();
8058     rc = convert_record_from_storage_format(&rkey, &value, buf);
8059   }
8060 
8061   return rc;
8062 }
8063 
8064 int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
8065                                             const Rdb_key_def &kd,
8066                                             bool move_forward) {
8067   assert(buf != nullptr);
8068 
8069   int rc = 0;
8070   uint pk_size;
8071 
8072   /* Get the key columns and primary key value */
8073   const rocksdb::Slice &rkey = m_scan_it->key();
8074   const rocksdb::Slice &value = m_scan_it->value();
8075 
8076 #if !defined(NDEBUG)
8077   bool save_keyread_only = m_keyread_only;
8078 #endif  // !defined(NDEBUG)
8079   DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });
8080 
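  /*
    A lookup is "covered" when every column the query needs can be decoded
    from the secondary key (and its value) alone, so the extra primary-key
    lookup in the else-branch below can be skipped.
  */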
8081   bool covered_lookup =
8082       (m_keyread_only && kd.can_cover_lookup()) ||
8083       kd.covers_lookup(&value, m_converter->get_lookup_bitmap());
8084 
8085 #if !defined(NDEBUG)
8086   m_keyread_only = save_keyread_only;
8087 #endif  // !defined(NDEBUG)
8088 
8089   if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
8090     pk_size =
8091         kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
8092     if (pk_size == RDB_INVALID_KEY_LEN) {
8093       rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
8094     } else {
8095       rc = kd.unpack_record(table, buf, &rkey, &value,
8096                             m_converter->get_verify_row_debug_checksums());
8097     }
8098   } else {
8099     if (kd.m_is_reverse_cf) move_forward = !move_forward;
8100 
8101     rc = find_icp_matching_index_rec(move_forward, buf);
8102     if (!rc) {
8103       const rocksdb::Slice &rkey = m_scan_it->key();
8104       pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
8105                                          m_pk_packed_tuple);
8106       if (pk_size == RDB_INVALID_KEY_LEN) {
8107         rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
8108       } else {
8109         rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
8110       }
8111     }
8112   }
8113 
8114   if (!rc) {
8115     m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
8116                        &my_charset_bin);
8117   }
8118 
8119   return rc;
8120 }
8121 
8122 /**
8123   @note
    The problem with this function is that the SQL layer calls it when
     - the table has not yet been opened (no ::open() call done);
     - this->table_share already exists, but it is in the process of being
       filled, so some of the fields are still NULL;
     - in particular, table_share->key_info[inx].key_part[] is filled only up
       to part #part. Subsequent key parts are not yet filled.

    To complicate things further, the SQL layer will call index_flags() with
    all_parts=true. Essentially, we're asked to provide flags for reading
    keyparts whose datatype is not yet known.

    We work around this problem by using check_keyread_allowed(), which uses
    the table_share object and is careful not to step on uninitialized data.

    When we get a call with all_parts=true, we try to analyze all parts but
    ignore those that have key_part->field==nullptr (these are not initialized
    yet).
8141 */
8142 
8143 ulong ha_rocksdb::index_flags(bool &pk_can_be_decoded,
8144                               const TABLE_SHARE *table_share, uint inx,
8145                               uint part, bool all_parts) {
8146   DBUG_ENTER_FUNC();
8147 
8148   ulong base_flags = HA_READ_NEXT |  // doesn't seem to be used
8149                      HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
8150 
8151   bool res = check_keyread_allowed(pk_can_be_decoded, table_share, inx, part,
8152                                    all_parts);
8153   if (res)
8154     base_flags |= HA_KEYREAD_ONLY;
8155 
8156   if (inx == table_share->primary_key) {
8157     /*
8158       Index-only reads on primary key are the same as table scan for us. Still,
8159       we need to explicitly "allow" them, otherwise SQL layer will miss some
8160       plans.
8161     */
8162     base_flags |= HA_KEYREAD_ONLY;
8163   } else if (res) {
8164     /* We can do ICP only if we are able to decode the key (res == true) */
8165     /*
8166       We can Index Condition Pushdown any key except the primary. With primary
8167       key, we get (pk, record) pair immediately, there is no place to put the
8168       ICP check.
8169     */
8170     base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
8171   }
8172 
8173   DBUG_RETURN(base_flags);
8174 }
8175 
8176 ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
8177   return index_flags(m_pk_can_be_decoded, table_share, inx, part, all_parts);
8178 }
8179 
8180 ha_rocksdb::Table_flags ha_rocksdb::table_flags() const {
8181   return table_flags(m_pk_can_be_decoded);
8182 }
8183 
8184 /**
8185   @brief
8186   Read next index tuple through the secondary index.
8187 
8188   @details
8189   m_scan_it points at the index key-value pair that we should read the (pk,row)
8190   pair for.
8191 */
8192 int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
8193   assert(buf != nullptr);
8194   assert(table != nullptr);
8195 
8196   /* Use STATUS_NOT_FOUND when record not found or some error occurred */
8197   table->status = STATUS_NOT_FOUND;
8198 
8199   if (is_valid(m_scan_it)) {
8200     rocksdb::Slice key = m_scan_it->key();
8201 
    /* Check if we've run out of records of this index */
8203     if (m_key_descr_arr[keyno]->covers_key(key)) {
8204       int rc = 0;
8205 
      // TODO: We could check here whether we have run out of the range we're
      // scanning
8207       const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
8208           table, *m_pk_descr, &key, m_pk_packed_tuple);
8209       if (size == RDB_INVALID_KEY_LEN) {
8210         return HA_ERR_ROCKSDB_CORRUPT_DATA;
8211       }
8212 
8213       m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
8214                          &my_charset_bin);
8215 
8216       rocksdb::Slice value = m_scan_it->value();
8217       bool covered_lookup =
8218           (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
8219           m_key_descr_arr[keyno]->covers_lookup(
8220               &value, m_converter->get_lookup_bitmap());
8221       if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
8222         rc = m_key_descr_arr[keyno]->unpack_record(
8223             table, buf, &key, &value,
8224             m_converter->get_verify_row_debug_checksums());
8225         inc_covered_sk_lookup();
8226       } else {
8227         DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
8228         rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
8229       }
8230 
8231       if (!rc) {
8232         table->status = 0;
8233         update_row_stats(ROWS_READ);
8234       }
8235       return rc;
8236     }
8237   }
8238   return HA_ERR_END_OF_FILE;
8239 }
8240 
8241 /*
8242   ha_rocksdb::read_range_first overrides handler::read_range_first.
8243   The only difference from handler::read_range_first is that
8244   ha_rocksdb::read_range_first passes end_key to
8245   ha_rocksdb::index_read_map_impl function.
8246 
8247   @return
8248     HA_EXIT_SUCCESS  OK
8249     other            HA_ERR error code (can be SE-specific)
8250 */
8251 int ha_rocksdb::read_range_first(const key_range *const start_key,
8252                                  const key_range *const end_key,
8253                                  bool eq_range_arg, bool sorted) {
8254   DBUG_ENTER_FUNC();
8255 
8256   check_build_decoder();
8257 
8258   int result;
8259 
8260   eq_range = eq_range_arg;
8261   set_end_range(end_key, RANGE_SCAN_ASC);
8262 
8263   range_key_part = table->key_info[active_index].key_part;
8264 
8265   if (!start_key)  // Read first record
8266     result = ha_index_first(table->record[0]);
8267   else {
8268     if (is_using_prohibited_gap_locks(
8269             table,
8270             is_using_full_unique_key(active_index, start_key->keypart_map,
8271                                      start_key->flag))) {
8272       DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
8273     }
8274 
8275     result =
8276         index_read_map_impl(table->record[0], start_key->key,
8277                             start_key->keypart_map, start_key->flag, end_key);
8278   }
8279   if (result) {
8280     DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
8281   }
8282 
8283   if (compare_key(end_range) <= 0) {
8284     DBUG_RETURN(HA_EXIT_SUCCESS);
8285   } else {
8286     /*
8287       The last read row does not fall in the range. So request
8288       storage engine to release row lock if possible.
8289     */
8290     unlock_row();
8291     DBUG_RETURN(HA_ERR_END_OF_FILE);
8292   }
8293 }
8294 
8295 /**
8296    @return
8297     HA_EXIT_SUCCESS  OK
8298     other            HA_ERR error code (can be SE-specific)
8299 */
8300 int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
8301                                key_part_map keypart_map,
8302                                enum ha_rkey_function find_flag) {
8303   DBUG_ENTER_FUNC();
8304 
8305   check_build_decoder();
8306 
8307   DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
8308 }
8309 
8310 /*
8311    See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
8312    index navigation commands are converted into RocksDB lookup commands.
8313 
8314    This function takes end_key as an argument, and it is set on range scan.
   MyRocks needs to decide whether the prefix bloom filter can be used or not.
   To make that decision, the equal-condition length must be calculated.
   On equality lookups (find_flag == HA_READ_KEY_EXACT), the equal-condition
   length is simply rocksdb::Slice.size() of the start key. On range scans,
   it is the length of the common prefix of the start_key and end_key slices
   (see calc_eq_cond_len()).
8322 
8323    @return
8324     HA_EXIT_SUCCESS  OK
8325     other            HA_ERR error code (can be SE-specific)
8326 */
8327 int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
8328                                     key_part_map keypart_map,
8329                                     enum ha_rkey_function find_flag,
8330                                     const key_range *end_key) {
8331   DBUG_ENTER_FUNC();
8332 
8333   int rc = 0;
8334 
8335   THD *thd = ha_thd();
8336   DEBUG_SYNC(thd, "rocksdb.check_flags_rmi");
8337   if (thd && thd->killed) {
8338     rc = HA_ERR_QUERY_INTERRUPTED;
8339     DBUG_RETURN(rc);
8340   }
8341 
8342   ha_statistic_increment(&SSV::ha_read_key_count);
8343   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
8344   const uint actual_key_parts = kd.get_key_parts();
8345   bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);
8346 
8347   if (!end_key) end_key = end_range;
8348 
8349   /* By default, we don't need the retrieved records to match the prefix */
8350   m_sk_match_prefix = nullptr;
8351 
8352   if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
8353       using_full_key) {
8354     /*
8355       Equality lookup over primary key, using full tuple.
8356       This is a special case, use DB::Get.
8357     */
8358     const uint size = kd.pack_index_tuple(table, m_pack_buffer,
8359                                           m_pk_packed_tuple, key, keypart_map);
8360     bool skip_lookup = is_blind_delete_enabled();
8361     rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, false, skip_lookup);
8362     if (!rc && !skip_lookup) {
8363       update_row_stats(ROWS_READ);
8364     }
8365     DBUG_RETURN(rc);
8366   }
8367 
8368   /*
8369     Unique secondary index performs lookups without the extended key fields
8370   */
8371   uint packed_size;
8372   if (active_index != table->s->primary_key &&
8373       table->key_info[active_index].flags & HA_NOSAME &&
8374       find_flag == HA_READ_KEY_EXACT && using_full_key) {
8375     key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
8376                                                    .user_defined_key_parts) -
8377                            1;
8378     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
8379                                       key, tmp_map);
8380     if (table->key_info[active_index].user_defined_key_parts !=
8381         kd.get_key_parts()) {
8382       using_full_key = false;
8383     }
8384 
8385     if (m_insert_with_update && m_dup_key_found &&
8386         active_index == m_dupp_errkey) {
8387       /*
8388         We are in INSERT ... ON DUPLICATE KEY UPDATE, and this is a read
8389         that SQL layer does to read the duplicate key.
8390         Its rowid is saved in m_last_rowkey. Get the full record and return it.
8391       */
8392 
8393       assert(m_dup_key_retrieved_record.length() >= packed_size);
8394       assert(memcmp(m_dup_key_retrieved_record.ptr(), m_sk_packed_tuple,
8395                          packed_size) == 0);
8396 
8397       rc = get_row_by_rowid(buf, m_last_rowkey.ptr(), m_last_rowkey.length());
8398       DBUG_RETURN(rc);
8399     }
8400 
8401   } else {
8402     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
8403                                       key, keypart_map);
8404   }
8405 
8406   if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
8407       (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
8408     /*
8409       We are doing a point index lookup, and ICP is enabled. It is possible
8410       that this call will be followed by ha_rocksdb->index_next_same() call.
8411 
8412       Do what InnoDB does: save the lookup tuple now. We will need it in
8413       index_next_same/find_icp_matching_index_rec in order to stop scanning
8414       as soon as index record doesn't match the lookup tuple.
8415 
      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
8419     */
8420     m_sk_match_prefix = m_sk_match_prefix_buf;
8421     m_sk_match_length = packed_size;
8422     memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
8423   }
8424 
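  /*
    Note (inferred from how calc_eq_cond_len() uses bytes_changed_by_succ):
    kd.successor() turns the packed tuple into the smallest key that compares
    greater than every key starting with the original prefix, and returns the
    number of trailing bytes it modified; those bytes are subtracted again
    when the equal-condition length is computed.
  */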
8425   int bytes_changed_by_succ = 0;
8426   if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
8427       find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
8428     /* See below */
8429     bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
8430   }
8431 
8432   rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
8433                        packed_size);
8434 
8435   uint end_key_packed_size = 0;
8436   const uint eq_cond_len =
8437       calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, end_key,
8438                        &end_key_packed_size);
8439 
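  /*
    keypart_map has one bit set per key part supplied by the caller, so the
    whole key is in use only when its popcount equals the number of key parts
    in the index.
  */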
8440   bool use_all_keys = false;
8441   if (find_flag == HA_READ_KEY_EXACT &&
8442       my_count_bits(keypart_map) == kd.get_key_parts()) {
8443     use_all_keys = true;
8444   }
8445 
8446   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
8447   const bool is_new_snapshot = !tx->has_snapshot();
8448   // Loop as long as we get a deadlock error AND we end up creating the
8449   // snapshot here (i.e. it did not exist prior to this)
8450   for (;;) {
8451     DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan");
8452     if (thd && thd->killed) {
8453       rc = HA_ERR_QUERY_INTERRUPTED;
8454       break;
8455     }
8456     /*
8457       This will open the iterator and position it at a record that's equal or
8458       greater than the lookup tuple.
8459     */
8460     setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);
8461 
8462     /*
8463       Once we are positioned on from above, move to the position we really
8464       want: See storage/rocksdb/rocksdb-range-access.txt
8465     */
8466     bool move_forward;
8467     rc =
8468         position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
8469                                 slice, &move_forward, tx->m_snapshot_timestamp);
8470 
8471     if (rc) {
8472       break;
8473     }
8474 
8475     m_skip_scan_it_next_call = false;
8476 
8477     /*
8478       Now get the data for the row into 'buf'.  If we were using a primary key
8479       then we have all the rows we need.  For a secondary key we now need to
8480       lookup the primary key.
8481     */
8482     if (active_index == table->s->primary_key) {
8483       rc = read_row_from_primary_key(buf);
8484     } else {
8485       rc = read_row_from_secondary_key(buf, kd, move_forward);
8486     }
8487 
8488     if (!should_recreate_snapshot(rc, is_new_snapshot)) {
8489       break; /* Exit the loop */
8490     }
8491 
8492     // release the snapshot and iterator so they will be regenerated
8493     tx->release_snapshot();
8494     release_scan_iterator();
8495   }
8496 
8497   if (rc) {
8498     /*
8499       This status is returned on any error
8500       the only possible error condition is record-not-found
8501     */
8502     table->status = STATUS_NOT_FOUND;
8503   } else {
8504     table->status = 0;
8505     update_row_stats(ROWS_READ);
8506   }
8507 
8508   DBUG_RETURN(rc);
8509 }
8510 
8511 /*
8512   @brief
8513   Scan the secondary index until we find an index record that satisfies ICP
8514 
8515   @param move_forward   true  <=> move m_scan_it forward
8516                         false <=> move m_scan_it backward
8517   @param buf            Record buffer (must be the same buffer that
8518                         pushed index condition points to, in practice
8519                         it is table->record[0])
8520 
8521   @detail
8522   Move the current iterator m_scan_it until we get an index tuple that
8523   satisfies the pushed Index Condition.
8524   (if there is no pushed index condition, return right away)
8525 
8526   @return
8527     0     - Index tuple satisfies ICP, can do index read.
8528     other - error code
8529 */
8530 
8531 int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward,
8532                                             uchar *const buf) {
8533   assert(buf != nullptr);
8534 
8535   if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
8536     const Rdb_key_def &kd = *m_key_descr_arr[active_index];
8537     THD *thd = ha_thd();
8538 
8539     while (1) {
8540       int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);
8541       if (rc != HA_EXIT_SUCCESS) {
8542         return rc;
8543       }
8544 
8545       if (thd && thd->killed) {
8546         return HA_ERR_QUERY_INTERRUPTED;
8547       }
8548 
8549       if (!is_valid(m_scan_it)) {
8550         table->status = STATUS_NOT_FOUND;
8551         return HA_ERR_END_OF_FILE;
8552       }
8553       const rocksdb::Slice rkey = m_scan_it->key();
8554 
8555       if (!kd.covers_key(rkey)) {
8556         table->status = STATUS_NOT_FOUND;
8557         return HA_ERR_END_OF_FILE;
8558       }
8559 
8560       if (m_sk_match_prefix) {
8561         const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
8562                                     m_sk_match_length);
8563         if (!kd.value_matches_prefix(rkey, prefix)) {
8564           table->status = STATUS_NOT_FOUND;
8565           return HA_ERR_END_OF_FILE;
8566         }
8567       }
8568 
8569       const rocksdb::Slice value = m_scan_it->value();
8570       int err = kd.unpack_record(table, buf, &rkey, &value,
8571                                  m_converter->get_verify_row_debug_checksums());
8572       if (err != HA_EXIT_SUCCESS) {
8573         return err;
8574       }
8575 
8576       const enum icp_result icp_status = check_index_cond();
8577       if (icp_status == ICP_NO_MATCH) {
8578         rocksdb_smart_next(!move_forward, m_scan_it);
8579         continue; /* Get the next (or prev) index tuple */
8580       } else if (icp_status == ICP_OUT_OF_RANGE) {
8581         /* We have walked out of range we are scanning */
8582         table->status = STATUS_NOT_FOUND;
8583         return HA_ERR_END_OF_FILE;
8584       } else /* icp_status == ICP_MATCH */
8585       {
8586         /* Index Condition is satisfied. We have rc==0, proceed to fetch the
8587          * row. */
8588         break;
8589       }
8590     }
8591   }
8592   return HA_EXIT_SUCCESS;
8593 }
8594 
8595 /**
8596    @return
8597     HA_EXIT_SUCCESS  OK
8598     other            HA_ERR error code (can be SE-specific)
8599 */
8600 int ha_rocksdb::index_read_last_map(uchar *const buf, const uchar *const key,
8601                                     key_part_map keypart_map) {
8602   DBUG_ENTER_FUNC();
8603 
8604   check_build_decoder();
8605 
8606   DBUG_RETURN(index_read_map(buf, key, keypart_map, HA_READ_PREFIX_LAST));
8607 }
8608 
8609 /**
8610    @return
8611     HA_ADMIN_OK      OK
8612     other            HA_ADMIN error code
8613 */
8614 int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
8615   DBUG_ENTER_FUNC();
8616 
8617   assert(thd != nullptr);
8618   assert(check_opt != nullptr);
8619 
8620   const uint pk = pk_index(table, m_tbl_def);
8621   String rowkey_copy;
8622   String sec_key_copy;
8623   const char *const table_name = table->s->table_name.str;
8624 
8625   // Only when debugging: don't use snapshot when reading
8626   // Rdb_transaction *tx= get_or_create_tx(table->in_use);
8627   // tx->snapshot= nullptr;
8628 
8629   bool save_verify_row_debug_checksums =
8630       m_converter->get_verify_row_debug_checksums();
8631   m_converter->set_verify_row_debug_checksums(true);
8632   /* For each secondary index, check that we can get a PK value from it */
8633   // NO_LINT_DEBUG
8634   sql_print_information("CHECKTABLE %s: Checking table %s", table_name,
8635                         table_name);
8636   ha_rows row_checksums_at_start = 0;  // set/used iff first_index==true
8637   ha_rows row_checksums = ha_rows(-1);
8638   bool first_index = true;
8639 
8640   for (uint keyno = 0; keyno < table->s->keys; keyno++) {
8641     if (keyno != pk) {
8642       extra(HA_EXTRA_KEYREAD);
8643       ha_index_init(keyno, true);
8644       ha_rows rows = 0;
8645       ha_rows checksums = 0;
8646       if (first_index) {
8647         row_checksums_at_start = m_converter->get_row_checksums_checked();
8648       }
8649       int res;
8650       // NO_LINT_DEBUG
8651       sql_print_information("CHECKTABLE %s:   Checking index %s", table_name,
8652                             table->key_info[keyno].name);
8653       while (1) {
8654         if (!rows) {
8655           res = index_first(table->record[0]);
8656         } else {
8657           res = index_next(table->record[0]);
8658         }
8659 
8660         if (res == HA_ERR_END_OF_FILE) break;
8661         if (res) {
8662           // error
8663           // NO_LINT_DEBUG
8664           sql_print_error("CHECKTABLE %s:   .. row %lld: index scan error %d",
8665                           table_name, rows, res);
8666           goto error;
8667         }
8668         rocksdb::Slice key = m_scan_it->key();
8669         sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
8670         rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
8671                          &my_charset_bin);
8672 
8673         if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
8674                 m_scan_it->value())) {
8675           checksums++;
8676         }
8677 
8678         if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
8679                                     rowkey_copy.length()))) {
8680           // NO_LINT_DEBUG
8681           sql_print_error(
8682               "CHECKTABLE %s:   .. row %lld: "
8683               "failed to fetch row by rowid",
8684               table_name, rows);
8685           goto error;
8686         }
8687 
8688         longlong hidden_pk_id = 0;
8689         if (has_hidden_pk(table) &&
8690             read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
8691           goto error;
8692         }
8693 
8694         /* Check if we get the same PK value */
8695         uint packed_size = m_pk_descr->pack_record(
8696             table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
8697             false, hidden_pk_id);
8698         if (packed_size != rowkey_copy.length() ||
8699             memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
8700           // NO_LINT_DEBUG
8701           sql_print_error("CHECKTABLE %s:   .. row %lld: PK value mismatch",
8702                           table_name, rows);
8703           goto print_and_error;
8704         }
8705 
8706         /* Check if we get the same secondary key value */
8707         packed_size = m_key_descr_arr[keyno]->pack_record(
8708             table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
8709             &m_sk_tails, false, hidden_pk_id);
8710         if (packed_size != sec_key_copy.length() ||
8711             memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
8712           // NO_LINT_DEBUG
8713           sql_print_error(
8714               "CHECKTABLE %s:   .. row %lld: "
8715               "secondary index value mismatch",
8716               table_name, rows);
8717           goto print_and_error;
8718         }
8719         rows++;
8720         continue;
8721 
8722       print_and_error : {
8723         std::string buf;
8724         buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
8725                           RDB_MAX_HEXDUMP_LEN);
8726         // NO_LINT_DEBUG
8727         sql_print_error("CHECKTABLE %s:   rowkey: %s", table_name, buf.c_str());
8728 
8729         buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
8730                           RDB_MAX_HEXDUMP_LEN);
8731         // NO_LINT_DEBUG
8732         sql_print_error("CHECKTABLE %s:   record: %s", table_name, buf.c_str());
8733 
8734         buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
8735                           RDB_MAX_HEXDUMP_LEN);
8736         // NO_LINT_DEBUG
8737         sql_print_error("CHECKTABLE %s:   index: %s", table_name, buf.c_str());
8738 
8739         goto error;
8740       }
8741       }
8742       // NO_LINT_DEBUG
8743       sql_print_information(
8744           "CHECKTABLE %s:   ... %lld index entries checked "
8745           "(%lld had checksums)",
8746           table_name, rows, checksums);
8747 
8748       if (first_index) {
8749         row_checksums =
8750             m_converter->get_row_checksums_checked() - row_checksums_at_start;
8751         first_index = false;
8752       }
8753       ha_index_end();
8754     }
8755   }
8756   if (row_checksums != ha_rows(-1)) {
8757     // NO_LINT_DEBUG
8758     sql_print_information("CHECKTABLE %s:   %lld table records had checksums",
8759                           table_name, row_checksums);
8760   }
8761   extra(HA_EXTRA_NO_KEYREAD);
8762 
8763   m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
8764   /*
8765     TODO: we should check also for PK records that are missing in the secondary
8766     indexes.
8767     For that, need to walk through the PK and check that every PK record has a
8768     proper counterpart in each secondary index.
8769   */
8770   DBUG_RETURN(HA_ADMIN_OK);
8771 error:
8772   m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
8773   ha_index_or_rnd_end();
8774   extra(HA_EXTRA_NO_KEYREAD);
8775 
8776   DBUG_RETURN(HA_ADMIN_CORRUPT);
8777 }
8778 
8779 static void dbug_dump_str(FILE *const out, const char *const str, int len) {
8780   fprintf(out, "\"");
8781   for (int i = 0; i < len; i++) {
8782     if (str[i] > 32) {
8783       fprintf(out, "%c", str[i]);
8784     } else {
8785       fprintf(out, "\\%d", str[i]);
8786     }
8787   }
8788   fprintf(out, "\"");
8789 }
8790 
8791 /*
8792   Debugging help: dump the whole database into a human-readable file.
8793   Usage:
8794     dbug_dump_database(rdb);
8795 */
8796 
8797 void dbug_dump_database(rocksdb::DB *const db) {
8798   FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
8799   if (!out) return;
8800 
8801   rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
8802   for (it->SeekToFirst(); it->Valid(); it->Next()) {
8803     rocksdb::Slice key = it->key();
8804     rocksdb::Slice val = it->value();
8805     dbug_dump_str(out, key.data(), key.size());
8806     fprintf(out, " -> ");
8807     dbug_dump_str(out, val.data(), val.size());
8808     fprintf(out, "\n");
8809   }
8810 
8811   delete it;
8812   fclose(out);
8813 }
8814 
8815 rocksdb::Status ha_rocksdb::get_for_update(
8816     Rdb_transaction *const tx, const Rdb_key_def &key_descr,
8817     const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
8818   assert(m_lock_rows != RDB_LOCK_NONE);
8819 
8820   bool exclusive = m_lock_rows != RDB_LOCK_READ;
8821   bool do_validate = my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED;
8822   rocksdb::Status s =
8823       tx->get_for_update(key_descr, key, value, exclusive, do_validate);
8824 
8825 #ifndef NDEBUG
8826   ++rocksdb_num_get_for_update_calls;
8827 #endif
8828   return s;
8829 }
8830 
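/*
  "Blind delete" means deleting by primary key without reading the row first.
  It is only considered when the blind_delete_primary_key session variable is
  set, the statement is a single-table DELETE, the table's only key is a
  non-hidden primary key, and we are not applying replicated events. See the
  skip_lookup handling in get_row_by_rowid().
*/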
8831 bool ha_rocksdb::is_blind_delete_enabled() {
8832   THD *thd = ha_thd();
8833   return (THDVAR(thd, blind_delete_primary_key) &&
8834           thd->lex->sql_command == SQLCOM_DELETE &&
8835           thd->lex->table_count == 1 && table->s->keys == 1 &&
8836           !has_hidden_pk(table) && !thd->rli_slave);
8837 }
8838 
8839 /*
8840   Given a rowid (i.e. packed PK) as a parameter, get the record.
8841 
8842   @return
8843     HA_EXIT_SUCCESS  OK
8844     other            HA_ERR error code (can be SE-specific)
8845 */
8846 
8847 int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
8848                                  const uint rowid_size,
8849                                  const bool skip_ttl_check,
8850                                  const bool skip_lookup) {
8851   DBUG_ENTER_FUNC();
8852 
8853   assert(buf != nullptr);
8854   assert(rowid != nullptr);
8855   assert(table != nullptr);
8856 
8857   int rc;
8858 
8859   rocksdb::Slice key_slice(rowid, rowid_size);
8860 
8861   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
8862   assert(tx != nullptr);
8863 
8864   DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
8865   DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
8866     THD *thd = ha_thd();
8867     static constexpr char act[] =
8868         "now signal Reached "
8869         "wait_for signal.rocksdb.get_row_by_rowid_let_running";
8870     assert(opt_debug_sync_timeout > 0);
8871     assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
8872   };);
8873 
8874   bool found;
8875   rocksdb::Status s;
8876 
8877   /* Pretend row found without looking up */
8878   if (skip_lookup) {
8879     update_row_stats(ROWS_DELETED_BLIND);
8880     m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
8881     table->status = 0;
8882     DBUG_RETURN(0);
8883   }
8884 
8885   if (m_lock_rows == RDB_LOCK_NONE) {
8886     tx->acquire_snapshot(true);
8887     s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
8888   } else if (m_insert_with_update && m_dup_key_found &&
8889              m_pk_descr->get_keyno() == m_dupp_errkey) {
8890     assert(m_dup_key_retrieved_record.length() ==
8891                 m_retrieved_record.size());
8892     assert(memcmp(m_dup_key_retrieved_record.ptr(),
8893                        m_retrieved_record.data(),
8894                        m_retrieved_record.size()) == 0);
8895 
    // do nothing - we already have the result in m_retrieved_record and
    // have already taken the lock
8898     s = rocksdb::Status::OK();
8899   } else {
8900     s = get_for_update(tx, *m_pk_descr, key_slice, &m_retrieved_record);
8901   }
8902 
8903   DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
8904                   dbug_change_status_to_corrupted(&s););
8905 
8906   if (!s.IsNotFound() && !s.ok()) {
8907     DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
8908   }
8909   found = !s.IsNotFound();
8910 
8911   table->status = STATUS_NOT_FOUND;
8912   if (found) {
8913     /* If we found the record, but it's expired, pretend we didn't find it.  */
8914     if (!skip_ttl_check && m_pk_descr->has_ttl() &&
8915         should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
8916                             tx->m_snapshot_timestamp)) {
8917       DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
8918     }
8919 
8920     m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
8921     rc = convert_record_from_storage_format(&key_slice, buf);
8922 
8923     if (!rc) {
8924       table->status = 0;
8925     }
8926   } else {
8927     /*
8928       Note: we don't need to unlock the row. It is intentional that we keep
8929       locks on rows that don't exist.
8930     */
8931     rc = HA_ERR_KEY_NOT_FOUND;
8932   }
8933 
8934   DBUG_RETURN(rc);
8935 }
8936 
8937 /**
8938   @return
8939     HA_EXIT_SUCCESS  OK
8940     other            HA_ERR error code (can be SE-specific)
8941 */
8942 int ha_rocksdb::index_next(uchar *const buf) {
8943   DBUG_ENTER_FUNC();
8944 
8945   check_build_decoder();
8946 
8947   bool moves_forward = true;
8948   ha_statistic_increment(&SSV::ha_read_next_count);
8949   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
8950     moves_forward = false;
8951   }
8952 
8953   int rc = index_next_with_direction(buf, moves_forward);
8954   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
8955 
8956   DBUG_RETURN(rc);
8957 }
8958 
8959 /**
8960   @return
8961     HA_EXIT_SUCCESS  OK
8962     other            HA_ERR error code (can be SE-specific)
8963 */
8964 int ha_rocksdb::index_prev(uchar *const buf) {
8965   DBUG_ENTER_FUNC();
8966 
8967   check_build_decoder();
8968 
8969   bool moves_forward = false;
8970   ha_statistic_increment(&SSV::ha_read_prev_count);
8971   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
8972     moves_forward = true;
8973   }
8974 
8975   int rc = index_next_with_direction(buf, moves_forward);
8976   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
8977 
8978   DBUG_RETURN(rc);
8979 }
8980 
8981 int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
8982   DBUG_ENTER_FUNC();
8983 
8984   int rc;
8985 
8986   if (active_index == pk_index(table, m_tbl_def)) {
8987     rc = rnd_next_with_direction(buf, move_forward);
8988   } else {
8989     THD *thd = ha_thd();
8990     for (;;) {
8991       DEBUG_SYNC(thd, "rocksdb.check_flags_inwd");
8992       if (thd && thd->killed) {
8993         rc = HA_ERR_QUERY_INTERRUPTED;
8994         break;
8995       }
8996       if (m_skip_scan_it_next_call) {
8997         m_skip_scan_it_next_call = false;
8998       } else if (!m_scan_it->Valid()) {
8999         DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
9000       } else {
9001         if (move_forward) {
9002           m_scan_it->Next(); /* this call cannot fail */
9003         } else {
9004           m_scan_it->Prev();
9005         }
9006       }
9007       rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index],
9008                                         m_scan_it, !move_forward);
9009       if (rc != HA_EXIT_SUCCESS) {
9010         break;
9011       }
9012       rc = find_icp_matching_index_rec(move_forward, buf);
9013       if (!rc) rc = secondary_index_read(active_index, buf);
9014       if (!should_skip_invalidated_record(rc)) {
9015         break;
9016       }
9017     }
9018   }
9019 
9020   DBUG_RETURN(rc);
9021 }
9022 
9023 /**
9024   @return
9025     HA_EXIT_SUCCESS  OK
9026     other            HA_ERR error code (can be SE-specific)
9027 */
9028 int ha_rocksdb::index_first(uchar *const buf) {
9029   DBUG_ENTER_FUNC();
9030 
9031   check_build_decoder();
9032 
9033   m_sk_match_prefix = nullptr;
9034   ha_statistic_increment(&SSV::ha_read_first_count);
9035   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9036                ? index_last_intern(buf)
9037                : index_first_intern(buf);
9038   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9039 
9040   DBUG_RETURN(rc);
9041 }
9042 
9043 /**
9044   @return
9045     HA_EXIT_SUCCESS  OK
9046     other            HA_ERR error code (can be SE-specific)
9047 */
9048 int ha_rocksdb::index_last(uchar *const buf) {
9049   DBUG_ENTER_FUNC();
9050 
9051   check_build_decoder();
9052 
9053   m_sk_match_prefix = nullptr;
9054   ha_statistic_increment(&SSV::ha_read_last_count);
9055   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9056                ? index_first_intern(buf)
9057                : index_last_intern(buf);
9058   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9059 
9060   DBUG_RETURN(rc);
9061 }
9062 
9063 /*
9064   Start scanning from the "first" value.
9065 
9066   The 'first' here means "the first from start of the key space".
9067   For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
9068 
  A picture of a forward-ordered keyspace (remember, the keys have the form
9070   'indexnr-keyval'. Suppose the index we are at has number n)
9071 
9072       (n-1) - ...
9073       ( n )          <--- 1. (n) doesn't exist in the db but it would be here.
9074       ( n ) - aaa       <--- 2. Seek("n") will put us here on the first index
9075       ( n ) - bbb               record.
9076       ( n ) - cc
9077 
9078   So, need to do: Seek(n);
9079 
9080   A backward-ordered keyspace:
9081 
9082       (n+1) - bbb
9083       (n+1) - aaa
9084       (n+1)        <--- (n+1) doesn't exist in the db but would be here.
9085       ( n ) - ccc       <--- 1. We need to be here.
9086       ( n ) - bbb
9087       ( n ) - aaa
9088       ( n )
9089 
9090   So, need to: Seek(n+1);
9091 
9092 */
9093 
9094 int ha_rocksdb::index_first_intern(uchar *const buf) {
9095   DBUG_ENTER_FUNC();
9096 
9097   assert(buf != nullptr);
9098 
9099   uchar *key;
9100   uint key_size;
9101   int rc;
9102 
9103   if (is_pk(active_index, table, m_tbl_def)) {
9104     key = m_pk_packed_tuple;
9105   } else {
9106     key = m_sk_packed_tuple;
9107   }
9108 
9109   assert(key != nullptr);
9110 
9111   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
9112   int key_start_matching_bytes = kd.get_first_key(key, &key_size);
9113 
9114   rocksdb::Slice index_key((const char *)key, key_size);
9115 
9116   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9117   assert(tx != nullptr);
9118 
9119   const bool is_new_snapshot = !tx->has_snapshot();
9120   // Loop as long as we get a deadlock error AND we end up creating the
9121   // snapshot here (i.e. it did not exist prior to this)
9122   for (;;) {
9123     setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
9124     m_scan_it->Seek(index_key);
9125     m_skip_scan_it_next_call = true;
9126 
9127     rc = index_next_with_direction(buf, true);
9128     if (!should_recreate_snapshot(rc, is_new_snapshot)) {
9129       break; /* exit the loop */
9130     }
9131 
9132     // release the snapshot and iterator so they will be regenerated
9133     tx->release_snapshot();
9134     release_scan_iterator();
9135   }
9136 
9137   DBUG_RETURN(rc);
9138 }
9139 
9140 /**
9141   @details
9142   Start scanning from the "last" value
9143 
9144   The 'last' here means "the last from start of the key space".
9145   For reverse-ordered key spaces, we will actually read the smallest value.
9146 
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose we are at a key that has number n)
9149 
9150      (n-1)-something
9151      ( n )-aaa
9152      ( n )-bbb
9153      ( n )-ccc            <----------- Need to seek to here.
9154      (n+1)      <---- Doesn't exist, but would be here.
9155      (n+1)-smth, or no value at all
9156 
9157    RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's
9158    smaller". We can't seek to "(n)-ccc" directly, because we don't know what
9159    is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
9160    to "(n+1)", which is the least possible value that's greater than any value
9161    in index #n.
9162 
9163    So, need to:  it->SeekForPrev(n+1)
9164 
9165    A backward-ordered keyspace:
9166 
9167       (n+1)-something
9168       ( n ) - ccc
9169       ( n ) - bbb
9170       ( n ) - aaa       <---------------- (*) Need to seek here.
9171       ( n ) <--- Doesn't exist, but would be here.
9172       (n-1)-smth, or no value at all
9173 
9174    So, need to:  it->SeekForPrev(n)
9175 */
9176 
9177 int ha_rocksdb::index_last_intern(uchar *const buf) {
9178   DBUG_ENTER_FUNC();
9179 
9180   assert(buf != nullptr);
9181 
9182   uchar *key;
9183   uint key_size;
9184   int rc;
9185 
9186   if (is_pk(active_index, table, m_tbl_def)) {
9187     key = m_pk_packed_tuple;
9188   } else {
9189     key = m_sk_packed_tuple;
9190   }
9191 
9192   assert(key != nullptr);
9193 
9194   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
9195   int key_end_matching_bytes = kd.get_last_key(key, &key_size);
9196 
9197   rocksdb::Slice index_key((const char *)key, key_size);
9198 
9199   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9200   assert(tx != nullptr);
9201 
9202   const bool is_new_snapshot = !tx->has_snapshot();
9203   // Loop as long as we get a deadlock error AND we end up creating the
9204   // snapshot here (i.e. it did not exist prior to this)
9205   for (;;) {
9206     setup_scan_iterator(kd, &index_key, false, key_end_matching_bytes);
9207     m_scan_it->SeekForPrev(index_key);
9208     m_skip_scan_it_next_call = false;
9209 
9210     if (is_pk(active_index, table, m_tbl_def)) {
9211       m_skip_scan_it_next_call = true;
9212       rc = rnd_next_with_direction(buf, false);
9213     } else {
9214       rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
9215       if (!rc) rc = secondary_index_read(active_index, buf);
9216     }
9217 
9218     if (!should_recreate_snapshot(rc, is_new_snapshot)) {
9219       break; /* exit the loop */
9220     }
9221 
9222     // release the snapshot and iterator so they will be regenerated
9223     tx->release_snapshot();
9224     release_scan_iterator();
9225   }
9226 
9227   DBUG_RETURN(rc);
9228 }
9229 
9230 void ha_rocksdb::unlock_row() {
9231   DBUG_ENTER_FUNC();
9232 
9233   if (m_lock_rows != RDB_LOCK_NONE) {
9234     Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9235     tx->release_lock(*m_pk_descr,
9236                      std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
9237   }
9238 
9239   DBUG_VOID_RETURN;
9240 }
9241 
9242 /*
  Returns true if SingleDelete can be used.
9244   - Secondary Indexes can always use SingleDelete.
9245   - If the index is PRIMARY KEY, and if all of the columns of the table
9246     are covered by the PRIMARY KEY, SingleDelete can be used.
9247 */
9248 bool ha_rocksdb::can_use_single_delete(const uint index) const {
9249   return (index != pk_index(table, m_tbl_def) ||
9250           (!has_hidden_pk(table) &&
9251            table->key_info[index].actual_key_parts == table->s->fields));
9252 }
9253 
9254 bool ha_rocksdb::skip_unique_check() const {
9255   /*
9256     We want to skip unique checks if:
9257       1) bulk_load is on
      2) the user set the unique_checks option to 0, and the table does not
         have any secondary indexes. If the table had secondary keys, those
         might become inconsistent/corrupted
9261       3) We're using read-free replication
9262   */
9263   return THDVAR(table->in_use, bulk_load) ||
9264          (my_core::thd_test_options(table->in_use,
9265                                     OPTION_RELAXED_UNIQUE_CHECKS) &&
9266           m_tbl_def->m_key_count == 1) ||
9267          use_read_free_rpl();
9268 }
9269 
9270 bool ha_rocksdb::commit_in_the_middle() {
9271   return THDVAR(table->in_use, bulk_load) ||
9272          THDVAR(table->in_use, commit_in_the_middle);
9273 }
9274 
9275 /*
  Execute a bulk commit if one is due.
9277   @retval true if bulk commit failed
9278   @retval false if bulk commit was skipped or succeeded
9279 */
9280 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
9281   assert(tx != nullptr);
9282   return commit_in_the_middle() &&
9283          tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
9284          tx->flush_batch();
9285 }
9286 
9287 /*
  If a table was created without a primary key, the SQL layer represents the
  primary key number as MAX_INDEXES.  Hence, this function returns true if
  the table does not contain a primary key. (In that case we generate a
  hidden 'auto-incremented' pk.)
9292 */
9293 bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
9294   assert(table != nullptr);
9295   return Rdb_key_def::table_has_hidden_pk(table);
9296 }
9297 
9298 /*
9299   Returns true if given index number is a hidden_pk.
9300   - This is used when a table is created with no primary key.
9301 */
9302 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
9303                               const Rdb_tbl_def *const tbl_def_arg) {
9304   assert(table_arg != nullptr);
9305   assert(table_arg->s != nullptr);
9306   assert(tbl_def_arg != nullptr);
9307 
9308   return (table_arg->s->primary_key == MAX_INDEXES &&
9309           index == tbl_def_arg->m_key_count - 1);
9310 }
9311 
9312 /* Returns index of primary key */
9313 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
9314                           const Rdb_tbl_def *const tbl_def_arg) {
9315   assert(table_arg != nullptr);
9316   assert(table_arg->s != nullptr);
9317   assert(tbl_def_arg != nullptr);
9318 
9319   return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
9320                                                   : table_arg->s->primary_key;
9321 }
9322 
9323 /* Returns true if given index number is a primary key */
9324 bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
9325                        const Rdb_tbl_def *const tbl_def_arg) {
9326   assert(table_arg != nullptr);
9327   assert(table_arg->s != nullptr);
9328   assert(tbl_def_arg != nullptr);
9329 
9330   return index == table_arg->s->primary_key ||
9331          is_hidden_pk(index, table_arg, tbl_def_arg);
9332 }
9333 
9334 uint ha_rocksdb::max_supported_key_part_length(HA_CREATE_INFO *) const {
9335   DBUG_ENTER_FUNC();
9336   DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE
9337                                    : MAX_INDEX_COL_LEN_SMALL);
9338 }
9339 
9340 const char *ha_rocksdb::get_key_name(const uint index,
9341                                      const TABLE *const table_arg,
9342                                      const Rdb_tbl_def *const tbl_def_arg) {
9343   assert(table_arg != nullptr);
9344   assert(tbl_def_arg != nullptr);
9345 
9346   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9347     return HIDDEN_PK_NAME;
9348   }
9349 
9350   assert(table_arg->key_info != nullptr);
9351   assert(table_arg->key_info[index].name != nullptr);
9352 
9353   return table_arg->key_info[index].name;
9354 }
9355 
9356 const char *ha_rocksdb::get_key_comment(const uint index,
9357                                         const TABLE *const table_arg,
9358                                         const Rdb_tbl_def *const tbl_def_arg) {
9359   assert(table_arg != nullptr);
9360   assert(tbl_def_arg != nullptr);
9361 
9362   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9363     return nullptr;
9364   }
9365 
9366   assert(table_arg->key_info != nullptr);
9367 
9368   return table_arg->key_info[index].comment.str;
9369 }
9370 
9371 const std::string ha_rocksdb::generate_cf_name(
9372     const uint index, const TABLE *const table_arg,
9373     const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found) {
9374   assert(table_arg != nullptr);
9375   assert(tbl_def_arg != nullptr);
9376   assert(per_part_match_found != nullptr);
9377 
  // When creating CFs the caller needs to know if there was a custom CF name
  // specified for a given partition.
9380   *per_part_match_found = false;
9381 
  // The index comment is used to define the column family name
  // specification(s). If there was no comment, we get an empty string, which
  // means "use the default column family".
9385   const char *const comment = get_key_comment(index, table_arg, tbl_def_arg);
9386 
  // `get_key_comment` can return `nullptr`, so guard against that here.
9388   std::string key_comment = comment ? comment : "";
9389 
9390   std::string cf_name = Rdb_key_def::parse_comment_for_qualifier(
9391       key_comment, table_arg, tbl_def_arg, per_part_match_found,
9392       RDB_CF_NAME_QUALIFIER);
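  // Illustrative note (the exact qualifier syntax is defined by
  // Rdb_key_def::parse_comment_for_qualifier()): an index comment such as
  // "cfname=cf_foo" is expected to select a custom column family, and on
  // partitioned tables the qualifier can be prefixed with a partition name
  // to pick a column family per partition.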
9393 
9394   if (table_arg->part_info != nullptr && !*per_part_match_found) {
9395     // At this point we tried to search for a custom CF name for a partition,
9396     // but none was specified. Therefore default one will be used.
9397     return "";
9398   }
9399 
  // If we didn't find any partitioned/non-partitioned qualifiers, return the
  // comment itself.  NOTE: this currently handles returning the cf name
  // specified in the index comment in the case of no partitions, which doesn't
  // use any qualifiers at the moment (i.e. it is a special case).
9404   if (cf_name.empty() && !key_comment.empty()) {
9405     return key_comment;
9406   }
9407 
9408   return cf_name;
9409 }
9410 
9411 const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) {
9412   assert(table_arg != nullptr);
9413   assert(table_arg->s != nullptr);
9414 
9415   return table_arg->s->comment.str;
9416 }
9417 
9418 /**
9419   Write a new row
9420 
9421   @param[in] buf                new row data to write
9422   @return
9423     HA_EXIT_SUCCESS  OK
9424     other            HA_ERR error code (can be SE-specific)
9425 */
9426 int ha_rocksdb::write_row(uchar *const buf) {
9427   DBUG_ENTER_FUNC();
9428 
9429   assert(buf != nullptr);
9430   assert(buf == table->record[0]);
9431   assert(m_lock_rows == RDB_LOCK_WRITE);
9432 
9433   ha_statistic_increment(&SSV::ha_write_count);
9434   /*
9435     Note: "buf == table->record[0]" is copied from innodb. I am not aware of
9436     any use cases where this condition is not true.
9437   */
9438   if (table->next_number_field && buf == table->record[0]) {
9439     int err;
9440     if ((err = update_auto_increment())) {
9441       DBUG_RETURN(err);
9442     }
9443   }
9444 
9445   // clear cache at beginning of write for INSERT ON DUPLICATE
9446   // we may get multiple write->fail->read->update if there are multiple
9447   // values from INSERT
9448   m_dup_key_found = false;
9449 
9450   const int rv = update_write_row(nullptr, buf, skip_unique_check());
9451 
9452   if (rv == 0) {
9453 
    // Not protected by the ddl_manager lock for performance
    // reasons. This is an estimated value anyway.
9456     inc_table_n_rows();
9457     update_table_stats_if_needed();
9458 
9459     update_row_stats(ROWS_INSERTED);
9460   }
9461 
9462   DBUG_RETURN(rv);
9463 }
9464 
9465 // Increment the number of rows in the table by one.
9466 // This operation is not protected by ddl manager lock.
9467 // The number is estimated.
9468 void ha_rocksdb::inc_table_n_rows() {
9469   if (!rocksdb_table_stats_use_table_scan) {
9470     return;
9471   }
9472 
9473   uint64 n_rows = m_tbl_def->m_tbl_stats.m_stat_n_rows;
9474   if (n_rows < std::numeric_limits<ulonglong>::max()) {
9475     m_tbl_def->m_tbl_stats.m_stat_n_rows = n_rows + 1;
9476   }
9477 }
9478 
9479 // Decrement the number of rows in the table by one.
9480 // This operation is not protected by ddl manager lock.
9481 // The number is estimated.
9482 void ha_rocksdb::dec_table_n_rows() {
9483   if (!rocksdb_table_stats_use_table_scan) {
9484     return;
9485   }
9486 
9487   uint64 n_rows = m_tbl_def->m_tbl_stats.m_stat_n_rows;
9488   if (n_rows > 0) {
9489     m_tbl_def->m_tbl_stats.m_stat_n_rows = n_rows - 1;
9490   }
9491 }
9492 
9493 /**
9494   Constructing m_last_rowkey (MyRocks key expression) from
9495   before_update|delete image (MySQL row expression).
9496   m_last_rowkey is normally set during lookup phase, such as
9497   rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
9498   these read functions are skipped and update_rows(), delete_rows() are
9499   called without setting m_last_rowkey. This function sets m_last_rowkey
9500   for Read Free Replication.
9501 */
9502 void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
9503 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
9504   if (old_data && use_read_free_rpl()) {
9505     const int old_pk_size = m_pk_descr->pack_record(
9506         table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
9507     m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
9508                        &my_charset_bin);
9509   }
9510 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
9511 }
9512 
9513 /**
9514   Collect update data for primary key
9515 
9516   @param[in, out] row_info            hold all data for update row, such as
9517                                       new row data/old row data
9518   @return
9519     HA_EXIT_SUCCESS  OK
9520     other            HA_ERR error code (can be SE-specific)
9521 */
9522 int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
9523   int size;
9524 
9525   /*
9526     Get new row key for any insert, and any update where the pk is not hidden.
9527     Row key for updates with hidden pk is handled below.
9528   */
9529   if (!has_hidden_pk(table)) {
9530     row_info->hidden_pk_id = 0;
9531 
9532     row_info->new_pk_unpack_info = &m_pk_unpack_info;
9533 
9534     size = m_pk_descr->pack_record(
9535         table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple,
9536         row_info->new_pk_unpack_info, false, 0, 0, nullptr);
9537   } else if (row_info->old_data == nullptr) {
9538     row_info->hidden_pk_id = update_hidden_pk_val();
9539     size =
9540         m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
9541   } else {
9542     /*
9543       If hidden primary key, rowkey for new record will always be the same as
9544       before
9545     */
9546     size = row_info->old_pk_slice.size();
9547     memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
9548     int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
9549     if (err) {
9550       return err;
9551     }
9552   }
9553 
9554   row_info->new_pk_slice =
9555       rocksdb::Slice((const char *)m_pk_packed_tuple, size);
9556 
9557   return HA_EXIT_SUCCESS;
9558 }
9559 
9560 /**
   Check that the specified primary key value is unique and also lock the row
9562 
9563   @param[in] key_id           key index
9564   @param[in] row_info         hold all data for update row, such as old row
9565                               data and new row data
9566   @param[out] found           whether the primary key exists before.
9567   @param[out] pk_changed      whether primary key is changed
9568   @return
9569     HA_EXIT_SUCCESS  OK
9570     other            HA_ERR error code (can be SE-specific)
9571 */
9572 int ha_rocksdb::check_and_lock_unique_pk(const uint key_id,
9573                                          const struct update_row_info &row_info,
9574                                          bool *const found,
9575                                          const bool skip_unique_check) {
9576   assert(found != nullptr);
9577 
9578   assert(row_info.old_pk_slice.size() == 0 ||
9579               row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0);
9580 
9581   const bool ignore_pk_unique_check = skip_unique_check;
9582 
9583   /*
9584     Perform a read to determine if a duplicate entry exists. For primary
9585     keys, a point lookup will be sufficient.
9586 
9587     note: we intentionally don't set options.snapshot here. We want to read
9588     the latest committed data.
9589   */
9590 
  /*
    To prevent race conditions like the ones below, it is necessary to
    take a lock on the target row. get_for_update() holds a gap lock if
    the target key does not exist, so the scenarios below should never
    happen.
9596 
9597     1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
9598        -> T2 Put(overwrite) -> T2 commit
9599     2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
9600        -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
9601   */
9602   const rocksdb::Status s =
9603       get_for_update(row_info.tx, *m_pk_descr, row_info.new_pk_slice,
9604                      ignore_pk_unique_check ? nullptr : &m_retrieved_record);
9605   if (!s.ok() && !s.IsNotFound()) {
9606     return row_info.tx->set_status_error(table->in_use, s,
9607                                          *m_key_descr_arr[key_id], m_tbl_def);
9608   }
9609 
9610   bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound();
9611 
  /*
    If the PK has a TTL, we may need to pretend the row wasn't
    found if it has already expired.
  */
9616   if (key_found && m_pk_descr->has_ttl() &&
9617       should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
9618                           (row_info.tx->m_snapshot_timestamp
9619                                ? row_info.tx->m_snapshot_timestamp
9620                                : static_cast<int64_t>(std::time(nullptr))))) {
9621     key_found = false;
9622   }
9623 
9624   if (key_found && row_info.old_data == nullptr && m_insert_with_update) {
    // In the INSERT ... ON DUPLICATE KEY UPDATE case, if the insert failed
    // due to a duplicate key, remember the last key and skip the check
    // next time
9628     m_dup_key_found = true;
9629 
9630 #ifndef NDEBUG
9631     // save it for sanity checking later
9632     m_dup_key_retrieved_record.copy(m_retrieved_record.data(),
9633                                     m_retrieved_record.size(), &my_charset_bin);
9634 #endif
9635   }
9636 
9637   *found = key_found;
9638 
9639   return HA_EXIT_SUCCESS;
9640 }
9641 
/**
  Check that the specified secondary key value is unique, and lock the row

  @param[in] key_id             key index
  @param[in] row_info           holds all data for the row being updated,
                                such as the old and new row images
  @param[out] found             whether the specified key value already exists
  @param[in] skip_unique_check  whether the uniqueness check can be skipped
  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
9653 int ha_rocksdb::check_and_lock_sk(const uint key_id,
9654                                   const struct update_row_info &row_info,
9655                                   bool *const found,
9656                                   const bool skip_unique_check) {
9657   assert(found != nullptr);
9658   *found = false;
9659 
9660   /*
9661     Can skip checking this key if none of the key fields have changed.
9662   */
9663   if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
9664     return HA_EXIT_SUCCESS;
9665   }
9666 
9667   KEY *key_info = nullptr;
9668   uint n_null_fields = 0;
9669   uint user_defined_key_parts = 1;
9670 
9671   key_info = &table->key_info[key_id];
9672   user_defined_key_parts = key_info->user_defined_key_parts;
9673   /*
9674     If there are no uniqueness requirements, there's no need to obtain a
9675     lock for this key.
9676   */
9677   if (!(key_info->flags & HA_NOSAME)) {
9678     return HA_EXIT_SUCCESS;
9679   }
9680 
9681   const Rdb_key_def &kd = *m_key_descr_arr[key_id];
9682 
9683   /*
9684     Calculate the new key for obtaining the lock
9685 
9686     For unique secondary indexes, the key used for locking does not
9687     include the extended fields.
9688   */
9689   int size =
9690       kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
9691                      nullptr, false, 0, user_defined_key_parts, &n_null_fields);
9692   if (n_null_fields > 0) {
    /*
      If any fields are NULL, this key can never match another row, since
      NULL never compares equal to anything, including another NULL.
    */
9697     return HA_EXIT_SUCCESS;
9698   }
9699 
9700   const rocksdb::Slice new_slice =
9701       rocksdb::Slice((const char *)m_sk_packed_tuple, size);
9702 
9703   /*
9704      Acquire lock on the old key in case of UPDATE
9705   */
9706   if (row_info.old_data != nullptr) {
9707     size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
9708                           m_sk_packed_tuple_old, nullptr, false, 0,
9709                           user_defined_key_parts);
9710     const rocksdb::Slice old_slice =
9711         rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);
9712 
9713     const rocksdb::Status s =
9714         get_for_update(row_info.tx, kd, old_slice, nullptr);
9715     if (!s.ok()) {
9716       return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
9717     }
9718 
    /*
      If the old and new keys are the same, we're done, since we've already
      taken a lock on the old key.
    */
9723     if (!new_slice.compare(old_slice)) {
9724       return HA_EXIT_SUCCESS;
9725     }
9726   }
9727 
  /*
    Perform a read to determine if a duplicate entry exists - since this is
    a secondary index, a range scan is needed.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */
9735 
9736   const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());
9737 
9738   /*
    This iterator is relatively expensive since we need to allocate and free
    memory for each unique index that is checked.
9741 
9742     If this needs to be optimized, for keys without NULL fields, the
9743     extended primary key fields can be migrated to the value portion of the
9744     key. This enables using Get() instead of Seek() as in the primary key
9745     case.
9746 
9747     The bloom filter may need to be disabled for this lookup.
9748   */
9749   uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9750   uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9751   rocksdb::Slice lower_bound_slice;
9752   rocksdb::Slice upper_bound_slice;
9753 
9754   const bool total_order_seek = !check_bloom_and_set_bounds(
9755       ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE,
9756       lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
9757   const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
9758 
9759   const rocksdb::Status s = get_for_update(row_info.tx, kd, new_slice, nullptr);
9760   if (!s.ok() && !s.IsNotFound()) {
9761     return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
9762   }
9763 
9764   rocksdb::Iterator *const iter = row_info.tx->get_iterator(
9765       kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice,
9766       upper_bound_slice, true /* read current data */,
9767       false /* acquire snapshot */);
9768   /*
9769     Need to scan the transaction to see if there is a duplicate key.
9770     Also need to scan RocksDB and verify the key has not been deleted
9771     in the transaction.
9772   */
9773   *found = !read_key_exact(kd, iter, all_parts_used, new_slice,
9774                            row_info.tx->m_snapshot_timestamp);
9775 
9776   int rc = HA_EXIT_SUCCESS;
9777 
9778   if (*found && m_insert_with_update) {
9779     const rocksdb::Slice &rkey = iter->key();
9780     uint pk_size =
9781         kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
9782     if (pk_size == RDB_INVALID_KEY_LEN) {
9783       rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
9784     } else {
9785       m_dup_key_found = true;
9786       m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
9787                          &my_charset_bin);
9788 #ifndef NDEBUG
9789       // save it for sanity checking later
9790       m_dup_key_retrieved_record.copy(rkey.data(), rkey.size(),
9791                                       &my_charset_bin);
9792 #endif
9793     }
9794   }
9795 
9796   delete iter;
9797   return rc;
9798 }
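
/*
  Illustrative note (an assumption added for clarity, not from the original
  source): a unique secondary key on (a) is stored with the primary key
  appended as an "extended" tail, e.g. (a, pk). Two rows with the same value
  of `a` but different PKs therefore have different full keys, which is why
  check_and_lock_sk() above packs and seeks on only the user-defined key
  parts; read_key_exact() then reports a duplicate if any existing entry
  matches on those parts.
*/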
9799 
/**
   Enumerate all keys to check their uniqueness and lock the corresponding rows

  @param[in] row_info           holds all data for the row being updated, such
                                as the old and new row images
  @param[in] pk_changed         whether the primary key has changed
  @param[in] skip_unique_check  whether the uniqueness check can be skipped
  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
9810 int ha_rocksdb::check_uniqueness_and_lock(
9811     const struct update_row_info &row_info, bool pk_changed,
9812     bool skip_unique_check) {
9813   /*
9814     Go through each index and determine if the index has uniqueness
9815     requirements. If it does, then try to obtain a row lock on the new values.
9816     Once all locks have been obtained, then perform the changes needed to
9817     update/insert the row.
9818   */
9819   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
9820     bool found;
9821     int rc;
9822 
9823     if (is_pk(key_id, table, m_tbl_def)) {
9824       if (row_info.old_pk_slice.size() > 0 && !pk_changed) {
9825         found = false;
9826         rc = HA_EXIT_SUCCESS;
9827       } else {
9828         rc = check_and_lock_unique_pk(key_id, row_info, &found,
9829                                       skip_unique_check);
9830         DEBUG_SYNC(ha_thd(), "rocksdb.after_unique_pk_check");
9831       }
9832     } else {
9833       rc = check_and_lock_sk(key_id, row_info, &found, skip_unique_check);
9834       DEBUG_SYNC(ha_thd(), "rocksdb.after_unique_sk_check");
9835     }
9836 
9837     if (rc != HA_EXIT_SUCCESS) {
9838       return rc;
9839     }
9840 
9841     if (found) {
9842       /* There is a row with this key already, so error out. */
9843       errkey = key_id;
9844       m_dupp_errkey = errkey;
9845 
9846       return HA_ERR_FOUND_DUPP_KEY;
9847     }
9848   }
9849 
9850   return HA_EXIT_SUCCESS;
9851 }
9852 
/**
  Check whether the secondary key value is a duplicate of the previous one

  @param[in] table_arg         the table currently being worked on
  @param[in] key_def           the key definition being checked
  @param[in] key               secondary key storage data
  @param[out] sk_info          holds the secondary key memcmp data (new/old)
  @return
    1                the key is a duplicate of the previous key
    0                otherwise
*/
9864 
9865 int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
9866                                    const Rdb_key_def &key_def,
9867                                    const rocksdb::Slice *key,
9868                                    struct unique_sk_buf_info *sk_info) {
9869   uint n_null_fields = 0;
9870   const rocksdb::Comparator *index_comp = key_def.get_cf()->GetComparator();
9871 
9872   /* Get proper SK buffer. */
9873   uchar *sk_buf = sk_info->swap_and_get_sk_buf();
9874 
9875   /* Get memcmp form of sk without extended pk tail */
9876   uint sk_memcmp_size =
9877       key_def.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
9878 
9879   sk_info->sk_memcmp_key =
9880       rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
9881 
9882   if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
9883       index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
9884           0) {
9885     return 1;
9886   }
9887 
9888   sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
9889   return 0;
9890 }
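
/*
  Illustrative sketch (an assumption added for clarity, not from the original
  source): check_duplicate_sk() relies on its input arriving in memcmp (index)
  order, e.g. while an inplace ALTER builds a unique secondary key from
  merge-sorted output. Because the stream is sorted, any duplicates are
  adjacent, so comparing each key's memcmp form (without the extended PK tail)
  against the previously seen one is sufficient:

    keys in memcmp order:  'aaa'  'aaa'  'abc' ...
                                   ^ equal to the previous key
                                     -> reported as a duplicate (returns 1)
*/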
9891 
9892 int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
9893                               const rocksdb::Slice &key,
9894                               const rocksdb::Slice &value, bool sort) {
9895   DBUG_ENTER_FUNC();
9896   int res;
9897   THD *thd = ha_thd();
9898   if (thd && thd->killed) {
9899     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
9900   }
9901 
9902   rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
9903 
9904   // In the case of unsorted inserts, m_sst_info allocated here is not
9905   // used to store the keys. It is still used to indicate when tables
9906   // are switched.
9907   if (m_sst_info == nullptr || m_sst_info->is_done()) {
9908     m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
9909                                       kd.get_name(), cf, *rocksdb_db_options,
9910                                       THDVAR(ha_thd(), trace_sst_api)));
9911     res = tx->start_bulk_load(this, m_sst_info);
9912     if (res != HA_EXIT_SUCCESS) {
9913       DBUG_RETURN(res);
9914     }
9915   }
9916   assert(m_sst_info);
9917 
9918   if (sort) {
9919     Rdb_index_merge *key_merge;
9920     assert(cf != nullptr);
9921 
9922     res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
9923     if (res == HA_EXIT_SUCCESS) {
9924       res = key_merge->add(key, value);
9925     }
9926   } else {
9927     res = m_sst_info->put(key, value);
9928   }
9929 
9930   DBUG_RETURN(res);
9931 }
9932 
9933 int ha_rocksdb::finalize_bulk_load(bool print_client_error) {
9934   DBUG_ENTER_FUNC();
9935 
9936   int res = HA_EXIT_SUCCESS;
9937 
9938   /* Skip if there are no possible ongoing bulk loads */
9939   if (m_sst_info) {
9940     if (m_sst_info->is_done()) {
9941       m_sst_info.reset();
9942       DBUG_RETURN(res);
9943     }
9944 
9945     Rdb_sst_info::Rdb_sst_commit_info commit_info;
9946 
    // Wrap up the current work in m_sst_info and get ready to commit.
    // This transfers the responsibility for the commit over to commit_info.
9949     res = m_sst_info->finish(&commit_info, print_client_error);
9950     if (res == 0) {
      // Make sure we have work to do - under a race condition we could lose
      // to another thread and end up with no work
9953       if (commit_info.has_work()) {
9954         rocksdb::IngestExternalFileOptions opts;
9955         opts.move_files = true;
9956         opts.snapshot_consistency = false;
9957         opts.allow_global_seqno = false;
9958         opts.allow_blocking_flush = false;
9959 
9960         const rocksdb::Status s = rdb->IngestExternalFile(
9961             commit_info.get_cf(), commit_info.get_committed_files(), opts);
9962         if (!s.ok()) {
9963           if (print_client_error) {
9964             Rdb_sst_info::report_error_msg(s, nullptr);
9965           }
9966           res = HA_ERR_ROCKSDB_BULK_LOAD;
9967         } else {
9968           // Mark the list of SST files as committed, otherwise they'll get
9969           // cleaned up when commit_info destructs
9970           commit_info.commit();
9971         }
9972       }
9973     }
9974     m_sst_info.reset();
9975   }
9976   DBUG_RETURN(res);
9977 }
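
/*
  Illustrative usage (an assumed example, not from the original source):
  bulk_load_key() and finalize_bulk_load() back the bulk load path that a
  client enables through session variables, roughly:

    SET SESSION rocksdb_bulk_load_allow_unsorted = 1;  -- optional
    SET SESSION rocksdb_bulk_load = 1;
    LOAD DATA INFILE 'big.tsv' INTO TABLE t1;
    SET SESSION rocksdb_bulk_load = 0;  -- finalizes the load

  While rocksdb_bulk_load is enabled (and rocksdb_enable_bulk_load_api is on
  and the table has a user-defined PK), primary key writes go to SST files via
  an SstFileWriter instead of the memtable; turning the variable off finalizes
  the load and ingests the files into the column family.
*/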
9978 
/**
  Update an existing primary key record or write a new primary key record

  @param[in] kd                the primary key definition being
                               updated/written
  @param[in] row_info          holds all row data, such as the old and new
                               row images
  @param[in] pk_changed        whether the primary key has changed
  @return
    HA_EXIT_SUCCESS OK
    Other           HA_ERR error code (can be SE-specific)
 */
9990 int ha_rocksdb::update_write_pk(const Rdb_key_def &kd,
9991                                 const struct update_row_info &row_info,
9992                                 const bool pk_changed) {
9993   const uint key_id = kd.get_keyno();
9994   const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
9995 
  /*
    If the PK has changed, or if this PK uses single deletes and this is an
    update, the old key needs to be deleted. In the single delete case,
    failing to do so could produce the key sequence PUT(X), PUT(X), SD(X),
    which would leave the first PUT(X) visible.
  */
10002   if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
10003                                     can_use_single_delete(key_id)))) {
10004     const rocksdb::Status s = delete_or_singledelete(
10005         key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
10006     if (!s.ok()) {
10007       return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
10008     }
10009   }
10010 
10011   if (table->found_next_number_field) {
10012     update_auto_incr_val_from_field();
10013   }
10014 
10015   int rc = HA_EXIT_SUCCESS;
10016   rocksdb::Slice value_slice;
10017   /* Prepare the new record to be written into RocksDB */
10018   if ((rc = m_converter->encode_value_slice(
10019            m_pk_descr, row_info.new_pk_slice, row_info.new_pk_unpack_info,
10020            !row_info.old_pk_slice.empty(), should_store_row_debug_checksums(),
10021            m_ttl_bytes, &m_ttl_bytes_updated, &value_slice))) {
10022     return rc;
10023   }
10024 
10025   const auto cf = m_pk_descr->get_cf();
10026   if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
10027       !hidden_pk) {
10028     /*
10029       Write the primary key directly to an SST file using an SstFileWriter
10030      */
10031     rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
10032                        THDVAR(table->in_use, bulk_load_allow_unsorted));
10033   } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
10034     /*
      It is the responsibility of the user to make sure that the data being
10036       inserted doesn't violate any unique keys.
10037     */
10038     row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
10039                                                 value_slice);
10040   } else {
10041     const bool assume_tracked = can_assume_tracked(ha_thd());
10042     const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice,
10043                                     assume_tracked);
10044     if (!s.ok()) {
10045       if (s.IsBusy()) {
10046         errkey = table->s->primary_key;
10047         m_dupp_errkey = errkey;
10048         rc = HA_ERR_FOUND_DUPP_KEY;
10049       } else {
10050         rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
10051                                            m_tbl_def);
10052       }
10053     }
10054   }
10055 
10056   return rc;
10057 }
10058 
/**
  Update an existing secondary key record or write a new secondary key record

  @param[in] table_arg    Table we're working on
  @param[in] kd           The secondary key definition being updated/written
  @param[in] row_info     data structure containing the old and new row data
  @param[in] bulk_load_sk whether bulk loading is allowed for this secondary
                          key; currently only supported for writes (inserts)
  @return
    HA_EXIT_SUCCESS OK
    Other           HA_ERR error code (can be SE-specific)
 */
10071 int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
10072                                 const Rdb_key_def &kd,
10073                                 const struct update_row_info &row_info,
10074                                 const bool bulk_load_sk) {
10075   int new_packed_size;
10076   int old_packed_size;
10077   int rc = HA_EXIT_SUCCESS;
10078 
10079   rocksdb::Slice new_key_slice;
10080   rocksdb::Slice new_value_slice;
10081   rocksdb::Slice old_key_slice;
10082 
10083   const uint key_id = kd.get_keyno();
10084   /*
10085     Can skip updating this key if none of the key fields have changed and, if
10086     this table has TTL, the TTL timestamp has not changed.
10087   */
10088   if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
10089       (!kd.has_ttl() || !m_ttl_bytes_updated)) {
10090     return HA_EXIT_SUCCESS;
10091   }
10092 
10093   bool store_row_debug_checksums = should_store_row_debug_checksums();
10094   new_packed_size =
10095       kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
10096                      m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
10097                      row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);
10098 
10099   if (row_info.old_data != nullptr) {
10100     // The old value
10101     old_packed_size = kd.pack_record(
10102         table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
10103         &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
10104         nullptr, m_ttl_bytes);
10105 
10106     /*
10107       Check if we are going to write the same value. This can happen when
10108       one does
10109         UPDATE tbl SET col='foo'
10110       and we are looking at the row that already has col='foo'.
10111 
      We also need to compare the unpack info. Suppose the collation is
10113       case-insensitive, and unpack info contains information about whether
10114       the letters were uppercase and lowercase.  Then, both 'foo' and 'FOO'
10115       will have the same key value, but different data in unpack_info.
10116 
10117       (note: anyone changing bytewise_compare should take this code into
10118       account)
10119     */
10120     if (old_packed_size == new_packed_size &&
10121         m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
10122         !(kd.has_ttl() && m_ttl_bytes_updated) &&
10123         memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
10124             0 &&
10125         memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
10126                m_sk_tails.get_current_pos()) == 0) {
10127       return HA_EXIT_SUCCESS;
10128     }
10129 
    /*
      Deleting entries from the secondary index should skip locking, but the
      deletions must still be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
       statement)
    */
10136     old_key_slice = rocksdb::Slice(
10137         reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
10138 
10139     row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10140                                                          old_key_slice);
10141   }
10142 
10143   new_key_slice = rocksdb::Slice(
10144       reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
10145   new_value_slice =
10146       rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
10147                      m_sk_tails.get_current_pos());
10148 
10149   if (bulk_load_sk && row_info.old_data == nullptr) {
10150     rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
10151   } else {
10152     row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
10153                                                 new_value_slice);
10154   }
10155 
10156   return rc;
10157 }
10158 
/**
   Update existing index entries (PK/SKs) or write new ones (PK/SKs)

   @param[in] row_info    holds all row data, such as the old and new keys
   @param[in] pk_changed  whether the primary key has changed
   @return
     HA_EXIT_SUCCESS OK
     Other           HA_ERR error code (can be SE-specific)
 */
10168 int ha_rocksdb::update_write_indexes(const struct update_row_info &row_info,
10169                                      const bool pk_changed) {
10170   int rc;
10171   bool bulk_load_sk;
10172 
10173   // The PK must be updated first to pull out the TTL value.
10174   rc = update_write_pk(*m_pk_descr, row_info, pk_changed);
10175   if (rc != HA_EXIT_SUCCESS) {
10176     return rc;
10177   }
10178 
  // Update the remaining indexes. Allow bulk loading of secondary keys only
  // if bulk_load_allow_sk is enabled.
10181   bulk_load_sk = rocksdb_enable_bulk_load_api &&
10182                  THDVAR(table->in_use, bulk_load) &&
10183                  THDVAR(table->in_use, bulk_load_allow_sk);
10184   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
10185     if (is_pk(key_id, table, m_tbl_def)) {
10186       continue;
10187     }
10188 
10189     rc = update_write_sk(table, *m_key_descr_arr[key_id], row_info,
10190                          bulk_load_sk);
10191     if (rc != HA_EXIT_SUCCESS) {
10192       return rc;
10193     }
10194   }
10195 
10196   return HA_EXIT_SUCCESS;
10197 }
10198 
10199 /**
10200   Update an existing row or write a new row
10201 
10202   @param[in] old_data           nullptr for write, non-null for update
10203   @param[in] new_data           non-null for write/update
  @param[in] skip_unique_check  whether to skip the uniqueness check
10205   @return
10206     HA_EXIT_SUCCESS OK
10207     Other           HA_ERR error code (can be SE-specific)
10208  */
10209 int ha_rocksdb::update_write_row(const uchar *const old_data,
10210                                  const uchar *const new_data,
10211                                  const bool skip_unique_check) {
10212   DBUG_ENTER_FUNC();
10213 
10214   THD *thd = ha_thd();
10215   if (thd && thd->killed) {
10216     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10217   }
10218 
10219   bool pk_changed = false;
10220   struct update_row_info row_info;
10221 
10222   row_info.old_data = old_data;
10223   row_info.new_data = new_data;
10224   row_info.skip_unique_check = skip_unique_check;
10225   row_info.new_pk_unpack_info = nullptr;
10226   set_last_rowkey(old_data);
10227 
10228   row_info.tx = get_or_create_tx(table->in_use);
10229 
10230   if (old_data != nullptr) {
10231     row_info.old_pk_slice =
10232         rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10233 
10234     /* Determine which indexes need updating. */
10235     calc_updated_indexes();
10236   }
10237 
10238   /*
10239     Get the new row key into row_info.new_pk_slice
10240    */
10241   int rc = get_pk_for_update(&row_info);
10242   if (rc != HA_EXIT_SUCCESS) {
10243     DBUG_RETURN(rc);
10244   }
10245 
10246   /*
10247     For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
10248     always require locking.
10249   */
10250   if (row_info.old_pk_slice.size() > 0) {
10251     pk_changed = row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0;
10252   }
10253 
  // Case: we skip both unique checks and row locks only when bulk load is
  // enabled or rocksdb_skip_locks_if_skip_unique_check is ON
10256   if (!THDVAR(table->in_use, bulk_load) &&
10257       (!rocksdb_skip_locks_if_skip_unique_check || !skip_unique_check)) {
10258     /*
10259       Check to see if we are going to have failures because of unique
10260       keys.  Also lock the appropriate key values.
10261     */
10262     rc = check_uniqueness_and_lock(row_info, pk_changed, skip_unique_check);
10263     if (rc != HA_EXIT_SUCCESS) {
10264       DBUG_RETURN(rc);
10265     }
10266   }
10267 
10268   DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");
10269 
10270   /*
10271     At this point, all locks have been obtained, and all checks for duplicate
10272     keys have been performed. No further errors can be allowed to occur from
10273     here because updates to the transaction will be made and those updates
10274     cannot be easily removed without rolling back the entire transaction.
10275   */
10276   rc = update_write_indexes(row_info, pk_changed);
10277   if (rc != HA_EXIT_SUCCESS) {
10278     DBUG_RETURN(rc);
10279   }
10280 
10281   row_info.tx->log_table_write_op(m_tbl_def);
10282 
10283   if (do_bulk_commit(row_info.tx)) {
10284     DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10285   }
10286 
10287   DBUG_RETURN(HA_EXIT_SUCCESS);
10288 }
10289 
/*
 Set iterator upper/lower bounds for Seek/SeekForPrev.
 This lets RocksDB avoid scanning tombstones outside of the given key ranges
 when prefix_same_as_start=true was not passed (i.e. when the prefix bloom
 filter cannot be used).
 Inverting the upper/lower bounds is necessary on a reverse-order CF.
 This covers the HA_READ_PREFIX_LAST* cases as well. For example, if the
 given equality condition was the 12 bytes 0x0000b3eb003f65c5e78858b8 and we
 are doing HA_READ_PREFIX_LAST, eq_cond_len is 11 (see calc_eq_cond_len()
 for details). If the index is in reverse order, the upper bound would be
 0x0000b3eb003f65c5e78857 and the lower bound would be
 0x0000b3eb003f65c5e78859. These cover the given equality condition range.
*/
10304 void ha_rocksdb::setup_iterator_bounds(
10305     const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len,
10306     uchar *const lower_bound, uchar *const upper_bound,
10307     rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
10308   // If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be
10309   // able to get better bounds just by using index id directly.
10310   if (eq_cond.size() <= Rdb_key_def::INDEX_NUMBER_SIZE) {
10311     assert(bound_len == Rdb_key_def::INDEX_NUMBER_SIZE);
10312     uint size;
10313     kd.get_infimum_key(lower_bound, &size);
10314     assert(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10315     kd.get_supremum_key(upper_bound, &size);
10316     assert(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10317   } else {
10318     assert(bound_len <= eq_cond.size());
10319     memcpy(upper_bound, eq_cond.data(), bound_len);
10320     kd.successor(upper_bound, bound_len);
10321     memcpy(lower_bound, eq_cond.data(), bound_len);
10322     kd.predecessor(lower_bound, bound_len);
10323   }
10324 
10325   if (kd.m_is_reverse_cf) {
10326     *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10327     *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10328   } else {
10329     *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10330     *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10331   }
10332 }
10333 
10334 /*
10335   Open a cursor
10336 */
10337 
10338 void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
10339                                      rocksdb::Slice *const slice,
10340                                      const bool use_all_keys,
10341                                      const uint eq_cond_len) {
10342   assert(slice != nullptr);
10343   assert(slice->size() >= eq_cond_len);
10344 
10345   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10346 
10347   bool skip_bloom = true;
10348 
10349   const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
10350   // The size of m_scan_it_lower_bound (and upper) is technically
10351   // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers.  Rather
10352   // than recalculating that number, we pass in the max of eq_cond_len and
10353   // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
10354   // max_packed_sk_len, hence ensuring no buffer overrun.
10355   //
10356   // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
10357   // used.
10358   if (check_bloom_and_set_bounds(
10359           ha_thd(), kd, eq_cond, use_all_keys,
10360           std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_NUMBER_SIZE),
10361           m_scan_it_lower_bound, m_scan_it_upper_bound,
10362           &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
10363     skip_bloom = false;
10364   }
10365 
  /*
    In some cases, setup_scan_iterator() is called multiple times from
    the same query, but the bloom filter cannot always be used.
    Consider the following query, where id2 is VARCHAR(30) and the PRIMARY
    KEY is (id1, id2):
      select count(*) from t2 WHERE id1=100 and id2 IN
        ('00000000000000000000', '100');
    In this case, setup_scan_iterator() is called twice: the first time for
    (id1, id2)=(100, '00000000000000000000') and the second time for
    (100, '100').
    If the prefix bloom filter length is 24 bytes, the prefix bloom filter
    can be used for the first condition but not for the second one.
    If the bloom filter condition changes, currently it is necessary to
    destroy and re-create the iterator.
  */
10383   if (m_scan_it_skips_bloom != skip_bloom) {
10384     release_scan_iterator();
10385   }
10386 
10387   /*
10388     SQL layer can call rnd_init() multiple times in a row.
10389     In that case, re-use the iterator, but re-position it at the table start.
10390   */
10391   if (!m_scan_it) {
10392     const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
10393     if (commit_in_the_middle()) {
10394       assert(m_scan_it_snapshot == nullptr);
10395       m_scan_it_snapshot = rdb->GetSnapshot();
10396 
10397       auto read_opts = rocksdb::ReadOptions();
10398       read_opts.total_order_seek = true;  // TODO: set based on WHERE conditions
10399       read_opts.snapshot = m_scan_it_snapshot;
10400       m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
10401     } else {
10402       m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
10403                                    m_scan_it_lower_bound_slice,
10404                                    m_scan_it_upper_bound_slice);
10405     }
10406     m_scan_it_skips_bloom = skip_bloom;
10407   }
10408 }
10409 
10410 void ha_rocksdb::release_scan_iterator() {
10411   delete m_scan_it;
10412   m_scan_it = nullptr;
10413 
10414   if (m_scan_it_snapshot) {
10415     rdb->ReleaseSnapshot(m_scan_it_snapshot);
10416     m_scan_it_snapshot = nullptr;
10417   }
10418 }
10419 
10420 void ha_rocksdb::setup_iterator_for_rnd_scan() {
10421   uint key_size;
10422 
10423   int key_start_matching_bytes =
10424       m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
10425 
10426   rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
10427 
10428   setup_scan_iterator(*m_pk_descr, &table_key, false, key_start_matching_bytes);
10429   m_scan_it->Seek(table_key);
10430   m_skip_scan_it_next_call = true;
10431 }
10432 
10433 /**
10434   @return
10435     HA_EXIT_SUCCESS  OK
10436     other            HA_ERR error code (can be SE-specific)
10437 */
10438 int ha_rocksdb::rnd_init(bool scan) {
10439   DBUG_ENTER_FUNC();
10440 
10441   m_need_build_decoder = true;
10442   active_index = table->s->primary_key;
10443 
10444   THD *thd = ha_thd();
10445   if (thd && thd->killed) {
10446     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10447   }
10448 
10449   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10450 
10451   if (scan) {
10452     m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
10453     setup_iterator_for_rnd_scan();
10454   } else {
10455     /* We don't need any preparations for rnd_pos() calls. */
10456   }
10457 
  // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
  // when accessing the index, so don't acquire the snapshot right away.
  // Otherwise acquire the snapshot immediately.
10461   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10462 
10463   DBUG_RETURN(HA_EXIT_SUCCESS);
10464 }
10465 
10466 /**
10467   @return
10468     HA_EXIT_SUCCESS  OK
10469     other            HA_ERR error code (can be SE-specific)
10470 */
10471 int ha_rocksdb::rnd_next(uchar *const buf) {
10472   DBUG_ENTER_FUNC();
10473 
10474   check_build_decoder();
10475 
10476   int rc;
10477   ha_statistic_increment(&SSV::ha_read_rnd_next_count);
10478   for (;;) {
10479     rc = rnd_next_with_direction(buf, true);
10480     if (!should_recreate_snapshot(rc, m_rnd_scan_is_new_snapshot)) {
10481       break; /* exit the loop */
10482     }
10483     // release the snapshot and iterator and then regenerate them
10484     Rdb_transaction *tx = get_or_create_tx(table->in_use);
10485     tx->release_snapshot();
10486     release_scan_iterator();
10487     setup_iterator_for_rnd_scan();
10488   }
10489 
10490   m_rnd_scan_is_new_snapshot = false;
10491 
10492   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
10493 
10494   DBUG_RETURN(rc);
10495 }
10496 
10497 /*
10498   See also secondary_index_read().
10499 */
10500 int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
10501   DBUG_ENTER_FUNC();
10502 
10503   int rc;
10504   THD *thd = ha_thd();
10505 
10506   table->status = STATUS_NOT_FOUND;
10507 
10508   if (!m_scan_it || !is_valid(m_scan_it)) {
10509     /*
10510       We can get here when SQL layer has called
10511 
10512         h->index_init(PRIMARY);
10513         h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
10514 
10515       In this case, we should return EOF.
10516     */
10517     DBUG_RETURN(HA_ERR_END_OF_FILE);
10518   }
10519 
10520   for (;;) {
10521     DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd");
10522     if (thd && thd->killed) {
10523       rc = HA_ERR_QUERY_INTERRUPTED;
10524       break;
10525     }
10526 
10527     if (m_skip_scan_it_next_call) {
10528       m_skip_scan_it_next_call = false;
10529     } else {
10530       if (move_forward) {
10531         m_scan_it->Next(); /* this call cannot fail */
10532       } else {
10533         m_scan_it->Prev(); /* this call cannot fail */
10534       }
10535     }
10536 
10537     if (!is_valid(m_scan_it)) {
10538       rc = HA_ERR_END_OF_FILE;
10539       break;
10540     }
10541 
10542     /* check if we're out of this table */
10543     const rocksdb::Slice key = m_scan_it->key();
10544     if (!m_pk_descr->covers_key(key)) {
10545       rc = HA_ERR_END_OF_FILE;
10546       break;
10547     }
10548 
10549     if (m_lock_rows != RDB_LOCK_NONE) {
10550       /*
10551         Lock the row we've just read.
10552 
10553         Now we call get_for_update which will 1) Take a lock and 2) Will fail
10554         if the row was deleted since the snapshot was taken.
10555       */
10556       Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10557       DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
10558 
10559       if (m_pk_descr->has_ttl() &&
10560           should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
10561                               tx->m_snapshot_timestamp)) {
10562         continue;
10563       }
10564 
10565       const rocksdb::Status s =
10566           get_for_update(tx, *m_pk_descr, key, &m_retrieved_record);
10567       if (s.IsNotFound() &&
10568           should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) {
10569         continue;
10570       }
10571 
10572       if (!s.ok()) {
10573         DBUG_RETURN(
10574             tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
10575       }
10576 
      // Since we called get_for_update(), use the value from that call rather
      // than from the iterator, as the iterator's value may be stale: we don't
      // have a snapshot when m_lock_rows is not RDB_LOCK_NONE.
10580       m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
10581       rc = convert_record_from_storage_format(&key, buf);
10582     } else {
10583       // Use the value from the iterator
10584       rocksdb::Slice value = m_scan_it->value();
10585 
10586       if (m_pk_descr->has_ttl() &&
10587           should_hide_ttl_rec(
10588               *m_pk_descr, value,
10589               get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
10590         continue;
10591       }
10592 
10593       m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
10594       rc = convert_record_from_storage_format(&key, &value, buf);
10595     }
10596 
10597     table->status = 0;
10598     break;
10599   }
10600 
10601   if (!rc) {
10602     update_row_stats(ROWS_READ);
10603   }
10604 
10605   DBUG_RETURN(rc);
10606 }
10607 
10608 int ha_rocksdb::rnd_end() {
10609   DBUG_ENTER_FUNC();
10610 
10611   m_need_build_decoder = false;
10612 
10613   release_scan_iterator();
10614 
10615   DBUG_RETURN(HA_EXIT_SUCCESS);
10616 }
10617 
10618 void ha_rocksdb::build_decoder() {
10619   m_converter->setup_field_decoders(table->read_set, active_index,
10620                                     m_keyread_only,
10621                                     m_lock_rows == RDB_LOCK_WRITE);
10622 }
10623 
10624 void ha_rocksdb::check_build_decoder() {
10625   if (m_need_build_decoder) {
10626     build_decoder();
10627     m_need_build_decoder = false;
10628   }
10629 }
10630 
10631 /**
10632   @return
10633     HA_EXIT_SUCCESS  OK
10634     other            HA_ERR error code (can be SE-specific)
10635 */
10636 int ha_rocksdb::index_init(uint idx, bool sorted) {
10637   DBUG_ENTER_FUNC();
10638 
10639   m_need_build_decoder = true;
10640   active_index = idx;
10641 
10642   THD *thd = ha_thd();
10643   if (thd && thd->killed) {
10644     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10645   }
10646 
10647   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10648   assert(tx != nullptr);
10649 
10650   // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
10651   // when accessing the index, so don't acquire the snapshot right away.
10652   // Otherwise acquire the snapshot immediately.
10653   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10654 
10655   DBUG_RETURN(HA_EXIT_SUCCESS);
10656 }
10657 
10658 /**
10659   @return
10660     HA_EXIT_SUCCESS      OK
10661 */
10662 int ha_rocksdb::index_end() {
10663   DBUG_ENTER_FUNC();
10664 
10665   m_need_build_decoder = false;
10666 
10667   release_scan_iterator();
10668 
10669   active_index = MAX_KEY;
10670   in_range_check_pushed_down = false;
10671 
10672   DBUG_RETURN(HA_EXIT_SUCCESS);
10673 }
10674 
10675 /**
10676   Called by the partition manager for truncating tables.
10677 
10678   @return
10679     HA_EXIT_SUCCESS  OK
10680     other            HA_ERR error code (can be SE-specific)
10681 */
10682 int ha_rocksdb::truncate() {
10683   DBUG_ENTER_FUNC();
10684 
10685   assert(m_tbl_def != nullptr);
10686 
  // Reset auto_increment_value to 1 if the auto-increment feature is enabled.
  // By default, the first valid value for auto_increment_value is 1.
10689   DBUG_RETURN(truncate_table(
10690       m_tbl_def, table,
10691       table->found_next_number_field ? 1 : 0 /* auto_increment_value */));
10692 }
10693 
/*
  Delete the row we've last read. The row is also passed as a parameter.

  @detail
    The caller guarantees that buf points to the row that was just read.
    The row is either table->record[0] or table->record[1].
    (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)".)

  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
10706 int ha_rocksdb::delete_row(const uchar *const buf) {
10707   DBUG_ENTER_FUNC();
10708 
10709   assert(buf != nullptr);
10710 
10711   ha_statistic_increment(&SSV::ha_delete_count);
10712   set_last_rowkey(buf);
10713 
10714   rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10715   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10716 
10717   const uint index = pk_index(table, m_tbl_def);
10718   rocksdb::Status s =
10719       delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
10720   if (!s.ok()) {
10721     DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
10722   }
10723 
10724   longlong hidden_pk_id = 0;
10725   if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) {
10726     int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
10727     if (err) {
10728       DBUG_RETURN(err);
10729     }
10730   }
10731 
10732   // Delete the record for every secondary index
10733   for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10734     if (!is_pk(i, table, m_tbl_def)) {
10735       int packed_size;
10736       const Rdb_key_def &kd = *m_key_descr_arr[i];
10737 
      // The unique key should be locked so that the behavior is
      // similar to InnoDB and conflicts are reduced. The key
      // used for locking does not include the extended fields.
10741       const KEY *key_info = &table->key_info[i];
10742       if (key_info->flags & HA_NOSAME) {
10743         uint user_defined_key_parts = key_info->user_defined_key_parts;
10744         uint n_null_fields = 0;
10745 
10746         packed_size = kd.pack_record(table, m_pack_buffer, buf,
10747                                      m_sk_packed_tuple, nullptr, false, 0,
10748                                      user_defined_key_parts, &n_null_fields);
10749 
10750         // NULL fields are considered unique, so no lock is needed
10751         if (n_null_fields == 0) {
10752           rocksdb::Slice sk_slice(
10753               reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
10754           const rocksdb::Status s = get_for_update(tx, kd, sk_slice, nullptr);
10755           if (!s.ok()) {
10756             DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def));
10757           }
10758         }
10759       }
10760 
10761       packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
10762                                    nullptr, false, hidden_pk_id);
10763       rocksdb::Slice secondary_key_slice(
10764           reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
10765       tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10766                                                   secondary_key_slice);
10767     }
10768   }
10769 
10770   tx->log_table_write_op(m_tbl_def);
10771 
10772   if (do_bulk_commit(tx)) {
10773     DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10774   }
10775 
  // Not protected by the ddl_manager lock for performance
  // reasons. This value is an estimate anyway.
10778   dec_table_n_rows();
10779   update_table_stats_if_needed();
10780   update_row_stats(ROWS_DELETED);
10781 
10782   DBUG_RETURN(HA_EXIT_SUCCESS);
10783 }
10784 
10785 rocksdb::Status ha_rocksdb::delete_or_singledelete(
10786     uint index, Rdb_transaction *const tx,
10787     rocksdb::ColumnFamilyHandle *const column_family,
10788     const rocksdb::Slice &key) {
10789   const bool assume_tracked = can_assume_tracked(ha_thd());
10790   if (can_use_single_delete(index)) {
10791     return tx->single_delete(column_family, key, assume_tracked);
10792   }
10793   return tx->delete_key(column_family, key, assume_tracked);
10794 }
10795 
10796 void ha_rocksdb::update_stats(void) {
10797   DBUG_ENTER_FUNC();
10798 
10799   stats.records = 0;
10800   stats.index_file_length = 0ul;
10801   stats.data_file_length = 0ul;
10802   stats.mean_rec_length = 0;
10803 
10804   for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10805     if (is_pk(i, table, m_tbl_def)) {
10806       stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
10807       stats.records = m_pk_descr->m_stats.m_rows;
10808     } else {
10809       stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
10810     }
10811   }
10812 
10813   DBUG_VOID_RETURN;
10814 }
10815 
10816 int ha_rocksdb::adjust_handler_stats_table_scan() {
10817   DBUG_ENTER_FUNC();
10818 
10819   bool should_recalc_stats = false;
10820   if (static_cast<longlong>(stats.data_file_length) < 0) {
10821     stats.data_file_length = 0;
10822     should_recalc_stats = true;
10823   }
10824 
10825   if (static_cast<longlong>(stats.index_file_length) < 0) {
10826     stats.index_file_length = 0;
10827     should_recalc_stats = true;
10828   }
10829 
10830   if (static_cast<longlong>(stats.records) < 0) {
10831     stats.records = 1;
10832     should_recalc_stats = true;
10833   }
10834 
10835   if (should_recalc_stats) {
10836     // If any of the stats is corrupt, add the table to the index stats
10837     // recalc queue.
10838     rdb_is_thread.add_index_stats_request(m_tbl_def->full_tablename());
10839   }
10840   DBUG_RETURN(HA_EXIT_SUCCESS);
10841 }
10842 
10843 /**
10844   @return
10845     HA_EXIT_SUCCESS  OK
10846     HA_EXIT_FAILURE  Error
10847 */
10848 int ha_rocksdb::info(uint flag) {
10849   DBUG_ENTER_FUNC();
10850 
10851   if (!table) DBUG_RETURN(HA_EXIT_FAILURE);
10852 
10853   if (flag & HA_STATUS_VARIABLE) {
10854     /*
10855       Test only to simulate corrupted stats
10856     */
10857     DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
10858                     m_pk_descr->m_stats.m_actual_disk_size =
10859                         -m_pk_descr->m_stats.m_actual_disk_size;);
10860 
10861     update_stats();
10862     if (rocksdb_table_stats_use_table_scan) {
10863       int ret = adjust_handler_stats_table_scan();
10864       if (ret != HA_EXIT_SUCCESS) {
10865         return ret;
10866       }
10867     } else {
10868       int ret = adjust_handler_stats_sst_and_memtable();
10869       if (ret != HA_EXIT_SUCCESS) {
10870         return ret;
10871       }
10872     }
10873 
10874     if (rocksdb_debug_optimizer_n_rows > 0) {
10875       stats.records = rocksdb_debug_optimizer_n_rows;
10876     }
10877 
10878     if (stats.records != 0) {
10879       stats.mean_rec_length = stats.data_file_length / stats.records;
10880     }
10881   }
10882 
10883   if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) {
10884     ref_length = m_pk_descr->max_storage_fmt_length();
10885 
10886     for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10887       if (is_hidden_pk(i, table, m_tbl_def)) {
10888         continue;
10889       }
10890       KEY *const k = &table->key_info[i];
10891       for (uint j = 0; j < k->actual_key_parts; j++) {
10892         const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
10893         uint x;
10894 
10895         if (k_stats.m_distinct_keys_per_prefix.size() > j &&
10896             k_stats.m_distinct_keys_per_prefix[j] > 0) {
10897           x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j];
10898           /*
10899             If the number of rows is less than the number of prefixes (due to
10900             sampling), the average number of rows with the same prefix is 1.
10901            */
10902           if (x == 0) {
10903             x = 1;
10904           }
10905         } else {
10906           x = 0;
10907         }
10908         if (x > stats.records) x = stats.records;
10909         if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
10910             rocksdb_debug_optimizer_n_rows > 0) {
          // Fake cardinality implementation. For example, an (idx1, idx2,
          // idx3) index will have rec_per_key of (idx1)=4, (idx1,idx2)=2,
          // and (idx1,idx2,idx3)=1. rec_per_key for the whole index is 1,
          // multiplied by 2^n if n suffix columns of the index are not used.
10916           x = 1 << (k->actual_key_parts - j - 1);
10917         }
10918         k->rec_per_key[j] = x;
10919       }
10920     }
10921 
10922     stats.create_time = m_tbl_def->get_create_time();
10923   }
10924 
10925   if (flag & HA_STATUS_TIME) {
10926     stats.update_time = m_tbl_def->m_update_time;
10927   }
10928 
10929   if (flag & HA_STATUS_ERRKEY) {
10930     /*
10931       Currently we support only primary keys so we know which key had a
10932       uniqueness violation.
10933     */
10934     errkey = m_dupp_errkey;
10935     dup_ref = m_pk_tuple;  // TODO(?): this should store packed PK.
10936   }
10937 
10938   if (flag & HA_STATUS_AUTO) {
10939     stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
10940   }
10941 
10942   DBUG_RETURN(HA_EXIT_SUCCESS);
10943 }
10944 
10945 void ha_rocksdb::position(const uchar *const record) {
10946   DBUG_ENTER_FUNC();
10947 
10948   longlong hidden_pk_id = 0;
10949   if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
10950     assert(false);  // should never reach here
10951   }
10952 
10953   /*
10954     Get packed primary key value from the record.
10955 
    (Note: m_last_rowkey has the packed PK of the last-read row, which allows
    us to handle most cases, but there is an exception: when the slave applies
    RBR events, it fills the record and then calls position() and rnd_pos().)
10959 
10960     Also note that we *can* rely on PK column values being available. This is
10961     because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
10962     When that is true, table->prepare_for_position() adds PK columns into the
10963     read set (this may potentially disable index-only access when PK column
10964     cannot be restored from its mem-comparable form in the secondary indexes).
10965   */
10966   const uint packed_size = m_pk_descr->pack_record(
10967       table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);
10968 
10969   /*
10970     It could be that mem-comparable form of PK occupies less than ref_length
10971     bytes. Fill the remainder with zeros.
10972   */
10973   if (ref_length > packed_size) {
10974     memset(ref + packed_size, 0, ref_length - packed_size);
10975   }
10976 
10977   DBUG_VOID_RETURN;
10978 }
10979 
10980 /**
10981   @return
10982     HA_EXIT_SUCCESS  OK
10983     other            HA_ERR error code (can be SE-specific)
10984 */
10985 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
10986   DBUG_ENTER_FUNC();
10987 
10988   check_build_decoder();
10989 
10990   int rc;
10991   size_t len;
10992 
10993   ha_statistic_increment(&SSV::ha_read_rnd_count);
10994   len = m_pk_descr->key_length(table,
10995                                rocksdb::Slice((const char *)pos, ref_length));
10996   if (len == size_t(-1)) {
10997     DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */
10998   }
10999 
11000   rc = get_row_by_rowid(buf, pos, len);
11001 
11002   if (!rc) {
11003     update_row_stats(ROWS_READ);
11004   }
11005 
11006   DBUG_RETURN(rc);
11007 }
11008 
/*
  @brief
    Calculate (if needed) the bitmap of indexes that are modified by the
    current query

  @detail
    The calculation is done by checking index definitions against the
    table->write_set column bitmap.
*/
11019 
11020 void ha_rocksdb::calc_updated_indexes() {
11021   m_update_scope.clear_all();
11022 
11023   for (uint keynr = 0; keynr < table->s->keys; keynr++) {
11024     const Rdb_key_def &kd = *m_key_descr_arr[keynr];
11025     /* Walk over all key parts, including the "extended key" suffix */
11026     const uint key_parts = kd.get_key_parts();
11027     for (uint kp = 0; kp < key_parts; kp++) {
11028       if (has_hidden_pk(table) && kp + 1 == key_parts) break;
11029 
11030       Field *const field = kd.get_table_field_for_part_no(table, kp);
11031       if (bitmap_is_set(table->write_set, field->field_index)) {
11032         m_update_scope.set_bit(keynr);
11033         break;
11034       }
11035     }
11036   }
11037 }
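
/*
  Illustrative example (an assumed scenario, not from the original source):
  for a table with PRIMARY KEY(id), KEY k_a(a) and KEY k_b(b), the statement

    UPDATE t SET a = a + 1 WHERE id = 5;

  puts only column `a` into table->write_set, so calc_updated_indexes() sets
  the m_update_scope bit for k_a but not for k_b, letting update_write_sk()
  and check_and_lock_sk() skip the untouched index entirely.
*/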
11038 
11039 /**
11040   Update an existing row
  @param[in] old_data           the old row image (never nullptr here)
  @param[in] new_data           the new row image
11043   @return
11044     HA_EXIT_SUCCESS  OK
11045     other            HA_ERR error code (can be SE-specific)
11046 */
11047 int ha_rocksdb::update_row(const uchar *const old_data, uchar *const new_data) {
11048   DBUG_ENTER_FUNC();
11049 
11050   assert(old_data != nullptr);
11051   assert(new_data != nullptr);
11052   assert(m_lock_rows == RDB_LOCK_WRITE);
11053   /*
11054     old_data points to record we're updating. It is the same as the record
11055     we've just read (for multi-table UPDATE, too, because SQL layer will make
11056     an rnd_pos() call to re-read the record before calling update_row())
11057   */
11058   assert(new_data == table->record[0]);
11059 
11060   ha_statistic_increment(&SSV::ha_update_count);
11061   const int rv = update_write_row(old_data, new_data, skip_unique_check());
11062 
11063   if (rv == 0) {
11064     update_table_stats_if_needed();
11065     update_row_stats(ROWS_UPDATED);
11066   }
11067 
11068   DBUG_RETURN(rv);
11069 }
11070 
11071 void ha_rocksdb::update_table_stats_if_needed() {
11072   DBUG_ENTER_FUNC();
11073 
11074   if (!rocksdb_table_stats_use_table_scan) {
11075     DBUG_VOID_RETURN;
11076   }
11077 
11078   /*
11079     InnoDB performs a similar operation to update counters during query
11080     processing. Because the changes in MyRocks are made to a write batch,
11081     it is possible for the table scan cardinality calculation to trigger
11082     before the transaction performing the update commits. Hence the
11083     cardinality scan might miss the keys for these pending transactions.
11084   */
11085   uint64 counter = m_tbl_def->m_tbl_stats.m_stat_modified_counter++;
11086   uint64 n_rows = m_tbl_def->m_tbl_stats.m_stat_n_rows;
11087 
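  // Queue a recalculation once the modified-row counter exceeds both the
  // absolute threshold and the configured percentage of the estimated row
  // count.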
11088   if (counter > std::max(rocksdb_table_stats_recalc_threshold_count,
11089                          static_cast<uint64>(
11090                              n_rows * rocksdb_table_stats_recalc_threshold_pct /
11091                              100.0))) {
11092     // Add the table to the recalc queue
11093     rdb_is_thread.add_index_stats_request(m_tbl_def->full_tablename());
11094     m_tbl_def->m_tbl_stats.m_stat_modified_counter = 0;
11095   }
11096 
11097   DBUG_VOID_RETURN;
11098 }
11099 
11100 /* The following function was copied from ha_blackhole::store_lock: */
11101 THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
11102                                        enum thr_lock_type lock_type) {
11103   DBUG_ENTER_FUNC();
11104 
11105   assert(thd != nullptr);
11106   assert(to != nullptr);
11107 
11108   bool in_lock_tables = my_core::thd_in_lock_tables(thd);
11109 
11110   /* First, make a decision about MyRocks's internal locking */
11111   if (lock_type >= TL_WRITE_ALLOW_WRITE) {
11112     m_lock_rows = RDB_LOCK_WRITE;
11113   } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
11114     m_lock_rows = RDB_LOCK_READ;
11115   } else if (lock_type != TL_IGNORE) {
11116     m_lock_rows = RDB_LOCK_NONE;
11117     if (THDVAR(thd, lock_scanned_rows)) {
11118       /*
11119         The following logic was copied directly from
11120         ha_innobase::store_lock_with_x_type() in
11121         storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
11122         locks in place on rows that are in a table that is not being updated.
11123       */
11124       const uint sql_command = my_core::thd_sql_command(thd);
11125       if ((lock_type == TL_READ && in_lock_tables) ||
11126           (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
11127           lock_type == TL_READ_WITH_SHARED_LOCKS ||
11128           lock_type == TL_READ_NO_INSERT ||
11129           (lock_type != TL_IGNORE && sql_command != SQLCOM_SELECT)) {
11130         ulong tx_isolation = my_core::thd_tx_isolation(thd);
11131         if (sql_command != SQLCOM_CHECKSUM &&
11132             ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
11133               tx_isolation > ISO_READ_COMMITTED) ||
11134              tx_isolation == ISO_SERIALIZABLE ||
11135              (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
11136              (sql_command != SQLCOM_INSERT_SELECT &&
11137               sql_command != SQLCOM_REPLACE_SELECT &&
11138               sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
11139               sql_command != SQLCOM_CREATE_TABLE))) {
11140           m_lock_rows = RDB_LOCK_READ;
11141         }
11142       }
11143     }
11144   }
11145 
11146   /* Then, tell the SQL layer what kind of locking it should use: */
11147   if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
11148     /*
11149       Here is where we get into the guts of a row level lock.
11150       If TL_UNLOCK is set
11151       If we are not doing a LOCK TABLE or DISCARD/IMPORT
11152       TABLESPACE, then allow multiple writers
11153     */
11154 
11155     if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
11156         !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
11157       lock_type = TL_WRITE_ALLOW_WRITE;
11158     }
11159 
11160     /*
11161       In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
11162       MySQL would use the lock TL_READ_NO_INSERT on t2, and that
11163       would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
11164       to t2. Convert the lock to a normal read lock to allow
11165       concurrent inserts to t2.
11166     */
11167 
11168     if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
11169       lock_type = TL_READ;
11170     }
11171 
11172     m_db_lock.type = lock_type;
11173   }
11174 
11175   *to++ = &m_db_lock;
11176 
11177   DBUG_RETURN(to);
11178 }
11179 
11180 void ha_rocksdb::read_thd_vars(THD *const thd) {
11181   m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
11182   m_converter->set_verify_row_debug_checksums(
11183       THDVAR(thd, verify_row_debug_checksums));
11184   m_checksums_pct = THDVAR(thd, checksums_pct);
11185 }
11186 
11187 /**
11188   @return
11189     HA_EXIT_SUCCESS  OK
11190     other            HA_ERR error code (cannot be SE-specific)
11191 */
11192 int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
11193   DBUG_ENTER_FUNC();
11194 
11195   assert(thd != nullptr);
11196 
11197   int res = HA_EXIT_SUCCESS;
11198   int binlog_format = my_core::thd_binlog_format(thd);
11199   bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);
11200   if (lock_type == F_WRLCK && !thd->rli_slave && !unsafe_for_binlog &&
11201       binlog_format != BINLOG_FORMAT_ROW &&
11202       binlog_format != BINLOG_FORMAT_UNSPEC &&
11203       my_core::thd_binlog_filter_ok(thd)) {
11204     my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
11205     DBUG_RETURN(HA_ERR_UNSUPPORTED);
11206   }
11207 
11208   if (lock_type == F_UNLCK) {
11209     Rdb_transaction *const tx = get_tx_from_thd(thd);
11210 
11211     if (tx) {
11212       tx->io_perf_end_and_record(&m_io_perf);
11213       tx->m_n_mysql_tables_in_use--;
11214       if (tx->m_n_mysql_tables_in_use == 0 &&
11215           !my_core::thd_test_options(thd,
11216                                      OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
11217         /*
11218           Do like InnoDB: when we get here, it's time to commit a
11219           single-statement transaction.
11220 
11221           If the statement involved multiple tables, this code will be executed
11222           for each of them, but that's ok because non-first tx->commit() calls
11223           will be no-ops.
11224         */
11225         if (tx->commit_or_rollback()) {
11226           res = HA_ERR_INTERNAL_ERROR;
11227         }
11228       }
11229     }
11230   } else {
11231     if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
11232         my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
11233       my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0),
11234                tx_isolation_names[my_core::thd_tx_isolation(thd)]);
11235       DBUG_RETURN(HA_ERR_UNSUPPORTED);
11236     }
11237     /*
11238       It's nice to do the following on start of every statement. The problem
11239       is, handler->start_stmt() is not called for INSERTs.
11240       So, we put this code here.
11241     */
11242     Rdb_transaction *const tx = get_or_create_tx(thd);
11243     read_thd_vars(thd);
11244 
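    // With unique checks skipped we cannot detect duplicate keys, so
    // statements that rely on duplicate-key handling (REPLACE,
    // INSERT ... ON DUPLICATE KEY UPDATE, LOAD ... REPLACE) are rejected.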
11245     if (skip_unique_check()) {
11246       if ((thd->lex->sql_command == SQLCOM_INSERT ||
11247            thd->lex->sql_command == SQLCOM_LOAD ||
11248            thd->lex->sql_command == SQLCOM_REPLACE) &&
11249           (thd->lex->duplicates == DUP_REPLACE ||
11250            thd->lex->duplicates == DUP_UPDATE)) {
11251         my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query().str);
11252         DBUG_RETURN(HA_ERR_UNSUPPORTED);
11253       }
11254     }
11255 
11256     if (lock_type == F_WRLCK) {
11257       if (tx->is_tx_read_only()) {
11258         my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0));
11259         DBUG_RETURN(HA_ERR_UNSUPPORTED);
11260       }
11261 
11262       /*
        The SQL layer signals us to take a write lock. It does so when
        starting a DML statement. We should put locks on the rows we're
        reading.
11265 
11266         Note: sometimes, external_lock() can be called without a prior
11267         ::store_lock call.  That's why we need to set lock_* members here, too.
11268       */
11269       m_lock_rows = RDB_LOCK_WRITE;
11270 
11271       if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
11272           thd->lex->sql_command == SQLCOM_DROP_INDEX ||
11273           thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
11274         tx->m_ddl_transaction = true;
11275       }
11276     }
11277     tx->m_n_mysql_tables_in_use++;
11278     rocksdb_register_tx(rocksdb_hton, thd, tx);
11279     tx->io_perf_start(&m_io_perf);
11280   }
11281 
11282   DBUG_RETURN(res);
11283 }
11284 
11285 /**
11286   @note
11287   A quote from ha_innobase::start_stmt():
11288   <quote>
11289   MySQL calls this function at the start of each SQL statement inside LOCK
11290   TABLES. Inside LOCK TABLES the ::external_lock method does not work to
11291   mark SQL statement borders.
11292   </quote>
11293 
11294   @return
11295     HA_EXIT_SUCCESS  OK
11296 */
11297 
11298 int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
11299   DBUG_ENTER_FUNC();
11300 
11301   assert(thd != nullptr);
11302 
11303   Rdb_transaction *const tx = get_or_create_tx(thd);
11304   read_thd_vars(thd);
11305   rocksdb_register_tx(ht, thd, tx);
11306   tx->io_perf_start(&m_io_perf);
11307 
11308   DBUG_RETURN(HA_EXIT_SUCCESS);
11309 }
11310 
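/*
  Build a rocksdb::Range covering exactly one index id. The caller-provided
  buffer receives the two endpoint index numbers (the index number plus
  offset1 and plus offset2) in their on-disk, big-endian form.
*/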
11311 rocksdb::Range get_range(uint32_t i,
11312                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11313                          int offset1, int offset2) {
11314   uchar *buf_begin = buf;
11315   uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
11316   rdb_netbuf_store_index(buf_begin, i + offset1);
11317   rdb_netbuf_store_index(buf_end, i + offset2);
11318 
11319   return rocksdb::Range(
11320       rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
11321       rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
11322 }
11323 
11324 static rocksdb::Range get_range(const Rdb_key_def &kd,
11325                                 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11326                                 int offset1, int offset2) {
11327   return get_range(kd.get_index_number(), buf, offset1, offset2);
11328 }
11329 
11330 rocksdb::Range get_range(const Rdb_key_def &kd,
11331                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
11332   if (kd.m_is_reverse_cf) {
11333     return myrocks::get_range(kd, buf, 1, 0);
11334   } else {
11335     return myrocks::get_range(kd, buf, 0, 1);
11336   }
11337 }
11338 
11339 rocksdb::Range ha_rocksdb::get_range(
11340     const int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
11341   return myrocks::get_range(*m_key_descr_arr[i], buf);
11342 }
11343 
11344 /*
11345  This function is called with total_order_seek=true, but
11346  upper/lower bound setting is not necessary.
11347  Boundary set is useful when there is no matching key,
11348  but in drop_index_thread's case, it means index is marked as removed,
11349  so no further seek will happen for the index id.
11350 */
11351 static bool is_myrocks_index_empty(rocksdb::ColumnFamilyHandle *cfh,
11352                                    const bool is_reverse_cf,
11353                                    const rocksdb::ReadOptions &read_opts,
11354                                    const uint index_id) {
11355   bool index_removed = false;
11356   uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
11357   rdb_netbuf_store_uint32(key_buf, index_id);
11358   const rocksdb::Slice key =
11359       rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf));
11360   std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
11361   rocksdb_smart_seek(is_reverse_cf, it.get(), key);
11362   if (!it->Valid()) {
11363     index_removed = true;
11364   } else {
11365     if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE)) {
11366       // Key does not have same prefix
11367       index_removed = true;
11368     }
11369   }
11370   return index_removed;
11371 }
11372 
11373 /*
11374   Drop index thread's main logic
11375 */
11376 
11377 void Rdb_drop_index_thread::run() {
11378   RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
11379 
11380   for (;;) {
    // The stop flag might be set by a shutdown command after
    // drop_index_thread releases the signal mutex (i.e. while executing an
    // expensive Seek()). To prevent drop_index_thread from entering a long
    // cond_timedwait, the stop flag has to be checked again with the signal
    // mutex held.
11386     if (m_killed) {
11387       break;
11388     }
11389 
11390     timespec ts;
11391     clock_gettime(CLOCK_REALTIME, &ts);
11392     ts.tv_sec += dict_manager.is_drop_index_empty()
11393                      ? 24 * 60 * 60  // no filtering
11394                      : 60;           // filtering
11395 
11396     const auto ret MY_ATTRIBUTE((__unused__)) =
11397         mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
11398     if (m_killed) {
11399       break;
11400     }
    // make sure no program error was returned
11402     assert(ret == 0 || ret == ETIMEDOUT);
11403     RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
11404 
11405     std::unordered_set<GL_INDEX_ID> indices;
11406     dict_manager.get_ongoing_drop_indexes(&indices);
11407     if (!indices.empty()) {
11408       std::unordered_set<GL_INDEX_ID> finished;
11409       rocksdb::ReadOptions read_opts;
11410       read_opts.total_order_seek = true;  // disable bloom filter
11411 
11412       for (const auto d : indices) {
11413         uint32 cf_flags = 0;
11414         if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
11415           sql_print_error(
11416               "RocksDB: Failed to get column family flags "
11417               "from cf id %u. MyRocks data dictionary may "
11418               "get corrupted.",
11419               d.cf_id);
11420           abort();
11421         }
11422 
11423         std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
11424             cf_manager.get_cf(d.cf_id);
11425         assert(cfh);
11426 
11427         if (dict_manager.get_dropped_cf(d.cf_id)) {
11428           finished.insert(d);
11429           continue;
11430         }
11431 
11432         const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;
11433 
11434         uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
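        // For reverse-ordered column families the start/end offsets are
        // swapped so the range still covers exactly this index id.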
11435         rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
11436                                          is_reverse_cf ? 0 : 1);
11437 
11438         rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh.get(),
11439                                                     &range.start, &range.limit);
11440         if (!status.ok()) {
11441           if (status.IsShutdownInProgress()) {
11442             break;
11443           }
11444           rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
11445         }
11446 
11447         status = rdb->CompactRange(getCompactRangeOptions(), cfh.get(),
11448                                    &range.start, &range.limit);
11449         if (!status.ok()) {
11450           if (status.IsShutdownInProgress()) {
11451             break;
11452           }
11453           rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
11454         }
11455         if (is_myrocks_index_empty(cfh.get(), is_reverse_cf, read_opts,
11456                                    d.index_id)) {
11457           finished.insert(d);
11458         }
11459       }
11460 
11461       if (!finished.empty()) {
11462         dict_manager.finish_drop_indexes(finished);
11463       }
11464     }
11465 
11466     DBUG_EXECUTE_IF("rocksdb_drop_cf", {
11467       THD *thd = new THD();
11468       thd->thread_stack = reinterpret_cast<char *>(&(thd));
11469       thd->store_globals();
11470 
11471       static constexpr char act[] = "now wait_for ready_to_drop_cf";
11472       assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
11473 
11474       thd->restore_globals();
11475       delete thd;
11476     });
11477 
    // Remove dropped column families:
    // 1. Get all cf ids from ongoing_index_drop.
    // 2. Get all cf ids for cfs marked as dropped.
    // 3. If a cf id is in the list of ongoing_index_drop, skip removing
    //    this cf. It will be removed later.
    // 4. If it is not, proceed to remove the cf.
    //
    // This should be done under the dict_manager lock.
11486 
11487     {
11488       std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
11489       std::unordered_set<uint32> dropped_cf_ids;
11490       dict_manager.get_all_dropped_cfs(&dropped_cf_ids);
11491 
11492       if (!dropped_cf_ids.empty()) {
11493         std::unordered_set<GL_INDEX_ID> ongoing_drop_indices;
11494         dict_manager.get_ongoing_drop_indexes(&ongoing_drop_indices);
11495 
11496         std::unordered_set<uint32> ongoing_drop_cf_ids;
11497         for (const auto index : ongoing_drop_indices) {
11498           ongoing_drop_cf_ids.insert(index.cf_id);
11499         }
11500 
11501         for (const auto cf_id : dropped_cf_ids) {
11502           if (ongoing_drop_cf_ids.find(cf_id) == ongoing_drop_cf_ids.end()) {
11503             cf_manager.remove_dropped_cf(&dict_manager, rdb, cf_id);
11504           }
11505         }
11506       }
11507     }
11508 
11509     DBUG_EXECUTE_IF("rocksdb_drop_cf", {
11510       THD *thd = new THD();
11511       thd->thread_stack = reinterpret_cast<char *>(&(thd));
11512       thd->store_globals();
11513 
11514       static constexpr char act[] = "now signal drop_cf_done";
11515       assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
11516 
11517       thd->restore_globals();
11518       delete thd;
11519     });
11520     RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
11521   }
11522 
11523   RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
11524 }
11525 
11526 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
11527   assert(tablename != nullptr);
11528 
11529   std::string str;
11530   if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) {
    // We were not passed a valid table name?
11532     assert(0);
11533     return nullptr;
11534   }
11535 
11536   return ddl_manager.find(str);
11537 }
11538 
11539 /*
  Overloaded delete_table(): it deletes the table metadata from the data
  dictionary immediately and deletes the real data in a background thread
  (asynchronously).
11542 
11543   @param tbl       IN      MyRocks table definition
11544 
11545   @return
11546     HA_EXIT_SUCCESS  OK
11547     other            HA_ERR error code (can be SE-specific)
11548 */
11549 int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) {
11550   DBUG_ENTER_FUNC();
11551 
11552   assert(tbl != nullptr);
11553   assert(m_tbl_def == nullptr || m_tbl_def == tbl);
11554 
11555   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11556   rocksdb::WriteBatch *const batch = wb.get();
11557 
11558   DBUG_EXECUTE_IF("rocksdb_before_delete_table", {
11559     static constexpr char act[] =
11560         "now signal ready_to_mark_cf_dropped_before_delete_table wait_for "
11561         "mark_cf_dropped_done_before_delete_table";
11562     assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
11563   });
11564 
11565   {
11566     std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
11567     dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);
11568 
11569     /*
11570       Remove the table entry in data dictionary (this will also remove it from
11571       the persistent data dictionary).
11572     */
11573     ddl_manager.remove(tbl, batch, true);
11574 
11575     int err = dict_manager.commit(batch);
11576     if (err) {
11577       DBUG_RETURN(err);
11578     }
11579   }
11580 
11581   DBUG_EXECUTE_IF("rocksdb_after_delete_table", {
11582     static constexpr char act[] =
11583         "now signal ready_to_mark_cf_dropped_after_delete_table "
11584         "wait_for mark_cf_dropped_done_after_delete_table";
11585     assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
11586   });
11587 
11588   rdb_drop_idx_thread.signal();
11589   // avoid dangling pointer
11590   m_tbl_def = nullptr;
11591   DBUG_RETURN(HA_EXIT_SUCCESS);
11592 }
11593 
11594 /*
11595   Note: the following function is called when the table is not open. That is,
11596   this->table==nullptr, pk_key_descr==nullptr, etc.
11597 
  tablename points to a string of the form "./dbname/tablename".
11599 
11600   @return
11601     HA_EXIT_SUCCESS  OK
11602     other            HA_ERR error code (can be SE-specific)
11603 */
11604 
11605 int ha_rocksdb::delete_non_partitioned_table(const char *const tablename) {
11606   DBUG_ENTER_FUNC();
11607 
11608   assert(tablename != nullptr);
11609 
11610   /* Find the table in the hash */
11611   Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
11612   if (!tbl) {
11613     DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
11614   }
11615 
11616   DBUG_RETURN(delete_table(tbl));
11617 }
11618 
11619 static int init_partition_handler(THD *thd, const std::string &partition_string,
11620                                   ha_rockspart &file) {
11621   assert(thd);
11622   MEM_ROOT *mem_root = thd->mem_root;
11623 
11624   partition_info *part_info =
11625       native_part::parse_partition_info(thd, partition_string);
11626 
11627   if (file.init_partitioning(mem_root))
11628     return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
11629 
11630   file.set_part_info(part_info, false);
11631 
11632   if (file.initialize_partition(mem_root))
11633     return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
11634 
11635   return 0;
11636 }
11637 
11638 int ha_rocksdb::delete_partitioned_table(
11639     const char *const tablename, const std::string &partition_info_str) {
11640   ha_rockspart file(rocksdb_hton, nullptr);
11641   int result = init_partition_handler(ha_thd(), partition_info_str, file);
11642   if (result) return result;
11643   return file.delete_table(tablename);
11644 }
11645 
11646 int ha_rocksdb::delete_table(const char *const tablename) {
11647   assert(tablename);
11648   std::string partition_info_str;
11649   if (!native_part::get_part_str_for_table(tablename, partition_info_str))
11650     return HA_ERR_TABLE_CORRUPT;
11651   if (partition_info_str.empty())
11652     return delete_non_partitioned_table(tablename);
11653   return delete_partitioned_table(tablename, partition_info_str);
11654 }
11655 
11656 int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
11657   const rocksdb::WriteOptions wo =
11658       rdb_get_rocksdb_write_options(handler::ha_thd());
11659 
11660   rocksdb::ReadOptions opts;
11661   opts.total_order_seek = true;
11662   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
11663 
11664   char key_buf[MAX_KEY_LENGTH];
11665   uint key_len;
11666 
11667   uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
11668   uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
11669   rocksdb::Slice lower_bound_slice;
11670   rocksdb::Slice upper_bound_slice;
11671 
11672   /*
11673     Remove all records in each index.
    (This is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on the RocksDB side)
11676   */
11677   for (uint i = 0; i < tbl->m_key_count; i++) {
11678     const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
11679     kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
11680     rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
11681     const rocksdb::Slice table_key(key_buf, key_len);
11682     assert(key_len == Rdb_key_def::INDEX_NUMBER_SIZE);
11683     if (THDVAR(ha_thd(), enable_iterate_bounds)) {
11684       setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE,
11685                             lower_bound_buf, upper_bound_buf,
11686                             &lower_bound_slice, &upper_bound_slice);
11687       opts.iterate_lower_bound = &lower_bound_slice;
11688       opts.iterate_upper_bound = &upper_bound_slice;
11689     } else {
11690       opts.iterate_lower_bound = nullptr;
11691       opts.iterate_upper_bound = nullptr;
11692     }
11693     std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));
11694 
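    // Seek to the infimum key of this index and delete every record that
    // still belongs to it.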
11695     it->Seek(table_key);
11696     while (it->Valid()) {
11697       const rocksdb::Slice key = it->key();
11698       if (!kd.covers_key(key)) {
11699         break;
11700       }
11701 
11702       rocksdb::Status s;
11703       if (can_use_single_delete(i)) {
11704         s = rdb->SingleDelete(wo, cf, key);
11705       } else {
11706         s = rdb->Delete(wo, cf, key);
11707       }
11708 
11709       if (!s.ok()) {
11710         return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def);
11711       }
11712 
11713       it->Next();
11714     }
11715   }
11716 
11717   return HA_EXIT_SUCCESS;
11718 }
11719 
11720 int ha_rocksdb::rename_partitioned_table(const char *const from,
11721                                          const char *const to,
11722                                          const std::string &partition_string) {
11723   ha_rockspart file(rocksdb_hton, nullptr);
11724   int result = init_partition_handler(ha_thd(), partition_string, file);
11725   if (result) return result;
11726   return file.rename_table(from, to);
11727 }
11728 
11729 /**
11730   @return
11731     HA_EXIT_SUCCESS  OK
11732     other            HA_ERR error code (cannot be SE-specific)
11733 */
11734 int ha_rocksdb::rename_non_partitioned_table(const char *const from,
11735                                              const char *const to) {
11736   DBUG_ENTER_FUNC();
11737 
11738   assert(from != nullptr);
11739   assert(to != nullptr);
11740 
11741   std::string from_str;
11742   std::string to_str;
11743   std::string from_db;
11744   std::string to_db;
11745   int rc;
11746 
11747   if (rdb_is_tablename_normalized(from)) {
11748     from_str = from;
11749   } else {
11750     rc = rdb_normalize_tablename(from, &from_str);
11751     if (rc != HA_EXIT_SUCCESS) {
11752       DBUG_RETURN(rc);
11753     }
11754   }
11755 
11756   rc = rdb_split_normalized_tablename(from_str, &from_db);
11757   if (rc != HA_EXIT_SUCCESS) {
11758     DBUG_RETURN(rc);
11759   }
11760 
11761   if (rdb_is_tablename_normalized(to)) {
11762     to_str = to;
11763   } else {
11764     rc = rdb_normalize_tablename(to, &to_str);
11765     if (rc != HA_EXIT_SUCCESS) {
11766       DBUG_RETURN(rc);
11767     }
11768   }
11769 
11770   rc = rdb_split_normalized_tablename(to_str, &to_db);
11771   if (rc != HA_EXIT_SUCCESS) {
11772     DBUG_RETURN(rc);
11773   }
11774 
11775   // If the user changed the database part of the name then validate that the
11776   // 'to' database exists.
11777   if (from_db != to_db && !rdb_database_exists(to_db)) {
11778     // If we return a RocksDB specific error code here we get
11779     // "error: 206 - Unknown error 206".  InnoDB gets
11780     // "error -1 - Unknown error -1" so let's match them.
11781     DBUG_RETURN(-1);
11782   }
11783 
11784   DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";);
11785 
11786   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11787   rocksdb::WriteBatch *const batch = wb.get();
11788 
11789   // rename table is under dict_manager lock, and the cfs used
11790   // by indices of this table cannot be dropped during the process.
11791   dict_manager.lock();
11792 
11793   if (ddl_manager.rename(from_str, to_str, batch)) {
11794     rc = HA_ERR_ROCKSDB_INVALID_TABLE;
11795   } else {
11796     rc = dict_manager.commit(batch);
11797   }
11798   dict_manager.unlock();
11799 
11800   DBUG_RETURN(rc);
11801 }
11802 
11803 int ha_rocksdb::rename_table(const char *const from, const char *const to) {
11804   assert(from);
11805   assert(to);
11806 
  const char *from2 = from;
  const char *to2 = to;
11809   std::string from_canon, to_canon;
11810   int rc;
11811 
11812   if (rdb_is_tablename_normalized(from)) {
11813     rc = rdb_make_canonical_tablename(from, &from_canon);
11814     if (rc != HA_EXIT_SUCCESS) {
11815       return rc;
11816     }
11817 
11818     from2 = from_canon.c_str();
11819   }
11820 
11821   if (rdb_is_tablename_normalized(to)) {
11822     rc = rdb_make_canonical_tablename(to, &to_canon);
11823     if (rc != HA_EXIT_SUCCESS) {
11824       return rc;
11825     }
11826     to2 = to_canon.c_str();
11827   }
11828 
11829   std::string partition_info_str;
11830   if (!native_part::get_part_str_for_table(from2, partition_info_str))
11831     return HA_ERR_TABLE_CORRUPT;
11832 
  if (partition_info_str.empty())
    return rename_non_partitioned_table(from2, to2);
11834   return rename_partitioned_table(from2, to2, partition_info_str);
11835 }
11836 
11837 /**
  check_if_incompatible_data() is called if ALTER TABLE cannot otherwise
  detect whether the new and old definitions are compatible.

  @details If there are no other explicit signs like a changed number of
  fields, this function will be called by compare_tables()
  (sql/sql_tables.cc) to decide whether we should rewrite the whole table or
  only the .frm file.
11845 
11846 */
11847 
11848 bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
11849                                             uint table_changes) {
11850   DBUG_ENTER_FUNC();
11851 
11852   assert(info != nullptr);
11853 
11854   // this function is needed only for online alter-table
11855   DBUG_RETURN(COMPATIBLE_DATA_NO);
11856 }
11857 
11858 /**
11859   @return
11860     HA_EXIT_SUCCESS  OK
11861 */
11862 int ha_rocksdb::extra(enum ha_extra_function operation) {
11863   DBUG_ENTER_FUNC();
11864 
11865   switch (operation) {
11866     case HA_EXTRA_KEYREAD:
11867       m_keyread_only = true;
11868       break;
11869     case HA_EXTRA_NO_KEYREAD:
11870       m_keyread_only = false;
11871       break;
11872     case HA_EXTRA_FLUSH:
11873       /*
11874         If the table has blobs, then they are part of m_retrieved_record.
11875         This call invalidates them.
11876       */
11877       m_retrieved_record.Reset();
11878       break;
11879     case HA_EXTRA_INSERT_WITH_UPDATE:
11880       // INSERT ON DUPLICATE KEY UPDATE
11881       if (rocksdb_enable_insert_with_update_caching) {
11882         m_insert_with_update = true;
11883       }
11884       break;
11885     case HA_EXTRA_NO_IGNORE_DUP_KEY:
11886       // PAIRED with HA_EXTRA_INSERT_WITH_UPDATE or HA_EXTRA_WRITE_CAN_REPLACE
11887       // that indicates the end of REPLACE / INSERT ON DUPLICATE KEY
11888       m_insert_with_update = false;
11889       break;
11890 
11891     default:
11892       break;
11893   }
11894 
11895   DBUG_RETURN(HA_EXIT_SUCCESS);
11896 }
11897 
11898 /*
11899   Given a starting key and an ending key, estimate the number of rows that
11900   will exist between the two keys.
11901 */
11902 ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
11903                                      key_range *const max_key) {
11904   DBUG_ENTER_FUNC();
11905 
11906   ha_rows ret = THDVAR(ha_thd(), records_in_range);
11907   if (ret) {
11908     DBUG_RETURN(ret);
11909   }
11910   if (table->force_index) {
11911     const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
11912     if (force_rows) {
11913       DBUG_RETURN(force_rows);
11914     }
11915   }
11916 
11917   const Rdb_key_def &kd = *m_key_descr_arr[inx];
11918 
11919   auto disk_size = kd.m_stats.m_actual_disk_size;
11920   if (disk_size == 0) disk_size = kd.m_stats.m_data_size;
11921   auto rows = kd.m_stats.m_rows;
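  // Fall back to conservative defaults when index statistics have not been
  // collected yet.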
11922   if (rows == 0 || disk_size == 0) {
11923     rows = 1;
11924     disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
11925   }
11926   ulonglong total_size = 0;
11927   ulonglong total_row = 0;
11928   records_in_range_internal(inx, min_key, max_key, disk_size, rows, &total_size,
11929                             &total_row);
11930   ret = total_row;
11931   /*
    GetApproximateSizes() gives estimates, so ret might exceed stats.records.
    MySQL would then decide to use a full index scan rather than a range
    scan, which is not efficient in most cases.
    To prevent this, make the estimated record count slightly smaller than
    stats.records.
11937   */
11938   if (ret >= stats.records) {
11939     ret = stats.records * 0.99;
11940   }
11941 
11942   if (rocksdb_debug_optimizer_n_rows > 0) {
11943     ret = rocksdb_debug_optimizer_n_rows;
11944   } else if (ret == 0) {
11945     ret = 1;
11946   }
11947 
11948   DBUG_RETURN(ret);
11949 }
11950 
11951 void ha_rocksdb::records_in_range_internal(uint inx, key_range *const min_key,
11952                                            key_range *const max_key,
11953                                            int64 disk_size, int64 rows,
11954                                            ulonglong *total_size,
11955                                            ulonglong *row_count) {
11956   DBUG_ENTER_FUNC();
11957 
11958   const Rdb_key_def &kd = *m_key_descr_arr[inx];
11959 
11960   uint size1 = 0;
11961   if (min_key) {
11962     size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
11963                                 min_key->key, min_key->keypart_map);
11964     if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
11965         min_key->flag == HA_READ_PREFIX_LAST ||
11966         min_key->flag == HA_READ_AFTER_KEY) {
11967       kd.successor(m_sk_packed_tuple, size1);
11968     }
11969   } else {
11970     kd.get_infimum_key(m_sk_packed_tuple, &size1);
11971   }
11972 
11973   uint size2 = 0;
11974   if (max_key) {
11975     size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
11976                                 max_key->key, max_key->keypart_map);
11977     if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
11978         max_key->flag == HA_READ_PREFIX_LAST ||
11979         max_key->flag == HA_READ_AFTER_KEY) {
11980       kd.successor(m_sk_packed_tuple_old, size2);
11981     }
11982   } else {
11983     kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
11984   }
11985 
11986   const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
11987   const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);
11988 
11989   // It's possible to get slice1 == slice2 for a non-inclusive range with the
11990   // right bound being successor() of the left one, e.g. "t.key>10 AND t.key<11"
11991   if (slice1.compare(slice2) >= 0) {
11992     // It's not possible to get slice2 > slice1
11993     assert(slice1.compare(slice2) == 0);
11994     DBUG_VOID_RETURN;
11995   }
11996 
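  // Reverse column families store keys in descending order, so swap the
  // endpoints to keep the range well-formed.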
11997   rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
11998                    kd.m_is_reverse_cf ? slice1 : slice2);
11999 
12000   uint64_t sz = 0;
12001 
12002 #pragma GCC diagnostic push
12003 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
12004   // Getting statistics, including from Memtables
12005   uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
12006   rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
12007   *row_count = rows * ((double)sz / (double)disk_size);
12008   *total_size = sz;
12009   uint64_t memTableCount;
12010   rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
12011   *row_count += memTableCount;
12012   *total_size += sz;
12013   DBUG_VOID_RETURN;
12014 }
12015 
12016 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
12017   DBUG_ENTER_FUNC();
12018 
12019   assert(create_info != nullptr);
12020 
12021   if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
12022     create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
12023   }
12024 
12025   DBUG_VOID_RETURN;
12026 }
12027 
12028 /**
12029   @brief
  OPTIMIZE TABLE triggers a manual compaction in RocksDB.
  Compaction itself is executed by a background thread in RocksDB, but
  CompactRange() waits until the compaction completes, so this function
  may take a long time.
  Since the RocksDB dataset is organized per index id, OPTIMIZE TABLE
  triggers manual compaction for all indexes of the table.
12036   @details
12037   Compaction range is from the beginning of the index id to
12038   the first row of the next index id. When using reverse order
12039   column family, the first row of the next index id should be
12040   the last row of the previous index id.
12041 
12042   @return
12043     HA_ADMIN_OK      OK
12044     other            HA_ADMIN error code
12045 */
12046 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
12047   DBUG_ENTER_FUNC();
12048 
12049   assert(thd != nullptr);
12050   assert(check_opt != nullptr);
12051 
12052   for (uint i = 0; i < table->s->keys; i++) {
12053     uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12054     auto range = get_range(i, buf);
12055     const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(),
12056                                                 m_key_descr_arr[i]->get_cf(),
12057                                                 &range.start, &range.limit);
12058     if (!s.ok()) {
12059       DBUG_RETURN(rdb_error_to_mysql(s));
12060     }
12061   }
12062 
12063   DBUG_RETURN(HA_EXIT_SUCCESS);
12064 }
12065 
12066 static void init_stats(
12067     const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12068         &to_recalc,
12069     std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats) {
12070   for (const auto &it : to_recalc) {
12071     const GL_INDEX_ID index_id = it.first;
12072     auto &kd = it.second;
12073 
12074     (*stats).emplace(index_id, Rdb_index_stats(index_id));
12075     assert(kd->get_key_parts() > 0);
12076     (*stats)[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts());
12077   }
12078 }
12079 
12080 /**
12081   Calculate the following index stats for all indexes of a table:
12082   number of rows, file size, and cardinality. It adopts an index
12083   scan approach using rocksdb::Iterator. Sampling is used to
12084   accelerate the scan.
12085 **/
12086 static int calculate_cardinality_table_scan(
12087     const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12088         &to_recalc,
12089     std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats,
12090     table_cardinality_scan_type scan_type, uint64_t max_num_rows_scanned,
12091     THD::killed_state volatile *killed) {
12092   DBUG_ENTER_FUNC();
12093 
12094   assert(scan_type != SCAN_TYPE_NONE);
12095   init_stats(to_recalc, stats);
12096 
12097   auto read_opts = rocksdb::ReadOptions();
12098   read_opts.fill_cache = false;
12099   if (scan_type == SCAN_TYPE_MEMTABLE_ONLY) {
12100     read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
12101   } else {
12102     read_opts.total_order_seek = true;
12103   }
12104 
12105   Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
12106 
12107   for (const auto &it_kd : to_recalc) {
12108     const GL_INDEX_ID index_id = it_kd.first;
12109 
12110     if (!ddl_manager.safe_find(index_id)) {
12111       // If index id is not in ddl manager, then it has been dropped.
12112       // Skip scanning index
12113       continue;
12114     }
12115 
12116     const std::shared_ptr<const Rdb_key_def> &kd = it_kd.second;
12117     assert(index_id == kd->get_gl_index_id());
12118     Rdb_index_stats &stat = (*stats)[kd->get_gl_index_id()];
12119 
12120     uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12121     auto r = myrocks::get_range(*kd, r_buf);
12122     uint64_t memtableCount;
12123     uint64_t memtableSize;
12124     rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount,
12125                                      &memtableSize);
12126 
12127     if (scan_type == SCAN_TYPE_MEMTABLE_ONLY &&
12128         memtableCount < (uint64_t)stat.m_rows / 10) {
12129       // skip tables that already have enough stats from SST files to reduce
12130       // overhead and avoid degradation of big tables stats by sampling from
12131       // relatively tiny (less than 10% of full data set) memtable dataset
12132       continue;
12133     }
12134 
12135     // Set memtable count to row count
12136     stat.m_rows = memtableCount;
12137 
12138     if (scan_type == SCAN_TYPE_FULL_TABLE) {
12139       // Set memtable size to file size
12140       stat.m_actual_disk_size = memtableSize;
12141     }
12142 
12143     std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(
12144         rdb->NewIterator(read_opts, kd->get_cf()));
12145     rocksdb::Slice first_index_key((const char *)r_buf,
12146                                    Rdb_key_def::INDEX_NUMBER_SIZE);
12147 
12148     // Reset m_last_key for new index
12149     cardinality_collector.Reset();
12150     uint64_t rows_scanned = 0ul;
12151     for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
12152       if (killed && *killed) {
12153         // NO_LINT_DEBUG
12154         sql_print_information(
12155             "Index stats calculation for index %s with id (%u,%u) is "
12156             "terminated",
12157             kd->get_name().c_str(), stat.m_gl_index_id.cf_id,
12158             stat.m_gl_index_id.index_id);
12159         DBUG_RETURN(HA_EXIT_FAILURE);
12160       }
12161 
12162       const rocksdb::Slice key = it->key();
12163 
12164       if ((scan_type == SCAN_TYPE_FULL_TABLE && max_num_rows_scanned > 0 &&
12165            rows_scanned >= max_num_rows_scanned) ||
12166           !kd->covers_key(key)) {
12167         break;  // end of this index
12168       }
12169 
12170       cardinality_collector.ProcessKey(key, kd.get(), &stat);
12171       rows_scanned++;
12172     }
12173 
12174     cardinality_collector.Reset(); /* reset m_last_key for each key definition */
12175     cardinality_collector.SetCardinality(&stat);
12176     cardinality_collector.AdjustStats(&stat);
12177 
12178     DBUG_EXECUTE_IF("rocksdb_calculate_stats", {
12179       if (kd->get_name() == "secondary_key") {
12180         THD *thd = new THD();
12181         thd->thread_stack = reinterpret_cast<char *>(&thd);
12182         thd->store_globals();
12183 
12184         static constexpr char act[] =
12185             "now signal ready_to_drop_index wait_for ready_to_save_index_stats";
12186         assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
12187 
12188         thd->restore_globals();
12189         delete thd;
12190       }
12191     });
12192   }
12193 
12194   DBUG_RETURN(HA_EXIT_SUCCESS);
12195 }
12196 
12197 static void reset_cardinality(
12198     std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats) {
12199   for (auto &src : *stats) {
12200     Rdb_index_stats &stat = src.second;
12201     stat.reset_cardinality();
12202   }
12203 }
12204 
12205 static void merge_stats(
12206     const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12207         &to_recalc,
12208     std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats,
12209     const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &card_stats) {
12210   assert(stats->size() == card_stats.size());
12211 
12212   for (auto &src : *stats) {
12213     auto index_id = src.first;
12214     Rdb_index_stats &stat = src.second;
12215     auto it = card_stats.find(index_id);
12216     assert(it != card_stats.end());
12217 
12218     auto it_index = to_recalc.find(index_id);
12219     assert(it_index != to_recalc.end());
12220     stat.merge(it->second, true, it_index->second->max_storage_fmt_length());
12221   }
12222 }
12223 
12224 static void adjust_cardinality(
12225     std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats,
12226     table_cardinality_scan_type scan_type, uint64_t max_num_rows_scanned) {
12227   assert(scan_type == SCAN_TYPE_FULL_TABLE);
12228   assert(max_num_rows_scanned > 0);
12229 
12230   for (auto &src : *stats) {
12231     Rdb_index_stats &stat = src.second;
12232     if ((uint64_t)stat.m_rows > max_num_rows_scanned) {
12233       stat.adjust_cardinality(stat.m_rows / max_num_rows_scanned);
12234     }
12235 #ifndef NDEBUG
12236     for (size_t i = 0; i < stat.m_distinct_keys_per_prefix.size(); i++) {
12237       assert(stat.m_distinct_keys_per_prefix[i] <= stat.m_rows);
12238     }
12239 #endif
12240   }
12241 }
12242 
12243 static int read_stats_from_ssts(
12244     const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12245         &to_recalc,
12246     std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats) {
12247   DBUG_ENTER_FUNC();
12248 
12249   init_stats(to_recalc, stats);
12250 
12251   // find per column family key ranges which need to be queried
12252   std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
12253       ranges;
12254   std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);
12255 
12256   uchar *bufp = buf.data();
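  // Each index contributes one range covering exactly its index id; every
  // range consumes 2 * INDEX_NUMBER_SIZE bytes of the shared key buffer.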
12257   for (const auto &it : to_recalc) {
12258     auto &kd = it.second;
12259     ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp));
12260     bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE;
12261   }
12262 
12263   // get RocksDB table properties for these ranges
12264   rocksdb::TablePropertiesCollection props;
12265   for (const auto &it : ranges) {
12266     const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
12267     const auto status = rdb->GetPropertiesOfTablesInRange(
12268         it.first, &it.second[0], it.second.size(), &props);
12269     assert(props.size() >= old_size);
12270     if (!status.ok()) {
12271       DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql(
12272           status, "Could not access RocksDB properties"));
12273     }
12274   }
12275 
12276   int num_sst = 0;
12277   for (const auto &it : props) {
12278     std::vector<Rdb_index_stats> sst_stats;
12279     Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
12280     /*
12281       sst_stats is a list of index statistics for indexes that have entries
12282       in the current SST file.
12283     */
12284     for (const auto &it1 : sst_stats) {
12285       /*
12286         Only update statistics for indexes that belong to this SQL table.
12287 
12288         The reason is: We are walking through all SST files that have
12289         entries from this table (and so can compute good statistics). For
12290         other SQL tables, it can be that we're only seeing a small fraction
12291         of table's entries (and so we can't update statistics based on that).
12292       */
12293       if (stats->find(it1.m_gl_index_id) == stats->end()) {
12294         continue;
12295       }
12296 
12297       auto it_index = to_recalc.find(it1.m_gl_index_id);
12298       assert(it_index != to_recalc.end());
12299       if (it_index == to_recalc.end()) {
12300         continue;
12301       }
12302 
12303       (*stats)[it1.m_gl_index_id].merge(
12304           it1, true, it_index->second->max_storage_fmt_length());
12305     }
12306     num_sst++;
12307   }
12308 
12309   DBUG_RETURN(HA_EXIT_SUCCESS);
12310 }
12311 
12312 static int calculate_stats(
12313     const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12314         &to_recalc,
12315     table_cardinality_scan_type scan_type, THD::killed_state volatile *killed) {
12316   DBUG_ENTER_FUNC();
12317 
12318   std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
12319   int ret = read_stats_from_ssts(to_recalc, &stats);
12320   if (ret != HA_EXIT_SUCCESS) {
12321     DBUG_RETURN(ret);
12322   }
12323 
12324   if (scan_type != SCAN_TYPE_NONE) {
12325     std::unordered_map<GL_INDEX_ID, Rdb_index_stats> card_stats;
12326     uint64_t max_num_rows_scanned = rocksdb_table_stats_max_num_rows_scanned;
12327     ret = calculate_cardinality_table_scan(to_recalc, &card_stats, scan_type,
12328                                            max_num_rows_scanned, killed);
12329     if (ret != HA_EXIT_SUCCESS) {
12330       DBUG_RETURN(ret);
12331     }
12332 
12333     if (scan_type == SCAN_TYPE_FULL_TABLE) {
12334       reset_cardinality(&stats);
12335     }
12336 
12337     merge_stats(to_recalc, &stats, card_stats);
12338     if (scan_type == SCAN_TYPE_FULL_TABLE && max_num_rows_scanned > 0) {
12339       adjust_cardinality(&stats, scan_type, max_num_rows_scanned);
12340     }
12341   }
12342 
12343   // set and persist new stats
12344   ddl_manager.set_stats(stats);
12345   ddl_manager.persist_stats(true);
12346 
12347   DBUG_RETURN(HA_EXIT_SUCCESS);
12348 }
12349 
12350 static int calculate_stats_for_table(
12351     const std::string &tbl_name, table_cardinality_scan_type scan_type,
12352     THD::killed_state volatile *killed = nullptr) {
12353   DBUG_ENTER_FUNC();
12354   std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>> to_recalc;
12355   std::vector<GL_INDEX_ID> indexes;
12356   ddl_manager.find_indexes(tbl_name, &indexes);
12357 
12358   for (const auto &index : indexes) {
12359     std::shared_ptr<const Rdb_key_def> keydef = ddl_manager.safe_find(index);
12360 
12361     if (keydef) {
12362       to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef));
12363     }
12364   }
12365 
12366   if (to_recalc.empty()) {
12367     DBUG_RETURN(HA_EXIT_FAILURE);
12368   }
12369 
12370   DBUG_EXECUTE_IF("rocksdb_is_bg_thread_drop_table", {
12371     if (tbl_name == "test.t") {
12372       THD *thd = new THD();
12373       thd->thread_stack = reinterpret_cast<char *>(&thd);
12374       thd->store_globals();
12375 
12376       static constexpr char act[] = "now signal ready_to_drop_table";
12377       assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
12378 
12379       thd->restore_globals();
12380       delete thd;
12381     }
12382   });
12383 
12384   int err = calculate_stats(to_recalc, scan_type, killed);
12385   if (err != HA_EXIT_SUCCESS) {
12386     DBUG_RETURN(err);
12387   }
12388 
12389   DBUG_EXECUTE_IF("rocksdb_is_bg_thread_drop_table", {
12390     if (tbl_name == "test.t") {
12391       THD *thd = new THD();
12392       thd->thread_stack = reinterpret_cast<char *>(&thd);
12393       thd->store_globals();
12394 
12395       static constexpr char act[] = "now wait_for ready_to_save_table_stats";
12396       assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
12397 
12398       thd->restore_globals();
12399       delete thd;
12400     }
12401   });
12402 
12403   if (scan_type == SCAN_TYPE_FULL_TABLE) {
12404     // Save table stats including number of rows
12405     // and modified counter
12406     ddl_manager.set_table_stats(tbl_name);
12407   }
12408 
12409   DBUG_RETURN(HA_EXIT_SUCCESS);
12410 }
12411 
12412 /*
12413   @return
12414     HA_ADMIN_OK      OK
12415     other            HA_ADMIN error code
12416 */
12417 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
12418   DBUG_ENTER_FUNC();
12419 
12420   if (table) {
12421     table_cardinality_scan_type scan_type = rocksdb_table_stats_use_table_scan
12422                                                 ? SCAN_TYPE_FULL_TABLE
12423                                                 : SCAN_TYPE_MEMTABLE_ONLY;
12424 
12425     if (calculate_stats_for_table(m_tbl_def->full_tablename(), scan_type,
12426                                   &(thd->killed)) != HA_EXIT_SUCCESS) {
12427       DBUG_RETURN(HA_ADMIN_FAILED);
12428     }
12429   }
12430 
12431   DBUG_RETURN(HA_ADMIN_OK);
12432 }
12433 
12434 int ha_rocksdb::adjust_handler_stats_sst_and_memtable() {
12435   DBUG_ENTER_FUNC();
12436 
12437   /*
12438     If any stats are negative due to bad cached stats, re-run analyze table
12439     and re-retrieve the stats.
12440   */
12441   if (static_cast<longlong>(stats.data_file_length) < 0 ||
12442       static_cast<longlong>(stats.index_file_length) < 0 ||
12443       static_cast<longlong>(stats.records) < 0) {
12444     if (calculate_stats_for_table(m_tbl_def->full_tablename(),
12445                                   SCAN_TYPE_NONE)) {
12446       DBUG_RETURN(HA_EXIT_FAILURE);
12447     }
12448 
12449     update_stats();
12450   }
12451 
12452   // if number of records is hardcoded, we do not want to force computation
12453   // of memtable cardinalities
12454   if (stats.records == 0 || (rocksdb_force_compute_memtable_stats &&
12455                              rocksdb_debug_optimizer_n_rows == 0)) {
12456     // First, compute SST files stats
12457     uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12458     auto r = get_range(pk_index(table, m_tbl_def), buf);
12459     uint64_t sz = 0;
12460 
12461     uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
12462 
12463     // recompute SST files stats only if records count is 0
12464     if (stats.records == 0) {
12465       rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz, include_flags);
12466       stats.records += sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
12467       stats.data_file_length += sz;
12468     }
12469 
12470     // Second, compute memtable stats. This call is expensive, so cache
12471     // values computed for some time.
12472     uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime;
12473     uint64_t time = (cachetime == 0) ? 0 : my_micro_time();
12474     if (cachetime == 0 ||
12475         time > m_table_handler->m_mtcache_last_update + cachetime) {
12476       uint64_t memtableCount;
12477       uint64_t memtableSize;
12478 
      // The stats below are calculated from the skiplist, which is a
      // probabilistic data structure, so the results vary between test runs.
      // It can also return 0 for quite large tables, which means that the
      // cardinality of memtable-only indexes will be reported as 0.
12483 
12484       rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r, &memtableCount,
12485                                        &memtableSize);
12486 
12487       // Atomically update all of these fields at the same time
12488       if (cachetime > 0) {
12489         if (m_table_handler->m_mtcache_lock.fetch_add(
12490                 1, std::memory_order_acquire) == 0) {
12491           m_table_handler->m_mtcache_count = memtableCount;
12492           m_table_handler->m_mtcache_size = memtableSize;
12493           m_table_handler->m_mtcache_last_update = time;
12494         }
12495         m_table_handler->m_mtcache_lock.fetch_sub(1, std::memory_order_release);
12496       }
12497 
12498       stats.records += memtableCount;
12499       stats.data_file_length += memtableSize;
12500     } else {
12501       // Cached data is still valid, so use it instead
12502       stats.records += m_table_handler->m_mtcache_count;
12503       stats.data_file_length += m_table_handler->m_mtcache_size;
12504     }
12505   }
12506 
12507   DBUG_RETURN(HA_EXIT_SUCCESS);
12508 }
12509 
12510 void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
12511                                     ulonglong nb_desired_values,
12512                                     ulonglong *const first_value,
12513                                     ulonglong *const nb_reserved_values) {
12514   /*
12515     MySQL has a somewhat complicated way of handling the auto-increment value.
12516     The first time get_auto_increment is called for a statement,
12517     nb_desired_values is the estimate for how many values will be needed.  The
12518     engine can then reserve some values, and those will be automatically used
12519     by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
12523     reserve any extra values for a multi-insert statement.  Each row will
12524     simply acquire the next value as needed and we will always tell MySQL that
12525     we only reserved 1 value.  Since we are using an atomic value for
12526     m_auto_incr_val this should be safe - if we had to grab a mutex, doing
12527     an actual reserve of some values might be a better solution.
12528    */
12529   DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");
12530 
12531   if (off > inc) {
12532     off = 1;
12533   }
12534 
12535   Field *field;
12536   ulonglong new_val, max_val;
12537   field = table->key_info[table->s->next_number_index].key_part[0].field;
12538   max_val = rdb_get_int_col_max_value(field);
12539 
12540   // Local variable reference to simplify code below
12541   auto &auto_incr = m_tbl_def->m_auto_incr_val;
12542 
12543   if (inc == 1) {
12544     assert(off == 1);
12545     // Optimization for the standard case where we are always simply
12546     // incrementing from the last position
12547 
    // Use a CAS operation in a loop to atomically get the next auto-increment
    // value while ensuring that we don't wrap around past the maximum.
12551     //
12552     // We set auto_incr to the min of max_val and new_val + 1. This means that
12553     // if we're at the maximum, we should be returning the same value for
12554     // multiple rows, resulting in duplicate key errors (as expected).
12555     //
12556     // If we return values greater than the max, the SQL layer will "truncate"
12557     // the value anyway, but it means that we store invalid values into
12558     // auto_incr that will be visible in SHOW CREATE TABLE.
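    //
    // Illustration (assumed interleaving): two threads both read new_val = 7.
    // One CAS succeeds, storing 8 into auto_incr and returning 7 to its
    // caller. The other thread's CAS fails and reloads new_val with the
    // current value 8; its next iteration stores 9 and returns 8, so no value
    // is handed out twice.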
12559     new_val = auto_incr;
12560     while (new_val != std::numeric_limits<ulonglong>::max()) {
12561       if (auto_incr.compare_exchange_weak(new_val,
12562                                           std::min(new_val + 1, max_val))) {
12563         break;
12564       }
12565     }
12566   } else {
12567     // The next value can be more complicated if either 'inc' or 'off' is not 1
12568     ulonglong last_val = auto_incr;
12569 
12570     if (last_val > max_val) {
12571       new_val = std::numeric_limits<ulonglong>::max();
12572     } else {
12573       // Loop until we can correctly update the atomic value
12574       do {
12575         assert(last_val > 0);
12576         // Calculate the next value in the auto increment series: offset
12577         // + N * increment where N is 0, 1, 2, ...
12578         //
12579         // For further information please visit:
12580         // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
12581         //
12582         // The following is confusing so here is an explanation:
12583         // To get the next number in the sequence above you subtract out the
12584         // offset, calculate the next sequence (N * increment) and then add the
12585         // offset back in.
12586         //
12587         // The additions are rearranged to avoid overflow.  The following is
12588         // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
12589         // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
12590         //
12591         // (a+b)/c
12592         // = (a - a%c + a%c + b - b%c + b%c) / c
12593         // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
12594         // = a/c + b/c + (a%c + b%c) / c
12595         //
12596         // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
12597         // following statement.
12598         ulonglong n =
12599             (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;
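        // Worked example (illustrative): with off = 3, inc = 5 and
        // last_val = 14, the series is {3, 8, 13, 18, ...}. Then
        //   n = 13/5 + (13%5 + 5 - 3)/5 = 2 + 1 = 3
        // and new_val = n*inc + off = 18, the first series member >= last_val.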
12600 
12601         // Check if n * inc + off will overflow. This can only happen if we have
12602         // an UNSIGNED BIGINT field.
12603         if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
12604           assert(max_val == std::numeric_limits<ulonglong>::max());
          // The 'last_val' value is already equal to or larger than the largest
          // value in the sequence.  Continuing would silently wrap the unsigned
          // arithmetic around.  What should we do?
12608           // We could:
12609           //   1) set the new value to the last possible number in our sequence
12610           //      as described above.  The problem with this is that this
12611           //      number could be smaller than a value in an existing row.
12612           //   2) set the new value to the largest possible number.  This number
12613           //      may not be in our sequence, but it is guaranteed to be equal
12614           //      to or larger than any other value already inserted.
12615           //
12616           //  For now I'm going to take option 2.
12617           //
12618           //  Returning ULLONG_MAX from get_auto_increment will cause the SQL
12619           //  layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
12620           //  the SE API for get_auto_increment, inserts will fail with
12621           //  ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
12622           //  inserts will fail with ER_DUP_ENTRY for other types (or no failure
12623           //  if the column is in a non-unique SK).
12624           new_val = std::numeric_limits<ulonglong>::max();
12625           auto_incr = new_val;  // Store the largest value into auto_incr
12626           break;
12627         }
12628 
12629         new_val = n * inc + off;
12630 
12631         // Attempt to store the new value (plus 1 since m_auto_incr_val contains
12632         // the next available value) into the atomic value.  If the current
12633         // value no longer matches what we have in 'last_val' this will fail and
12634         // we will repeat the loop (`last_val` will automatically get updated
12635         // with the current value).
12636         //
12637         // See above explanation for inc == 1 for why we use std::min.
12638       } while (!auto_incr.compare_exchange_weak(
12639           last_val, std::min(new_val + 1, max_val)));
12640     }
12641   }
12642 
12643   *first_value = new_val;
12644   *nb_reserved_values = 1;
12645 }
12646 
12647 #ifndef NDEBUG
12648 
12649 /* Debugger help function */
12650 static char dbug_item_print_buf[512];
12651 
12652 const char *dbug_print_item(Item *const item) {
12653   char *const buf = dbug_item_print_buf;
12654   String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
12655   str.length(0);
12656   if (!item) return "(Item*)nullptr";
12657   item->print(&str, QT_ORDINARY);
12658   if (str.c_ptr() == buf) {
12659     return buf;
12660   } else {
12661     return "Couldn't fit into buffer";
12662   }
12663 }
12664 
12665 #endif /*NDEBUG*/
12666 
12667 /**
12668   SQL layer calls this function to push an index condition.
12669 
12670   @details
12671     The condition is for index keyno (only one condition can be pushed at a
12672     time).
    The caller guarantees that the condition refers only to index fields; in
    addition, those fields must have

      $field->part_of_key.set_bit(keyno)

    set, which implies

       (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1

    i.e. the field value can be restored from the index tuple.
12683 
12684   @return
12685     Part of condition we couldn't check (always nullptr).
12686 */
12687 
12688 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
12689   DBUG_ENTER_FUNC();
12690 
12691   assert(keyno != MAX_KEY);
12692   assert(idx_cond != nullptr);
12693 
12694   pushed_idx_cond = idx_cond;
12695   pushed_idx_cond_keyno = keyno;
12696   in_range_check_pushed_down = true;
12697 
12698   /* We will check the whole condition */
12699   DBUG_RETURN(nullptr);
12700 }
12701 
12702 /*
12703   @brief
12704   Check the index condition.
12705 
12706   @detail
12707   Check the index condition. (The caller has unpacked all needed index
12708   columns into table->record[0])
12709 
12710   @return
12711     ICP_NO_MATCH - Condition not satisfied (caller should continue
12712                    scanning)
    ICP_OUT_OF_RANGE - We've left the range we're scanning (caller should
                       stop scanning and return HA_ERR_END_OF_FILE)
12715 
12716     ICP_MATCH    - Condition is satisfied (caller should fetch the record
12717                    and return it)
12718 */
12719 
12720 enum icp_result ha_rocksdb::check_index_cond() const {
12721   assert(pushed_idx_cond);
12722   assert(pushed_idx_cond_keyno != MAX_KEY);
12723 
12724   if (end_range && compare_key_icp(end_range) > 0) {
12725     /* caller should return HA_ERR_END_OF_FILE already */
12726     return ICP_OUT_OF_RANGE;
12727   }
12728 
12729   return pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
12730 }
12731 
12732 /*
12733   Checks if inplace alter is supported for a given operation.
12734 */
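/*
  Illustration (a summary of the checks below, not an exhaustive matrix):
  ADD INDEX, DROP INDEX, changing the AUTO_INCREMENT table option and, when
  rocksdb_alter_column_default_inplace is enabled, changing a column default
  are accepted for in-place ALTER. Any other change returns
  HA_ALTER_INPLACE_NOT_SUPPORTED, so the SQL layer falls back to a copying
  ALTER (or fails if ALGORITHM=INPLACE was requested explicitly).
*/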
12735 
12736 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
12737     TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
12738   DBUG_ENTER_FUNC();
12739 
12740   assert(ha_alter_info != nullptr);
12741 
12742   if (ha_alter_info->handler_flags &
12743       ~(my_core::Alter_inplace_info::DROP_INDEX |
12744         my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
12745         my_core::Alter_inplace_info::ADD_INDEX |
12746         my_core::Alter_inplace_info::ADD_UNIQUE_INDEX |
12747         my_core::Alter_inplace_info::CHANGE_CREATE_OPTION |
12748         (rocksdb_alter_column_default_inplace
12749              ? my_core::Alter_inplace_info::ALTER_COLUMN_DEFAULT
12750              : 0))) {
12751     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12752   }
12753 
  /* We don't support unique keys on tables with no primary key */
12755   if ((ha_alter_info->handler_flags &
12756        my_core::Alter_inplace_info::ADD_UNIQUE_INDEX) &&
12757       has_hidden_pk(altered_table)) {
12758     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12759   }
12760 
  /* The only CREATE OPTION change we support in-place is auto_increment. */
12762   if ((ha_alter_info->handler_flags &
12763        my_core::Alter_inplace_info::CHANGE_CREATE_OPTION) &&
12764       !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) {
12765     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12766   }
12767 
12768   DBUG_RETURN(my_core::HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
12769 }
12770 
12771 /**
12772   Allows the storage engine to update internal structures with concurrent
12773   writes blocked. If check_if_supported_inplace_alter() returns
  HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE or
  HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE, this function is called with an
  exclusive lock; otherwise, the same level of locking as for
  inplace_alter_table() will be used.
12778 
12779   @note Storage engines are responsible for reporting any errors by
12780   calling my_error()/print_error()
12781 
12782   @note If this function reports error, commit_inplace_alter_table()
12783   will be called with commit= false.
12784 
  @note For partitioning, failing to prepare one partition means that
12786   commit_inplace_alter_table() will be called to roll back changes for
12787   all partitions. This means that commit_inplace_alter_table() might be
12788   called without prepare_inplace_alter_table() having been called first
12789   for a given partition.
12790 
12791   @param    altered_table     TABLE object for new version of table.
12792   @param    ha_alter_info     Structure describing changes to be done
12793                               by ALTER TABLE and holding data used
12794                               during in-place alter.
12795 
12796   @retval   true              Error
12797   @retval   false             Success
12798 */
12799 bool ha_rocksdb::prepare_inplace_alter_table(
12800     TABLE *const altered_table,
12801     my_core::Alter_inplace_info *const ha_alter_info) {
12802   DBUG_ENTER_FUNC();
12803 
12804   assert(altered_table != nullptr);
12805   assert(ha_alter_info != nullptr);
12806 
12807   Rdb_tbl_def *new_tdef = nullptr;
12808   std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr;
12809   std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
12810   uint old_n_keys = m_tbl_def->m_key_count;
12811   uint new_n_keys = altered_table->s->keys;
12812   std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
12813   std::unordered_set<GL_INDEX_ID> dropped_index_ids;
12814   uint n_dropped_keys = 0;
12815   uint n_added_keys = 0;
12816   ulonglong max_auto_incr = 0;
12817 
12818   if (ha_alter_info->handler_flags &
12819       (my_core::Alter_inplace_info::DROP_INDEX |
12820        my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
12821        my_core::Alter_inplace_info::ADD_INDEX |
12822        my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
12823     if (has_hidden_pk(altered_table)) {
12824       new_n_keys += 1;
12825     }
12826 
12827     const TABLE *const old_table = table;
12828     old_key_descr = m_tbl_def->m_key_descr_arr;
12829     new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];
12830 
12831     new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
12832     new_tdef->m_key_descr_arr = new_key_descr;
12833     new_tdef->m_key_count = new_n_keys;
12834     new_tdef->m_auto_incr_val =
12835         m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
12836     new_tdef->m_hidden_pk_val =
12837         m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);
12838 
12839     if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
12840       /* Delete the new key descriptors */
12841       delete[] new_key_descr;
12842 
12843       /*
12844         Explicitly mark as nullptr so we don't accidentally remove entries
12845         from data dictionary on cleanup (or cause double delete[]).
12846         */
12847       new_tdef->m_key_descr_arr = nullptr;
12848       delete new_tdef;
12849 
12850       my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
12851       DBUG_RETURN(HA_EXIT_FAILURE);
12852     }
12853 
12854     uint i;
12855     uint j;
12856 
    /* Determine which (if any) key definition(s) need to be dropped */
12858     for (i = 0; i < ha_alter_info->index_drop_count; i++) {
12859       const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i];
12860       for (j = 0; j < old_n_keys; j++) {
12861         const KEY *const old_key =
12862             &old_table->key_info[old_key_descr[j]->get_keyno()];
12863 
12864         if (!compare_keys(old_key, dropped_key)) {
12865           dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
12866           break;
12867         }
12868       }
12869     }
12870 
    /* Determine which (if any) key definition(s) need to be added */
12872     int identical_indexes_found = 0;
12873     for (i = 0; i < ha_alter_info->index_add_count; i++) {
12874       const KEY *const added_key =
12875           &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
12876       for (j = 0; j < new_n_keys; j++) {
12877         const KEY *const new_key =
12878             &altered_table->key_info[new_key_descr[j]->get_keyno()];
12879         if (!compare_keys(new_key, added_key)) {
12880           /*
12881             Check for cases where an 'identical' index is being dropped and
12882             re-added in a single ALTER statement.  Turn this into a no-op as the
12883             index has not changed.
12884 
            E.g. a unique index changed to a non-unique index requires no
            change.

            Note that the case where the index name remains the same but the
            key parts change is already handled in create_inplace_key_defs;
            in that case the index needs to be rebuilt.
12890             */
12891           if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) {
12892             dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id());
12893             identical_indexes_found++;
12894           } else {
12895             added_indexes.insert(new_key_descr[j]);
12896           }
12897 
12898           break;
12899         }
12900       }
12901     }
12902 
12903     n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found;
12904     n_added_keys = ha_alter_info->index_add_count - identical_indexes_found;
12905     assert(dropped_index_ids.size() == n_dropped_keys);
12906     assert(added_indexes.size() == n_added_keys);
12907     assert(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys));
12908   }
12909   if (ha_alter_info->handler_flags &
12910       my_core::Alter_inplace_info::CHANGE_CREATE_OPTION) {
12911     if (!new_tdef) {
12912       new_tdef = m_tbl_def;
12913     }
12914     if (table->found_next_number_field) {
12915       max_auto_incr = load_auto_incr_value_from_index();
12916     }
12917   }
12918 
12919   ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
12920       new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
12921       added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys,
12922       max_auto_incr);
12923   DBUG_RETURN(HA_EXIT_SUCCESS);
12924 }
12925 
12926 /**
12927   Alter the table structure in-place with operations specified using
12928   HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
12929   during this operation depends on the return value from
12930   check_if_supported_inplace_alter().
12931 
12932   @note Storage engines are responsible for reporting any errors by
12933   calling my_error()/print_error()
12934 
12935   @note If this function reports error, commit_inplace_alter_table()
12936   will be called with commit= false.
12937 
12938   @param    altered_table     TABLE object for new version of table.
12939   @param    ha_alter_info     Structure describing changes to be done
12940                               by ALTER TABLE and holding data used
12941                               during in-place alter.
12942 
12943   @retval   true              Error
12944   @retval   false             Success
12945 */
12946 bool ha_rocksdb::inplace_alter_table(
12947     TABLE *const altered_table,
12948     my_core::Alter_inplace_info *const ha_alter_info) {
12949   DBUG_ENTER_FUNC();
12950 
12951   assert(altered_table != nullptr);
12952   assert(ha_alter_info != nullptr);
12953   assert(ha_alter_info->handler_ctx != nullptr);
12954 
12955   Rdb_inplace_alter_ctx *const ctx =
12956       static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);
12957 
12958   if (ha_alter_info->handler_flags &
12959       (my_core::Alter_inplace_info::ADD_INDEX |
12960        my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
12961     /*
12962       Buffers need to be set up again to account for new, possibly longer
12963       secondary keys.
12964     */
12965     free_key_buffers();
12966 
12967     assert(ctx != nullptr);
12968 
12969     /*
12970       If adding unique index, allocate special buffers for duplicate checking.
12971     */
12972     int err;
12973     if ((err = alloc_key_buffers(
12974              altered_table, ctx->m_new_tdef,
12975              ha_alter_info->handler_flags &
12976                  my_core::Alter_inplace_info::ADD_UNIQUE_INDEX))) {
12977       my_error(ER_OUT_OF_RESOURCES, MYF(0));
12978       DBUG_RETURN(err);
12979     }
12980 
12981     /* Populate all new secondary keys by scanning the primary key. */
12982     if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
12983       my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
12984       DBUG_RETURN(HA_EXIT_FAILURE);
12985     }
12986   }
12987 
12988   DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
12989     dbug_create_err_inplace_alter();
12990     DBUG_RETURN(HA_EXIT_FAILURE);
12991   };);
12992 
12993   DBUG_RETURN(HA_EXIT_SUCCESS);
12994 }
12995 
12996 /**
12997  Scan the Primary Key index entries and populate the new secondary keys.
12998 */
12999 int ha_rocksdb::inplace_populate_sk(
13000     TABLE *const new_table_arg,
13001     const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
13002   DBUG_ENTER_FUNC();
13003   int res = HA_EXIT_SUCCESS;
13004   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13005   rocksdb::WriteBatch *const batch = wb.get();
13006 
13007   DBUG_EXECUTE_IF("rocksdb_inplace_populate_sk", {
13008     static constexpr char act[] =
13009         "now signal ready_to_mark_cf_dropped_in_populate_sk "
13010         "wait_for mark_cf_dropped_done_in_populate_sk";
13011     assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
13012   });
13013 
13014   {
13015     std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
13016     for (const auto &kd : indexes) {
13017       const std::string cf_name = kd->get_cf()->GetName();
13018       std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
13019           cf_manager.get_cf(cf_name);
13020 
13021       if (!cfh || cfh != kd->get_shared_cf()) {
13022         // The CF has been dropped, i.e., cf_manager.remove_dropped_cf() has
13023         // been called.
13024         DBUG_RETURN(HA_EXIT_FAILURE);
13025       }
13026 
13027       uint32 cf_id = cfh->GetID();
13028       if (dict_manager.get_dropped_cf(cf_id)) {
13029         DBUG_RETURN(HA_EXIT_FAILURE);
13030       }
13031     }
13032 
13033     /* Update the data dictionary */
13034     std::unordered_set<GL_INDEX_ID> create_index_ids;
13035     for (const auto &index : indexes) {
13036       create_index_ids.insert(index->get_gl_index_id());
13037     }
13038     dict_manager.add_create_index(create_index_ids, batch);
13039     res = dict_manager.commit(batch);
13040     if (res != HA_EXIT_SUCCESS) {
      DBUG_RETURN(res);
13042     }
13043 
13044     /*
      Add uncommitted key definitions to ddl_manager.  We need to do this
13046       so that the property collector can find this keydef when it needs to
13047       update stats.  The property collector looks for the keydef in the
13048       data dictionary, but it won't be there yet since this key definition
13049       is still in the creation process.
13050     */
13051     ddl_manager.add_uncommitted_keydefs(indexes);
13052   }
13053 
13054   const bool hidden_pk_exists = has_hidden_pk(table);
13055 
13056   Rdb_transaction *tx = get_or_create_tx(table->in_use);
13057 
13058   /*
13059     There is one specific scenario where m_sst_info may not be nullptr. This
    happens if the handler we're using happens to be the one on which the PK
    bulk load was done. The sequence of events that leads to this is as
13062     follows (T1 is PK bulk load, T2 is SK alter table):
13063 
13064     T1: Execute last INSERT statement
13065     T1: Return TABLE and handler object back to Table_cache_manager
13066     T1: Close connection
13067     T2: Execute ALTER statement
13068     T2: Take same TABLE/handler from Table_cache_manager
13069     T2: Call closefrm which will call finalize_bulk_load on every other open
13070         table/handler *except* the one it's on.
13071     T2: Acquire stale snapshot of PK
13072     T1: Call finalize_bulk_load
13073 
13074     This is rare because usually, closefrm will call the destructor (and thus
13075     finalize_bulk_load) on the handler where PK bulk load is done. However, if
13076     the thread ids of the bulk load thread and the alter thread differ by a
13077     multiple of table_cache_instances (8 by default), then they hash to the
    same bucket in Table_cache_manager and the alter thread will not call
13079     the destructor on the handler it is holding. Thus, its m_sst_info will not
13080     be nullptr.
13081 
    At this point, it is safe to refresh the snapshot because we know all
    other open handlers have been closed, and the one we're on is the only
    one left.
13085   */
13086   if (m_sst_info) {
13087     if ((res = finalize_bulk_load())) {
13088       DBUG_RETURN(res);
13089     }
13090     tx->commit();
13091   }
13092 
13093   const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
13094   const ulonglong rdb_merge_combine_read_size =
13095       THDVAR(ha_thd(), merge_combine_read_size);
13096   const ulonglong rdb_merge_tmp_file_removal_delay =
13097       THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
13098 
13099   for (const auto &index : indexes) {
13100     bool is_unique_index =
13101         new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
13102 
13103     Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size,
13104                               rdb_merge_combine_read_size,
13105                               rdb_merge_tmp_file_removal_delay,
13106                               index->get_cf());
13107 
13108     if ((res = rdb_merge.init())) {
13109       DBUG_RETURN(res);
13110     }
13111 
13112     /*
13113       Note: We pass in the currently existing table + tbl_def object here,
13114       as the pk index position may have changed in the case of hidden primary
13115       keys.
13116     */
13117     const uint pk = pk_index(table, m_tbl_def);
13118     res = ha_index_init(pk, true);
13119     if (res) DBUG_RETURN(res);
13120 
13121     /* Scan each record in the primary key in order */
13122     for (res = index_first(table->record[0]); res == 0;
13123          res = index_next(table->record[0])) {
13124       longlong hidden_pk_id = 0;
13125       if (hidden_pk_exists &&
13126           (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) {
13127         // NO_LINT_DEBUG
13128         sql_print_error("Error retrieving hidden pk id.");
13129         ha_index_end();
13130         DBUG_RETURN(res);
13131       }
13132 
13133       /* Create new secondary index entry */
13134       const int new_packed_size = index->pack_record(
13135           new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
13136           &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
13137           nullptr, m_ttl_bytes);
13138 
13139       const rocksdb::Slice key = rocksdb::Slice(
13140           reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
13141       const rocksdb::Slice val =
13142           rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
13143                          m_sk_tails.get_current_pos());
13144 
13145       /*
13146         Add record to offset tree in preparation for writing out to
13147         disk in sorted chunks.
13148       */
13149       if ((res = rdb_merge.add(key, val))) {
13150         ha_index_end();
13151         DBUG_RETURN(res);
13152       }
13153     }
13154 
13155     if (res != HA_ERR_END_OF_FILE) {
13156       // NO_LINT_DEBUG
13157       sql_print_error("Error retrieving index entry from primary key.");
13158       ha_index_end();
13159       DBUG_RETURN(res);
13160     }
13161 
13162     ha_index_end();
13163 
13164     /*
      Perform an n-way merge of the n sorted buffers on disk, then write all
      results to RocksDB via the SSTFileWriter API.
13167     */
13168     rocksdb::Slice merge_key;
13169     rocksdb::Slice merge_val;
13170 
13171     struct unique_sk_buf_info sk_info;
13172     sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
13173     sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
13174 
13175     while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
13176       /* Perform uniqueness check if needed */
13177       if (is_unique_index) {
13178         if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
13179           /*
13180             Duplicate entry found when trying to create unique secondary key.
13181             We need to unpack the record into new_table_arg->record[0] as it
13182             is used inside print_keydup_error so that the error message shows
13183             the duplicate record.
13184           */
13185           if (index->unpack_record(
13186                   new_table_arg, new_table_arg->record[0], &merge_key,
13187                   &merge_val, m_converter->get_verify_row_debug_checksums())) {
13188             /* Should never reach here */
13189             assert(0);
13190           }
13191 
13192           print_keydup_error(new_table_arg,
13193                              &new_table_arg->key_info[index->get_keyno()],
13194                              MYF(0));
13195           DBUG_RETURN(ER_DUP_ENTRY);
13196         }
13197       }
13198 
13199       /*
13200         Insert key and slice to SST via SSTFileWriter API.
13201       */
13202       if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) {
13203         break;
13204       }
13205     }
13206 
13207     /*
13208       Here, res == -1 means that we are finished, while > 0 means an error
13209       occurred.
13210     */
13211     if (res > 0) {
13212       // NO_LINT_DEBUG
13213       sql_print_error("Error while bulk loading keys in external merge sort.");
13214       DBUG_RETURN(res);
13215     }
13216 
13217     bool is_critical_error;
13218     res = tx->finish_bulk_load(&is_critical_error);
13219     if (res && is_critical_error) {
13220       // NO_LINT_DEBUG
13221       sql_print_error("Error finishing bulk load.");
13222       DBUG_RETURN(res);
13223     }
13224   }
13225 
13226   /*
13227     Explicitly tell jemalloc to clean up any unused dirty pages at this point.
13228     See https://reviews.facebook.net/D63723 for more details.
13229   */
13230   purge_all_jemalloc_arenas();
13231 
13232   DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
13233   DBUG_RETURN(res);
13234 }
13235 
13236 /**
13237   Commit or rollback the changes made during prepare_inplace_alter_table()
13238   and inplace_alter_table() inside the storage engine.
13239   Note that in case of rollback the allowed level of concurrency during
13240   this operation will be the same as for inplace_alter_table() and thus
13241   might be higher than during prepare_inplace_alter_table(). (For example,
13242   concurrent writes were blocked during prepare, but might not be during
13243   rollback).
13244 
13245   @note Storage engines are responsible for reporting any errors by
13246   calling my_error()/print_error()
13247 
13248   @note If this function with commit= true reports error, it will be called
13249   again with commit= false.
13250 
13251   @note In case of partitioning, this function might be called for rollback
13252   without prepare_inplace_alter_table() having been called first.
  Also, for partitioned tables, ha_alter_info->group_commit_ctx is set to a
  NULL-terminated array of the partition handlers' contexts, and if all of
  them are committed as one, group_commit_ctx should be set to NULL to
  indicate to the partitioning handler that all partition handlers are
  committed.
13257   @see prepare_inplace_alter_table().
13258 
13259   @param    altered_table     TABLE object for new version of table.
13260   @param    ha_alter_info     Structure describing changes to be done
13261                               by ALTER TABLE and holding data used
13262                               during in-place alter.
13263   @param    commit            True => Commit, False => Rollback.
13264 
13265   @retval   true              Error
13266   @retval   false             Success
13267 */
13268 bool ha_rocksdb::commit_inplace_alter_table(
13269     my_core::TABLE *const altered_table,
13270     my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
13271   DBUG_ENTER_FUNC();
13272 
13273   assert(altered_table != nullptr);
13274   assert(ha_alter_info != nullptr);
13275 
13276   Rdb_inplace_alter_ctx *const ctx0 =
13277       static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);
13278 
13279   DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table");
13280 
13281   /*
13282     IMPORTANT: When rollback is requested, mysql will abort with
13283     an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes whose creation
    was still in progress will be detected when the server restarts, and
    dropped.
13286 
13287     For partitioned tables, a rollback call to this function (commit == false)
13288     is done for each partition.  A successful commit call only executes once
13289     for all partitions.
13290   */
13291   if (!commit) {
13292     /* If ctx has not been created yet, nothing to do here */
13293     if (!ctx0) {
13294       DBUG_RETURN(HA_EXIT_SUCCESS);
13295     }
13296 
13297     /*
13298       Cannot call destructor for Rdb_tbl_def directly because we don't want to
13299       erase the mappings inside the ddl_manager, as the old_key_descr is still
13300       using them.
13301     */
13302     if (ctx0->m_new_key_descr) {
13303       /* Delete the new key descriptors */
13304       for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
13305         ctx0->m_new_key_descr[i] = nullptr;
13306       }
13307 
13308       delete[] ctx0->m_new_key_descr;
13309       ctx0->m_new_key_descr = nullptr;
13310       ctx0->m_new_tdef->m_key_descr_arr = nullptr;
13311 
13312       delete ctx0->m_new_tdef;
13313     }
13314 
13315     {
13316       std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
      /* Remove uncommitted key definitions from ddl_manager */
13318       ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);
13319 
13320       std::unordered_set<GL_INDEX_ID> all_gl_index_ids;
13321       dict_manager.get_ongoing_create_indexes(&all_gl_index_ids);
13322 
13323       std::unordered_set<GL_INDEX_ID> gl_index_ids;
13324       for (auto index : ctx0->m_added_indexes) {
13325         auto gl_index_id = index->get_gl_index_id();
13326         if (all_gl_index_ids.find(gl_index_id) != all_gl_index_ids.end()) {
13327           gl_index_ids.insert(gl_index_id);
13328         }
13329       }
13330 
13331       if (!gl_index_ids.empty()) {
13332         /* Rollback any partially created indexes of this table */
13333         dict_manager.rollback_ongoing_index_creation(gl_index_ids);
13334       }
13335     }
13336 
13337     DBUG_RETURN(HA_EXIT_SUCCESS);
13338   }
13339 
13340   assert(ctx0);
13341 
13342   /*
13343     For partitioned tables, we need to commit all changes to all tables at
13344     once, unlike in the other inplace alter API methods.
13345   */
13346   inplace_alter_handler_ctx **ctx_array;
13347   inplace_alter_handler_ctx *ctx_single[2];
13348 
13349   if (ha_alter_info->group_commit_ctx) {
13350     DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
13351     ctx_array = ha_alter_info->group_commit_ctx;
13352   } else {
13353     ctx_single[0] = ctx0;
13354     ctx_single[1] = nullptr;
13355     ctx_array = ctx_single;
13356   }
13357 
13358   assert(ctx0 == ctx_array[0]);
13359   ha_alter_info->group_commit_ctx = nullptr;
13360 
13361   if (ha_alter_info->handler_flags &
13362       (my_core::Alter_inplace_info::DROP_INDEX |
13363        my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
13364        my_core::Alter_inplace_info::ADD_INDEX |
13365        my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
13366     const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13367     rocksdb::WriteBatch *const batch = wb.get();
13368     std::unordered_set<GL_INDEX_ID> create_index_ids;
13369 
13370     m_tbl_def = ctx0->m_new_tdef;
13371     m_key_descr_arr = m_tbl_def->m_key_descr_arr;
13372     m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];
13373 
13374     DBUG_EXECUTE_IF("rocksdb_commit_alter_table", {
13375       static constexpr char act[] =
13376           "now signal ready_to_mark_cf_dropped_before_commit_alter_table "
13377           "wait_for mark_cf_dropped_done_before_commit_alter_table";
13378       assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
13379     });
13380 
13381     {
13382       std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
13383       for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
13384         Rdb_inplace_alter_ctx *const ctx =
13385             static_cast<Rdb_inplace_alter_ctx *>(*pctx);
13386 
13387         /* Mark indexes to be dropped */
13388         dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);
13389 
13390         for (const auto &index : ctx->m_added_indexes) {
13391           create_index_ids.insert(index->get_gl_index_id());
13392         }
13393 
13394         if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
13395           /*
13396             Failed to write new entry into data dictionary, this should never
13397             happen.
13398           */
13399           assert(0);
13400         }
13401 
13402         /*
          Remove uncommitted key definitions from ddl_manager, as they are now
13404           committed into the data dictionary.
13405         */
13406         ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
13407       }
13408 
13409       if (dict_manager.commit(batch)) {
13410         /*
13411           Should never reach here. We assume MyRocks will abort if commit
13412           fails.
13413         */
13414         assert(0);
13415       }
13416 
13417       /* Mark ongoing create indexes as finished/remove from data dictionary */
13418       dict_manager.finish_indexes_operation(
13419           create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
13420     }
13421 
13422     DBUG_EXECUTE_IF("rocksdb_delete_index", {
13423       static constexpr char act[] =
13424           "now signal ready_to_mark_cf_dropped_after_commit_alter_table "
13425           "wait_for mark_cf_dropped_done_after_commit_alter_table";
13426       assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
13427     });
13428 
13429     rdb_drop_idx_thread.signal();
13430 
13431     if (rocksdb_table_stats_use_table_scan && !ctx0->m_added_indexes.empty()) {
13432       // If new indexes are created, add the table to the recalc queue
13433       // to calculate stats for new indexes
13434       rdb_is_thread.add_index_stats_request(m_tbl_def->full_tablename());
13435     }
13436   }
13437 
13438   if (ha_alter_info->handler_flags &
13439       (my_core::Alter_inplace_info::CHANGE_CREATE_OPTION)) {
13440     const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13441     rocksdb::WriteBatch *const batch = wb.get();
13442     std::unordered_set<GL_INDEX_ID> create_index_ids;
13443 
13444     ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;
13445 
13446     for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
13447       Rdb_inplace_alter_ctx *const ctx =
13448           static_cast<Rdb_inplace_alter_ctx *>(*pctx);
13449       auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
13450       dict_manager.put_auto_incr_val(
13451           batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
13452           true /* overwrite */);
13453       ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
13454     }
13455 
13456     if (dict_manager.commit(batch)) {
13457       assert(0);
13458     }
13459   }
13460 
13461   DBUG_RETURN(HA_EXIT_SUCCESS);
13462 }
13463 
13464 #define SHOW_FNAME(name) rocksdb_show_##name
13465 
13466 #define DEF_SHOW_FUNC(name, key)                                           \
13467   static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
13468     rocksdb_status_counters.name =                                         \
13469         rocksdb_stats->getTickerCount(rocksdb::key);                       \
13470     var->type = SHOW_LONGLONG;                                             \
13471     var->value = (char *)&rocksdb_status_counters.name;                    \
13472     return HA_EXIT_SUCCESS;                                                \
13473   }
13474 
13475 #define DEF_STATUS_VAR(name) \
13476   { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC, SHOW_SCOPE_GLOBAL }
13477 
13478 #define DEF_STATUS_VAR_PTR(name, ptr, option) \
13479   { "rocksdb_" name, (char *)ptr, option, SHOW_SCOPE_GLOBAL }
13480 
13481 #define DEF_STATUS_VAR_FUNC(name, ptr, option) \
13482   { name, reinterpret_cast<char *>(ptr), option, SHOW_SCOPE_GLOBAL }
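// For illustration, DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS) expands
// (roughly) to:
//
//   static int rocksdb_show_block_cache_miss(MYSQL_THD thd, SHOW_VAR *var,
//                                            char *buff) {
//     rocksdb_status_counters.block_cache_miss =
//         rocksdb_stats->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
//     var->type = SHOW_LONGLONG;
//     var->value = (char *)&rocksdb_status_counters.block_cache_miss;
//     return HA_EXIT_SUCCESS;
//   }
//
// and DEF_STATUS_VAR(block_cache_miss) produces the SHOW_VAR entry
//
//   { "rocksdb_block_cache_miss", (char *)&rocksdb_show_block_cache_miss,
//     SHOW_FUNC, SHOW_SCOPE_GLOBAL }
//
// so each counter is exported as a rocksdb_* row in SHOW GLOBAL STATUS.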
13483 
13484 struct rocksdb_status_counters_t {
13485   uint64_t block_cache_miss;
13486   uint64_t block_cache_hit;
13487   uint64_t block_cache_add;
13488   uint64_t block_cache_add_failures;
13489   uint64_t block_cache_index_miss;
13490   uint64_t block_cache_index_hit;
13491   uint64_t block_cache_index_add;
13492   uint64_t block_cache_index_bytes_insert;
13493   uint64_t block_cache_index_bytes_evict;
13494   uint64_t block_cache_filter_miss;
13495   uint64_t block_cache_filter_hit;
13496   uint64_t block_cache_filter_add;
13497   uint64_t block_cache_filter_bytes_insert;
13498   uint64_t block_cache_filter_bytes_evict;
13499   uint64_t block_cache_bytes_read;
13500   uint64_t block_cache_bytes_write;
13501   uint64_t block_cache_data_bytes_insert;
13502   uint64_t block_cache_data_miss;
13503   uint64_t block_cache_data_hit;
13504   uint64_t block_cache_data_add;
13505   uint64_t bloom_filter_useful;
13506   uint64_t bloom_filter_full_positive;
13507   uint64_t bloom_filter_full_true_positive;
13508   uint64_t memtable_hit;
13509   uint64_t memtable_miss;
13510   uint64_t get_hit_l0;
13511   uint64_t get_hit_l1;
13512   uint64_t get_hit_l2_and_up;
13513   uint64_t compaction_key_drop_new;
13514   uint64_t compaction_key_drop_obsolete;
13515   uint64_t compaction_key_drop_user;
13516   uint64_t number_keys_written;
13517   uint64_t number_keys_read;
13518   uint64_t number_keys_updated;
13519   uint64_t bytes_written;
13520   uint64_t bytes_read;
13521   uint64_t number_db_seek;
13522   uint64_t number_db_seek_found;
13523   uint64_t number_db_next;
13524   uint64_t number_db_next_found;
13525   uint64_t number_db_prev;
13526   uint64_t number_db_prev_found;
13527   uint64_t iter_bytes_read;
13528   uint64_t no_file_closes;
13529   uint64_t no_file_opens;
13530   uint64_t no_file_errors;
13531   uint64_t stall_micros;
13532   uint64_t num_iterators;
13533   uint64_t number_multiget_get;
13534   uint64_t number_multiget_keys_read;
13535   uint64_t number_multiget_bytes_read;
13536   uint64_t number_deletes_filtered;
13537   uint64_t number_merge_failures;
13538   uint64_t bloom_filter_prefix_checked;
13539   uint64_t bloom_filter_prefix_useful;
13540   uint64_t number_reseeks_iteration;
13541   uint64_t get_updates_since_calls;
13542   uint64_t block_cache_compressed_miss;
13543   uint64_t block_cache_compressed_hit;
13544   uint64_t wal_synced;
13545   uint64_t wal_bytes;
13546   uint64_t write_self;
13547   uint64_t write_other;
13548   uint64_t write_timedout;
13549   uint64_t write_wal;
13550   uint64_t flush_write_bytes;
13551   uint64_t compact_read_bytes;
13552   uint64_t compact_write_bytes;
13553   uint64_t number_superversion_acquires;
13554   uint64_t number_superversion_releases;
13555   uint64_t number_superversion_cleanups;
13556   uint64_t number_block_not_compressed;
13557 };
13558 
13559 static rocksdb_status_counters_t rocksdb_status_counters;
13560 
13561 DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
13562 DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
13563 DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
13564 DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
13565 DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
13566 DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
13567 DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
13568 DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
13569 DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
13570 DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
13571 DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
13572 DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
13573 DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
13574 DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
13575 DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
13576 DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
13577 DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
13578 DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
13579 DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
13580 DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
13581 DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
13582 DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE)
13583 DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE)
13584 DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
13585 DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
13586 DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
13587 DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
13588 DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
13589 DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
13590 DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
13591 DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
13592 DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
13593 DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
13594 DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
13595 DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
13596 DEF_SHOW_FUNC(bytes_read, BYTES_READ)
13597 DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
13598 DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
13599 DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
13600 DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
13601 DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
13602 DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
13603 DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
13604 DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
13605 DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
13606 DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
13607 DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
13608 DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
13609 DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
13610 DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
13611 DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
13612 DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
13613 DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
13614 DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
13615 DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
13616 DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
13617 DEF_SHOW_FUNC(get_updates_since_calls, GET_UPDATES_SINCE_CALLS)
13618 DEF_SHOW_FUNC(block_cache_compressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
13619 DEF_SHOW_FUNC(block_cache_compressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
13620 DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
13621 DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
13622 DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
13623 DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
13624 DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
13625 DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
13626 DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
13627 DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
13628 DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
13629 DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
13630 DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
13631 DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
13632 DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
13633 
13634 static void myrocks_update_status() {
13635   export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
13636   export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
13637   export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
13638   export_stats.rows_read = global_stats.rows[ROWS_READ];
13639   export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
13640   export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
13641   export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];
13642 
13643   export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
13644   export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
13645   export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
13646   export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
13647 
13648   export_stats.queries_point = global_stats.queries[QUERIES_POINT];
13649   export_stats.queries_range = global_stats.queries[QUERIES_RANGE];
13650 
13651   export_stats.table_index_stats_success =
13652       global_stats.table_index_stats_result[TABLE_INDEX_STATS_SUCCESS];
13653   export_stats.table_index_stats_failure =
13654       global_stats.table_index_stats_result[TABLE_INDEX_STATS_FAILURE];
13655   export_stats.table_index_stats_req_queue_length =
13656       rdb_is_thread.get_request_queue_size();
13657 
13658   export_stats.covered_secondary_key_lookups =
13659       global_stats.covered_secondary_key_lookups;
13660 }
13661 
13662 static void myrocks_update_memory_status() {
13663   std::vector<rocksdb::DB *> dbs;
13664   std::unordered_set<const rocksdb::Cache *> cache_set;
13665   dbs.push_back(rdb);
13666   std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
13667   rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
13668                                                        &temp_usage_by_type);
13669   memory_stats.memtable_total =
13670       temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal];
13671   memory_stats.memtable_unflushed =
13672       temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed];
13673 }
13674 
13675 static SHOW_VAR myrocks_status_variables[] = {
13676     DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
13677                         SHOW_LONGLONG),
13678     DEF_STATUS_VAR_FUNC("rows_deleted_blind", &export_stats.rows_deleted_blind,
13679                         SHOW_LONGLONG),
13680     DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
13681                         SHOW_LONGLONG),
13682     DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
13683     DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
13684                         SHOW_LONGLONG),
13685     DEF_STATUS_VAR_FUNC("rows_expired", &export_stats.rows_expired,
13686                         SHOW_LONGLONG),
13687     DEF_STATUS_VAR_FUNC("rows_filtered", &export_stats.rows_filtered,
13688                         SHOW_LONGLONG),
13689     DEF_STATUS_VAR_FUNC("system_rows_deleted",
13690                         &export_stats.system_rows_deleted, SHOW_LONGLONG),
13691     DEF_STATUS_VAR_FUNC("system_rows_inserted",
13692                         &export_stats.system_rows_inserted, SHOW_LONGLONG),
13693     DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
13694                         SHOW_LONGLONG),
13695     DEF_STATUS_VAR_FUNC("system_rows_updated",
13696                         &export_stats.system_rows_updated, SHOW_LONGLONG),
13697     DEF_STATUS_VAR_FUNC("memtable_total", &memory_stats.memtable_total,
13698                         SHOW_LONGLONG),
13699     DEF_STATUS_VAR_FUNC("memtable_unflushed", &memory_stats.memtable_unflushed,
13700                         SHOW_LONGLONG),
13701     DEF_STATUS_VAR_FUNC("queries_point", &export_stats.queries_point,
13702                         SHOW_LONGLONG),
13703     DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range,
13704                         SHOW_LONGLONG),
13705     DEF_STATUS_VAR_FUNC("table_index_stats_success",
13706                         &export_stats.table_index_stats_success, SHOW_LONGLONG),
13707     DEF_STATUS_VAR_FUNC("table_index_stats_failure",
13708                         &export_stats.table_index_stats_failure, SHOW_LONGLONG),
13709     DEF_STATUS_VAR_FUNC("table_index_stats_req_queue_length",
13710                         &export_stats.table_index_stats_req_queue_length,
13711                         SHOW_LONGLONG),
13712     DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups",
13713                         &export_stats.covered_secondary_key_lookups,
13714                         SHOW_LONGLONG),
13715 
13716     {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
13717 
13718 static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
13719   myrocks_update_status();
13720   myrocks_update_memory_status();
13721   var->type = SHOW_ARRAY;
13722   var->value = reinterpret_cast<char *>(&myrocks_status_variables);
13723 }
13724 
13725 static ulonglong io_stall_prop_value(
13726     const std::map<std::string, std::string> &props, const std::string &key) {
13727   std::map<std::string, std::string>::const_iterator iter =
13728       props.find("io_stalls." + key);
13729   if (iter != props.end()) {
13730     return std::stoull(iter->second);
13731   } else {
13732     DBUG_PRINT("warning",
13733                ("RocksDB GetMapProperty hasn't returned key=%s", key.c_str()));
13734     assert(0);
13735     return 0;
13736   }
13737 }
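// Example (illustrative values): GetMapProperty(cfh, "rocksdb.cfstats",
// &props) fills props with entries such as
// props["io_stalls.level0_slowdown"] = "42", and
// io_stall_prop_value(props, "level0_slowdown") then returns 42.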
13738 
13739 static void update_rocksdb_stall_status() {
13740   st_io_stall_stats local_io_stall_stats;
13741   for (const auto &cf_name : cf_manager.get_cf_names()) {
13742     std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
13743         cf_manager.get_cf(cf_name);
13744     if (!cfh) {
13745       continue;
13746     }
13747 
    // Retrieve information from a valid CF handle object. It is safe
    // even if the CF is removed from cf_manager at this point.
13750     std::map<std::string, std::string> props;
13751     if (!rdb->GetMapProperty(cfh.get(), "rocksdb.cfstats", &props)) {
13752       continue;
13753     }
13754 
13755     local_io_stall_stats.level0_slowdown +=
13756         io_stall_prop_value(props, "level0_slowdown");
13757     local_io_stall_stats.level0_slowdown_with_compaction +=
13758         io_stall_prop_value(props, "level0_slowdown_with_compaction");
13759     local_io_stall_stats.level0_numfiles +=
13760         io_stall_prop_value(props, "level0_numfiles");
13761     local_io_stall_stats.level0_numfiles_with_compaction +=
13762         io_stall_prop_value(props, "level0_numfiles_with_compaction");
13763     local_io_stall_stats.stop_for_pending_compaction_bytes +=
13764         io_stall_prop_value(props, "stop_for_pending_compaction_bytes");
13765     local_io_stall_stats.slowdown_for_pending_compaction_bytes +=
13766         io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes");
13767     local_io_stall_stats.memtable_compaction +=
13768         io_stall_prop_value(props, "memtable_compaction");
13769     local_io_stall_stats.memtable_slowdown +=
13770         io_stall_prop_value(props, "memtable_slowdown");
13771     local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop");
13772     local_io_stall_stats.total_slowdown +=
13773         io_stall_prop_value(props, "total_slowdown");
13774   }
13775   io_stall_stats = local_io_stall_stats;
13776 }
13777 
13778 static SHOW_VAR rocksdb_stall_status_variables[] = {
13779     DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns",
13780                         &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
13781     DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns",
13782                         &io_stall_stats.level0_slowdown_with_compaction,
13783                         SHOW_LONGLONG),
13784     DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops",
13785                         &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
13786     DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops",
13787                         &io_stall_stats.level0_numfiles_with_compaction,
13788                         SHOW_LONGLONG),
13789     DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops",
13790                         &io_stall_stats.stop_for_pending_compaction_bytes,
13791                         SHOW_LONGLONG),
13792     DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns",
13793                         &io_stall_stats.slowdown_for_pending_compaction_bytes,
13794                         SHOW_LONGLONG),
13795     DEF_STATUS_VAR_FUNC("memtable_limit_stops",
13796                         &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
13797     DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns",
13798                         &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
13799     DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop,
13800                         SHOW_LONGLONG),
13801     DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown,
13802                         SHOW_LONGLONG),
13803     // end of the array marker
13804     {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
13805 
13806 static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) {
13807   update_rocksdb_stall_status();
13808   var->type = SHOW_ARRAY;
13809   var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables);
13810 }
13811 
13812 static SHOW_VAR rocksdb_status_vars[] = {
13813     DEF_STATUS_VAR(block_cache_miss),
13814     DEF_STATUS_VAR(block_cache_hit),
13815     DEF_STATUS_VAR(block_cache_add),
13816     DEF_STATUS_VAR(block_cache_add_failures),
13817     DEF_STATUS_VAR(block_cache_index_miss),
13818     DEF_STATUS_VAR(block_cache_index_hit),
13819     DEF_STATUS_VAR(block_cache_index_add),
13820     DEF_STATUS_VAR(block_cache_index_bytes_insert),
13821     DEF_STATUS_VAR(block_cache_index_bytes_evict),
13822     DEF_STATUS_VAR(block_cache_filter_miss),
13823     DEF_STATUS_VAR(block_cache_filter_hit),
13824     DEF_STATUS_VAR(block_cache_filter_add),
13825     DEF_STATUS_VAR(block_cache_filter_bytes_insert),
13826     DEF_STATUS_VAR(block_cache_filter_bytes_evict),
13827     DEF_STATUS_VAR(block_cache_bytes_read),
13828     DEF_STATUS_VAR(block_cache_bytes_write),
13829     DEF_STATUS_VAR(block_cache_data_bytes_insert),
13830     DEF_STATUS_VAR(block_cache_data_miss),
13831     DEF_STATUS_VAR(block_cache_data_hit),
13832     DEF_STATUS_VAR(block_cache_data_add),
13833     DEF_STATUS_VAR(bloom_filter_useful),
13834     DEF_STATUS_VAR(bloom_filter_full_positive),
13835     DEF_STATUS_VAR(bloom_filter_full_true_positive),
13836     DEF_STATUS_VAR(memtable_hit),
13837     DEF_STATUS_VAR(memtable_miss),
13838     DEF_STATUS_VAR(get_hit_l0),
13839     DEF_STATUS_VAR(get_hit_l1),
13840     DEF_STATUS_VAR(get_hit_l2_and_up),
13841     DEF_STATUS_VAR(compaction_key_drop_new),
13842     DEF_STATUS_VAR(compaction_key_drop_obsolete),
13843     DEF_STATUS_VAR(compaction_key_drop_user),
13844     DEF_STATUS_VAR(number_keys_written),
13845     DEF_STATUS_VAR(number_keys_read),
13846     DEF_STATUS_VAR(number_keys_updated),
13847     DEF_STATUS_VAR(bytes_written),
13848     DEF_STATUS_VAR(bytes_read),
13849     DEF_STATUS_VAR(number_db_seek),
13850     DEF_STATUS_VAR(number_db_seek_found),
13851     DEF_STATUS_VAR(number_db_next),
13852     DEF_STATUS_VAR(number_db_next_found),
13853     DEF_STATUS_VAR(number_db_prev),
13854     DEF_STATUS_VAR(number_db_prev_found),
13855     DEF_STATUS_VAR(iter_bytes_read),
13856     DEF_STATUS_VAR(no_file_closes),
13857     DEF_STATUS_VAR(no_file_opens),
13858     DEF_STATUS_VAR(no_file_errors),
13859     DEF_STATUS_VAR(stall_micros),
13860     DEF_STATUS_VAR(num_iterators),
13861     DEF_STATUS_VAR(number_multiget_get),
13862     DEF_STATUS_VAR(number_multiget_keys_read),
13863     DEF_STATUS_VAR(number_multiget_bytes_read),
13864     DEF_STATUS_VAR(number_deletes_filtered),
13865     DEF_STATUS_VAR(number_merge_failures),
13866     DEF_STATUS_VAR(bloom_filter_prefix_checked),
13867     DEF_STATUS_VAR(bloom_filter_prefix_useful),
13868     DEF_STATUS_VAR(number_reseeks_iteration),
13869     DEF_STATUS_VAR(get_updates_since_calls),
13870     DEF_STATUS_VAR(block_cache_compressed_miss),
13871     DEF_STATUS_VAR(block_cache_compressed_hit),
13872     DEF_STATUS_VAR(wal_synced),
13873     DEF_STATUS_VAR(wal_bytes),
13874     DEF_STATUS_VAR(write_self),
13875     DEF_STATUS_VAR(write_other),
13876     DEF_STATUS_VAR(write_timedout),
13877     DEF_STATUS_VAR(write_wal),
13878     DEF_STATUS_VAR(flush_write_bytes),
13879     DEF_STATUS_VAR(compact_read_bytes),
13880     DEF_STATUS_VAR(compact_write_bytes),
13881     DEF_STATUS_VAR(number_superversion_acquires),
13882     DEF_STATUS_VAR(number_superversion_releases),
13883     DEF_STATUS_VAR(number_superversion_cleanups),
13884     DEF_STATUS_VAR(number_block_not_compressed),
13885     DEF_STATUS_VAR_PTR("row_lock_deadlocks", &rocksdb_row_lock_deadlocks,
13886                        SHOW_LONGLONG),
13887     DEF_STATUS_VAR_PTR("row_lock_wait_timeouts",
13888                        &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
13889     DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
13890                        &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
13891     DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
13892                        SHOW_LONGLONG),
13893     DEF_STATUS_VAR_PTR("manual_compactions_processed",
13894                        &rocksdb_manual_compactions_processed, SHOW_LONGLONG),
13895     DEF_STATUS_VAR_PTR("manual_compactions_running",
13896                        &rocksdb_manual_compactions_running, SHOW_LONGLONG),
13897     DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
13898                        SHOW_LONGLONG),
13899     DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
13900                        SHOW_LONGLONG),
13901     DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
13902                        &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
13903     DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
13904                        SHOW_LONGLONG),
13905     DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
13906                        SHOW_LONGLONG),
13907     DEF_STATUS_VAR_PTR("additional_compaction_triggers",
13908                        &rocksdb_additional_compaction_triggers, SHOW_LONGLONG),
13909 #ifndef NDEBUG
13910     DEF_STATUS_VAR_PTR("num_get_for_update_calls",
13911                        &rocksdb_num_get_for_update_calls, SHOW_LONGLONG),
13912 #endif
    // The variables generated by SHOW_FUNC are sorted only by prefix (the
    // first arg in the tuple below), so make sure it is unique to keep the
    // sorting deterministic, as quicksort is not stable.
13916     {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC,
13917      SHOW_SCOPE_GLOBAL},
13918     {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
13919      SHOW_FUNC, SHOW_SCOPE_GLOBAL},
13920     {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
13921 
13922 /*
13923   Background thread's main logic
13924 */
13925 
13926 void Rdb_background_thread::run() {
  // How many seconds to wait before flushing the WAL again.
13928   const int WAKE_UP_INTERVAL = 1;
13929 
13930   timespec ts_next_sync;
13931   clock_gettime(CLOCK_REALTIME, &ts_next_sync);
13932   ts_next_sync.tv_sec += WAKE_UP_INTERVAL;
13933 
13934   for (;;) {
13935     // Wait until the next timeout or until we receive a signal to stop the
13936     // thread. Request to stop the thread should only be triggered when the
13937     // storage engine is being unloaded.
13938     RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
13939     const auto ret MY_ATTRIBUTE((__unused__)) =
13940         mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);
13941 
13942     // Check that we receive only the expected error codes.
13943     assert(ret == 0 || ret == ETIMEDOUT);
13944     const THD::killed_state local_killed = m_killed;
13945     const bool local_save_stats = m_save_stats;
13946     reset();
13947     RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
13948 
13949     if (local_killed) {
      // If we're here, the condition variable was signaled by another thread
      // and we're shutting down. Break out of the loop so that the shutdown
      // thread can proceed.
13953       break;
13954     }
13955 
13956     // This path should be taken only when the timer expired.
13957     assert(ret == ETIMEDOUT);
13958 
13959     if (local_save_stats) {
13960       ddl_manager.persist_stats();
13961     }
13962 
13963     timespec ts;
13964     clock_gettime(CLOCK_REALTIME, &ts);
13965 
    // Flush the WAL. Sync it for both the background and never modes to match
    // InnoDB's behavior. For mode never, the WAL file isn't even written,
    // whereas mode background writes to the WAL file but issues the syncs
    // from a background thread.
13970     if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
13971         !rocksdb_db_options->allow_mmap_writes) {
13972       const rocksdb::Status s = rdb->FlushWAL(true);
13973       if (!s.ok()) {
13974         rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
13975       }
13976     }
13977 
    // Recalculate statistics for indexes only if
    // rocksdb_table_stats_use_table_scan is disabled.
    // Otherwise, Rdb_index_stats_thread will do the work.
13981     if (!rocksdb_table_stats_use_table_scan && rocksdb_stats_recalc_rate) {
13982       std::vector<std::string> to_recalc;
13983       if (rdb_tables_to_recalc.empty()) {
13984         struct Rdb_index_collector : public Rdb_tables_scanner {
13985           int add_table(Rdb_tbl_def *tdef) override {
13986             rdb_tables_to_recalc.push_back(tdef->full_tablename());
13987             return HA_EXIT_SUCCESS;
13988           }
13989         } collector;
13990         ddl_manager.scan_for_tables(&collector);
13991       }
13992 
13993       while (to_recalc.size() < rocksdb_stats_recalc_rate &&
13994              !rdb_tables_to_recalc.empty()) {
13995         to_recalc.push_back(rdb_tables_to_recalc.back());
13996         rdb_tables_to_recalc.pop_back();
13997       }
13998 
13999       for (const auto &tbl_name : to_recalc) {
14000         calculate_stats_for_table(tbl_name, SCAN_TYPE_NONE);
14001       }
14002     }
14003 
14004     // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
14005     // pthread_cond_timedwait()) to wait on.
14006     ts_next_sync.tv_sec = ts.tv_sec + WAKE_UP_INTERVAL;
14007   }
14008 
  // Save any remaining stats which might have been left unsaved.
14010   ddl_manager.persist_stats();
14011 }
14012 
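/*
  Index stats calculation thread's main logic. It sleeps for WAKE_UP_INTERVAL
  seconds (or up to 24 hours when table-scan based stats are disabled), then
  drains the recalculation request queue, running a full table scan per table
  while skipping tables whose stats were refreshed less than
  RDB_MIN_RECALC_INTERVAL seconds ago.
*/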
14013 void Rdb_index_stats_thread::run() {
14014   const int WAKE_UP_INTERVAL = 1;
14015 #ifdef TARGET_OS_LINUX
14016   RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14017   m_tid_set = true;
14018   m_tid = syscall(SYS_gettid);
14019   RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14020 #endif
14021 
14022   renice(rocksdb_table_stats_background_thread_nice_value);
14023   for (;;) {
14024     RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14025     if (m_killed) {
14026       RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14027       break;
14028     }
14029 
14030     timespec ts;
14031     clock_gettime(CLOCK_REALTIME, &ts);
14032 
14033     // Wait for 24 hours if the table scan based index calculation
14034     // is off. When the switch is turned on and any request is added
14035     // to the recalc queue, this thread will be signaled.
14036     ts.tv_sec +=
14037         (rocksdb_table_stats_use_table_scan) ? WAKE_UP_INTERVAL : 24 * 60 * 60;
14038 
14039     const auto ret MY_ATTRIBUTE((__unused__)) =
14040         mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
14041 
14042     if (m_killed) {
14043       RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14044       break;
14045     }
14046 
    // Make sure no program error is returned
14048     assert(ret == 0 || ret == ETIMEDOUT);
14049     RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14050 
14051     for (;;) {
14052       if (!rocksdb_table_stats_use_table_scan) {
14053         // Clear the recalc queue
14054         clear_all_index_stats_requests();
14055         break;
14056       }
14057 
14058       std::string tbl_name;
14059       if (!get_index_stats_request(&tbl_name)) {
14060         // No request in the recalc queue
14061         break;
14062       }
14063 
14064       Rdb_table_stats tbl_stats;
14065       if (ddl_manager.find_table_stats(tbl_name, &tbl_stats) !=
14066           HA_EXIT_SUCCESS) {
14067         // The table has been dropped. Skip this table.
14068         continue;
14069       }
14070 
14071       clock_gettime(CLOCK_REALTIME, &ts);
14072       if (difftime(ts.tv_sec, tbl_stats.m_last_recalc) <
14073           RDB_MIN_RECALC_INTERVAL) {
        /* Stats were (re)calculated not long ago. To avoid overly
        frequent stats updates, we put the table back on the recalc
        queue and do nothing. */
14077 
14078         add_index_stats_request(tbl_name);
14079         break;
14080       }
14081 
14082       DBUG_EXECUTE_IF("rocksdb_is_bg_thread", {
14083         if (tbl_name == "test.t") {
14084           THD *thd = new THD();
14085           thd->thread_stack = reinterpret_cast<char *>(&thd);
14086           thd->store_globals();
14087 
14088           static constexpr char act[] = "now wait_for ready_to_calculate_index_stats";
14089           assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
14090 
14091           thd->restore_globals();
14092           delete thd;
14093         }
14094       });
14095 
14096       int err =
14097           calculate_stats_for_table(tbl_name, SCAN_TYPE_FULL_TABLE, &m_killed);
14098 
14099       if (err != HA_EXIT_SUCCESS) {
14100         global_stats.table_index_stats_result[TABLE_INDEX_STATS_FAILURE].inc();
14101         break;
14102       }
14103 
14104       global_stats.table_index_stats_result[TABLE_INDEX_STATS_SUCCESS].inc();
14105 
14106       DBUG_EXECUTE_IF("rocksdb_is_bg_thread", {
14107         if (tbl_name == "test.t") {
14108           THD *thd = new THD();
14109           thd->thread_stack = reinterpret_cast<char *>(&thd);
14110           thd->store_globals();
14111 
14112           static constexpr char act[] = "now signal index_stats_calculation_done";
14113           assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
14114 
14115           thd->restore_globals();
14116           delete thd;
14117         }
14118       });
14119     }
14120   }
14121 
14122   RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14123   m_tid_set = false;
14124   m_tid = 0;
14125   RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14126 }
14127 
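/*
  Pop the oldest pending index stats recalculation request. Returns false if
  the queue is empty; otherwise stores the table name in *tbl_name and removes
  it from both the request queue and the deduplication set.
*/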
14128 bool Rdb_index_stats_thread::get_index_stats_request(std::string *tbl_name) {
14129   RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14130   if (m_requests.empty()) {
14131     RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14132     return false;
14133   }
14134 
14135   *tbl_name = m_requests[0];
14136   m_requests.pop_front();
14137 
14138   auto count = m_tbl_names.erase(*tbl_name);
14139   if (count != 1) {
14140     assert(0);
14141   }
14142 
14143   RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14144   return true;
14145 }
14146 
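/*
  Queue a table for index stats recalculation. A table that is already queued
  is not added again; otherwise the index stats thread is signaled.
*/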
14147 void Rdb_index_stats_thread::add_index_stats_request(
14148     const std::string &tbl_name) {
14149   RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14150 
14151   /* Quit if already in the queue */
14152   auto ret = m_tbl_names.insert(tbl_name);
14153   if (!ret.second) {
14154     RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14155     return;
14156   }
14157 
14158   m_requests.push_back(*ret.first);
14159   RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14160   signal();
14161 }
14162 
14163 void Rdb_index_stats_thread::clear_all_index_stats_requests() {
14164   RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14165   m_requests.clear();
14166   m_tbl_names.clear();
14167   RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14168 }
14169 
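/*
  Change the nice value of the index stats thread. This only takes effect on
  Linux once the thread has recorded its tid; otherwise, or if setpriority()
  fails, HA_EXIT_FAILURE is returned.
*/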
14170 int Rdb_index_stats_thread::renice(int nice_val) {
14171   RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14172   if (!m_tid_set) {
14173     RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14174     return HA_EXIT_FAILURE;
14175   }
14176 
14177 #ifdef TARGET_OS_LINUX
14178   int ret = setpriority(PRIO_PROCESS, m_tid, nice_val);
14179   if (ret != 0) {
14180     // NO_LINT_DEBUG
14181     sql_print_error("Set index stats thread priority failed due to %s",
14182                     strerror(errno));
14183     RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14184     return HA_EXIT_FAILURE;
14185   }
14186 #endif
14187 
14188   RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14189   return HA_EXIT_SUCCESS;
14190 }
14191 
14192 size_t Rdb_index_stats_thread::get_request_queue_size() {
14193   size_t len = 0;
14194   RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14195   len = m_requests.size();
14196   RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14197 
14198   return len;
14199 }
14200 
14201 /*
  A background thread that handles manual compactions,
  except for those triggered by dropping indexes/tables. Every second it
  checks for pending manual compactions and calls CompactRange if any exist.
14205 */
14206 void Rdb_manual_compaction_thread::run() {
14207   RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14208   for (;;) {
14209     if (m_killed) {
14210       break;
14211     }
14212     timespec ts;
14213     clock_gettime(CLOCK_REALTIME, &ts);
14214     ts.tv_sec += 1;
14215 
14216     const auto ret MY_ATTRIBUTE((__unused__)) =
14217         mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
14218     if (m_killed) {
14219       break;
14220     }
    // Make sure no program error is returned
14222     assert(ret == 0 || ret == ETIMEDOUT);
14223     RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14224 
14225     RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14226     // Grab the first item and proceed, if not empty.
14227     if (m_requests.empty()) {
14228       RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14229       RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14230       continue;
14231     }
14232     Manual_compaction_request &mcr = m_requests.begin()->second;
14233     assert(mcr.cf);
14234     assert(mcr.state == Manual_compaction_request::INITED);
14235     mcr.state = Manual_compaction_request::RUNNING;
14236     RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14237 
14238     assert(mcr.state == Manual_compaction_request::RUNNING);
14239     // NO_LINT_DEBUG
14240     sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id,
14241                           mcr.cf->GetName().c_str());
14242     rocksdb_manual_compactions_running++;
14243     if (rocksdb_debug_manual_compaction_delay > 0) {
14244       // In Facebook MySQL 5.6.35, my_sleep breaks the sleep when the server
14245       // gets a shutdown signal and this code depended on that behavior.
14246       // In 5.7, for whatever reason, this is not the case.  my_sleep will
14247       // continue to sleep until the sleep time has elapsed.  For the purpose
14248       // of this variable and the accompanying test case, we need to break this
14249       // down into a loop that sleeps and checks to see if the thread was
      // signalled with the stop flag. It is ugly, but without DEBUG_SYNC being
      // available in background threads, it is good enough for the test.
14252       for (uint32_t sleeps = 0; sleeps < rocksdb_debug_manual_compaction_delay;
14253            sleeps++) {
14254         RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14255         const bool local_stop = m_killed;
14256         RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14257         if (local_stop) break;
14258         my_sleep(1000000);
14259       }
14260     }
14261 
14262     DBUG_EXECUTE_IF("rocksdb_manual_compaction", {
14263       THD *thd = new THD();
14264       thd->thread_stack = reinterpret_cast<char *>(&(thd));
14265       thd->store_globals();
14266       static constexpr char act[] =
14267           "now signal ready_to_mark_cf_dropped_in_manual_compaction wait_for "
14268           "mark_cf_dropped_done_in_manual_compaction";
14269       assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
14270       thd->restore_globals();
14271       delete thd;
14272     });
14273 
14274     // CompactRange may take a very long time. On clean shutdown,
    // it is cancelled by CancelAllBackgroundWork, and the status is then
    // set to ShutdownInProgress.
14277     const rocksdb::Status s =
14278         rdb->CompactRange(getCompactRangeOptions(
14279                               mcr.concurrency, mcr.bottommost_level_compaction),
14280                           mcr.cf.get(), mcr.start, mcr.limit);
14281 
14282     rocksdb_manual_compactions_running--;
14283     if (s.ok()) {
14284       // NO_LINT_DEBUG
14285       sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id,
14286                             mcr.cf->GetName().c_str());
14287     } else {
14288       // NO_LINT_DEBUG
14289       sql_print_information("Manual Compaction id %d cf %s aborted. %s",
14290                             mcr.mc_id, mcr.cf->GetName().c_str(), s.getState());
14291       if (!cf_manager.get_cf(mcr.cf->GetID())) {
14292         // NO_LINT_DEBUG
14293         sql_print_information("cf %s has been dropped",
14294                               mcr.cf->GetName().c_str());
14295       } else if (!s.IsShutdownInProgress()) {
14296         rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
14297       } else {
14298         assert(m_requests.size() == 1);
14299       }
14300     }
14301     rocksdb_manual_compactions_processed++;
14302     clear_manual_compaction_request(mcr.mc_id, false);
14303     RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14304   }
14305   clear_all_manual_compaction_requests();
14306   assert(m_requests.empty());
14307   RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14308 }
14309 
14310 void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() {
14311   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14312   m_requests.clear();
14313   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14314 }
14315 
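/*
  Remove a manual compaction request by its id. When init_only is true the
  request is erased only if it is still in INITED state, i.e. it has not been
  picked up by the manual compaction thread yet.
*/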
14316 void Rdb_manual_compaction_thread::clear_manual_compaction_request(
14317     int mc_id, bool init_only) {
14318   bool erase = true;
14319   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14320   auto it = m_requests.find(mc_id);
14321   if (it != m_requests.end()) {
14322     if (init_only) {
14323       Manual_compaction_request mcr = it->second;
14324       if (mcr.state != Manual_compaction_request::INITED) {
14325         erase = false;
14326       }
14327     }
14328     if (erase) {
14329       m_requests.erase(it);
14330     }
14331   } else {
    // The current code path guarantees that erasing the same mc_id happens at
    // most once. A request in INITED state may be erased by the thread that
    // requested the compaction; one in RUNNING state is erased only by the
    // manual compaction thread.
14335     assert(0);
14336   }
14337   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14338 }
14339 
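/*
  Register a manual compaction request for the given column family and key
  range. Returns the assigned compaction id, or -1 if the number of pending
  requests has already reached rocksdb_max_manual_compactions.
*/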
14340 int Rdb_manual_compaction_thread::request_manual_compaction(
14341     std::shared_ptr<rocksdb::ColumnFamilyHandle> cf, rocksdb::Slice *start,
14342     rocksdb::Slice *limit, int concurrency,
14343     rocksdb::BottommostLevelCompaction bottommost_level_compaction) {
14344   int mc_id = -1;
14345   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14346   if (m_requests.size() >= rocksdb_max_manual_compactions) {
14347     RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14348     return mc_id;
14349   }
14350   Manual_compaction_request mcr;
14351   mc_id = mcr.mc_id = ++m_latest_mc_id;
14352   mcr.state = Manual_compaction_request::INITED;
14353   mcr.cf = cf;
14354   mcr.start = start;
14355   mcr.limit = limit;
14356   mcr.concurrency = concurrency;
14357   mcr.bottommost_level_compaction = bottommost_level_compaction;
14358   m_requests.insert(std::make_pair(mcr.mc_id, mcr));
14359   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14360   return mc_id;
14361 }
14362 
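/*
  A manual compaction is considered finished once its request has been removed
  from the request map, whether it completed, was aborted, or was cleared.
*/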
14363 bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) {
14364   bool finished = false;
14365   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14366   if (m_requests.count(mc_id) == 0) {
14367     finished = true;
14368   }
14369   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14370   return finished;
14371 }
14372 
14373 /**
14374  * Locking read + Not Found + Read Committed occurs if we accessed
14375  * a row by Seek, tried to lock it, failed, released and reacquired the
14376  * snapshot (because of READ COMMITTED mode) and the row was deleted by
14377  * someone else in the meantime.
 * If so, we either just skip the row, or re-create the snapshot and
 * seek again. In both cases, the Read Committed constraint is not broken.
14380  */
14381 bool ha_rocksdb::should_skip_invalidated_record(const int rc) const {
14382   if ((m_lock_rows != RDB_LOCK_NONE && rc == HA_ERR_KEY_NOT_FOUND &&
14383        my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED)) {
14384     return true;
14385   }
14386   return false;
14387 }
14388 /**
 * Indicates that the snapshot needs to be re-created and the seek retried,
 * instead of returning an error or an empty result set. This normally applies
 * when hitting kBusy while locking the first row of a transaction under the
 * Repeatable Read isolation level.
14393  */
14394 bool ha_rocksdb::should_recreate_snapshot(const int rc,
14395                                           const bool is_new_snapshot) const {
14396   if (should_skip_invalidated_record(rc) ||
14397       (rc == HA_ERR_ROCKSDB_STATUS_BUSY && is_new_snapshot)) {
14398     return true;
14399   }
14400   return false;
14401 }
14402 
14403 /**
 * When calling put/delete/singledelete without locking the row,
 * it is necessary to pass assume_tracked=false to the RocksDB TX API.
 * Read Free Replication and blind deletes are the cases where the TX API
 * is used while row locking is skipped.
14408  */
14409 bool ha_rocksdb::can_assume_tracked(THD *thd) {
14410   if (use_read_free_rpl() || (THDVAR(thd, blind_delete_primary_key))) {
14411     return false;
14412   }
14413   return true;
14414 }
14415 
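/*
  Decide whether the bloom filter can be used for this lookup. If it cannot,
  and iterator bounds are enabled for the session, set up explicit lower and
  upper iterator bounds from the equal condition instead.
*/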
14416 bool ha_rocksdb::check_bloom_and_set_bounds(
14417     THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
14418     const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
14419     uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
14420     rocksdb::Slice *upper_bound_slice) {
14421   bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
14422   if (!can_use_bloom && (THDVAR(thd, enable_iterate_bounds))) {
14423     setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound,
14424                           lower_bound_slice, upper_bound_slice);
14425   }
14426   return can_use_bloom;
14427 }
14428 
14429 /**
  Decides whether it is possible to use the bloom filter or not.

  @detail
   Even if a bloom filter exists, it is not always possible to use it.
   Using the bloom filter when it should not be used may cause false
   negatives -- fewer rows than expected may be returned.
   It is the user's responsibility to use the bloom filter correctly.

   If no bloom filter exists, the return value does not matter because
   RocksDB does not use a bloom filter internally.
14440 
14441   @param kd
14442   @param eq_cond      Equal condition part of the key. This always includes
14443                       system index id (4 bytes).
14444   @param use_all_keys True if all key parts are set with equal conditions.
14445                       This is aware of extended keys.
14446 */
14447 bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
14448                                       const rocksdb::Slice &eq_cond,
14449                                       const bool use_all_keys) {
14450   bool can_use = false;
14451 
14452   if (THDVAR(thd, skip_bloom_filter_on_read)) {
14453     return can_use;
14454   }
14455 
14456   const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
14457   if (prefix_extractor) {
14458     /*
      This is an optimized use case for CappedPrefixTransform.
      If the eq_cond length is >= the prefix extractor length and
      all key parts are used for an equality lookup, it is
      always possible to use the bloom filter.

      A prefix bloom filter can't be used on a descending scan with a
      prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of a
      RocksDB limitation. On an ascending (or unsorted) scan,
      keys longer than the capped prefix length are truncated down
      to the capped length and the resulting key is added to the bloom filter.

      Keys shorter than the capped prefix length are added to
      the bloom filter as-is. When keys are looked up, key conditions
      longer than the capped length can be used; key conditions that are
      shorter require all parts of the key to be available
      for the short key match.
14475     */
14476     if ((use_all_keys && prefix_extractor->InRange(eq_cond)) ||
14477         prefix_extractor->SameResultWhenAppended(eq_cond))
14478       can_use = true;
14479     else
14480       can_use = false;
14481   } else {
14482     /*
14483       if prefix extractor is not defined, all key parts have to be
14484       used by eq_cond.
14485     */
14486     if (use_all_keys) {
14487       can_use = true;
14488     } else {
14489       can_use = false;
14490     }
14491   }
14492 
14493   return can_use;
14494 }
14495 
14496 /* For modules that need access to the global data structures */
14497 rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }
14498 
14499 Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }
14500 
14501 const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
14502   return *rocksdb_tbl_options;
14503 }
14504 
14505 bool rdb_is_table_scan_index_stats_calculation_enabled() {
14506   return rocksdb_table_stats_use_table_scan;
14507 }
14508 bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
14509 bool rdb_is_ttl_read_filtering_enabled() {
14510   return rocksdb_enable_ttl_read_filtering;
14511 }
14512 #if !defined(NDEBUG)
14513 int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
14514 int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
14515 int rdb_dbug_set_ttl_read_filter_ts() {
14516   return rocksdb_debug_ttl_read_filter_ts;
14517 }
14518 bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
14519 #endif  // !defined(NDEBUG)
14520 
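// Accumulate row operation counters into the global statistics, tracked
// separately for system and user tables.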
14521 void rdb_update_global_stats(const operation_type &type, uint count,
14522                              bool is_system_table) {
14523   assert(type < ROWS_MAX);
14524 
14525   if (count == 0) {
14526     return;
14527   }
14528 
14529   if (is_system_table) {
14530     global_stats.system_rows[type].add(count);
14531   } else {
14532     global_stats.rows[type].add(count);
14533   }
14534 }
14535 
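// Look up the accumulated perf context counters for the given table name.
// Returns HA_ERR_ROCKSDB_INVALID_TABLE if no table handler can be found.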
14536 int rdb_get_table_perf_counters(const char *const tablename,
14537                                 Rdb_perf_counters *const counters) {
14538   assert(counters != nullptr);
14539   assert(tablename != nullptr);
14540 
14541   Rdb_table_handler *table_handler;
14542   table_handler = rdb_open_tables.get_table_handler(tablename);
14543   if (table_handler == nullptr) {
14544     return HA_ERR_ROCKSDB_INVALID_TABLE;
14545   }
14546 
14547   counters->load(table_handler->m_table_perf_context);
14548 
14549   rdb_open_tables.release_table_handler(table_handler);
14550   return HA_EXIT_SUCCESS;
14551 }
14552 
14553 const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) {
  // If this assertion fails, a member has been added to or removed from the
  // RDB_IO_ERROR_TYPE enum, and this function needs to be updated to return
  // the appropriate value.
14557   static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types.");
14558 
14559   switch (err_type) {
14560     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT:
14561       return "RDB_IO_ERROR_TX_COMMIT";
14562     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT:
14563       return "RDB_IO_ERROR_DICT_COMMIT";
14564     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD:
14565       return "RDB_IO_ERROR_BG_THREAD";
14566     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL:
14567       return "RDB_IO_ERROR_GENERAL";
14568     default:
14569       assert(false);
14570       return "(unknown)";
14571   }
14572 }
14573 
14574 // In case of core dump generation we want this function NOT to be optimized
14575 // so that we can capture as much data as possible to debug the root cause
14576 // more efficiently.
14577 #if defined(NDEBUG)
14578 #ifdef __clang__
14579 MY_ATTRIBUTE((optnone))
14580 #else
14581 MY_ATTRIBUTE((optimize("O0")))
14582 #endif
14583 #endif
14584 void rdb_handle_io_error(const rocksdb::Status status,
14585                          const RDB_IO_ERROR_TYPE err_type) {
14586   if (status.IsIOError()) {
14587     switch (err_type) {
14588       case RDB_IO_ERROR_TX_COMMIT:
14589       case RDB_IO_ERROR_DICT_COMMIT: {
14590         rdb_log_status_error(status, "failed to write to WAL");
14591         /* NO_LINT_DEBUG */
14592         sql_print_error("MyRocks: aborting on WAL write error.");
14593         abort();
14594         break;
14595       }
14596       case RDB_IO_ERROR_BG_THREAD: {
14597         rdb_log_status_error(status, "BG thread failed to write to RocksDB");
14598         /* NO_LINT_DEBUG */
14599         sql_print_error("MyRocks: aborting on BG write error.");
14600         abort();
14601         break;
14602       }
14603       case RDB_IO_ERROR_GENERAL: {
14604         rdb_log_status_error(status, "failed on I/O");
14605         /* NO_LINT_DEBUG */
14606         sql_print_error("MyRocks: aborting on I/O error.");
14607         abort();
14608         break;
14609       }
14610       default:
14611         assert(0);
14612         break;
14613     }
14614   } else if (status.IsCorruption()) {
14615     rdb_log_status_error(status, "data corruption detected!");
14616     rdb_persist_corruption_marker();
14617     /* NO_LINT_DEBUG */
14618     sql_print_error("MyRocks: aborting because of data corruption.");
14619     abort();
14620   } else if (!status.ok()) {
14621     switch (err_type) {
14622       case RDB_IO_ERROR_TX_COMMIT:
14623       case RDB_IO_ERROR_DICT_COMMIT: {
14624         rdb_log_status_error(status, "Failed to write to WAL (non kIOError)");
14625         /* NO_LINT_DEBUG */
14626         sql_print_error("MyRocks: aborting on WAL write error.");
14627         abort();
14628         break;
14629       }
14630       default:
14631         rdb_log_status_error(status, "Failed to read/write in RocksDB");
14632         break;
14633     }
14634   }
14635 }
14636 
14637 Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }
14638 
14639 Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }
14640 
14641 Rdb_hton_init_state *rdb_get_hton_init_state(void) { return &hton_init_state; }
14642 
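// Update handler for the compaction-sequential-deletes related sys_vars:
// pushes the current values down to the properties collector factory.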
14643 void rocksdb_set_compaction_options(
14644     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14645     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14646     void *const var_ptr, const void *const save) {
14647   if (var_ptr && save) {
14648     *(uint64_t *)var_ptr = *(const uint64_t *)save;
14649   }
14650   const Rdb_compact_params params = {
14651       (uint64_t)rocksdb_compaction_sequential_deletes,
14652       (uint64_t)rocksdb_compaction_sequential_deletes_window,
14653       (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
14654   if (properties_collector_factory) {
14655     properties_collector_factory->SetCompactionParams(params);
14656   }
14657 }
14658 
14659 void rocksdb_set_table_stats_sampling_pct(
14660     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14661     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14662     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14663   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14664 
14665   const uint32_t new_val = *static_cast<const uint32_t *>(save);
14666 
14667   if (new_val != rocksdb_table_stats_sampling_pct) {
14668     rocksdb_table_stats_sampling_pct = new_val;
14669 
14670     if (properties_collector_factory) {
14671       properties_collector_factory->SetTableStatsSamplingPct(
14672           rocksdb_table_stats_sampling_pct);
14673     }
14674   }
14675 
14676   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14677 }
14678 
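/*
  Update handler for rocksdb_table_stats_use_table_scan. When the feature is
  turned on, seed each table's cached stats from the row estimate of its first
  index; when it is turned off, drop all queued index stats recalculation
  requests.
*/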
14679 void rocksdb_update_table_stats_use_table_scan(
14680     THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
14681     void *const var_ptr, const void *const save) {
14682   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14683   bool old_val = *static_cast<const my_bool *>(var_ptr);
14684   bool new_val = *static_cast<const my_bool *>(save);
14685 
14686   if (old_val == new_val) {
14687     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14688     return;
14689   }
14690 
14691   if (new_val) {
14692     struct Rdb_table_collector : public Rdb_tables_scanner {
14693       int add_table(Rdb_tbl_def *tdef) override {
14694         assert(tdef->m_key_count > 0);
14695         tdef->m_tbl_stats.set(tdef->m_key_count > 0
14696                                   ? tdef->m_key_descr_arr[0]->m_stats.m_rows
14697                                   : 0,
14698                               0, 0);
14699         return HA_EXIT_SUCCESS;
14700       }
14701     } collector;
14702     ddl_manager.scan_for_tables(&collector);
14703 
14704     // We do not add all tables to the index stats recalculation queue
14705     // to avoid index stats calculation workload spike.
14706   } else {
14707     rdb_is_thread.clear_all_index_stats_requests();
14708   }
14709 
14710   *static_cast<my_bool *>(var_ptr) = *static_cast<const my_bool *>(save);
14711   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14712 }
14713 
14714 int rocksdb_index_stats_thread_renice(THD *const /* thd */,
14715                                       struct st_mysql_sys_var *const /* var */,
14716                                       void *const save,
14717                                       struct st_mysql_value *const value) {
14718   long long nice_val;
14719   /* value is NULL */
14720   if (value->val_int(value, &nice_val)) {
14721     return HA_EXIT_FAILURE;
14722   }
14723 
14724   if (rdb_is_thread.renice(nice_val) != HA_EXIT_SUCCESS) {
14725     return HA_EXIT_FAILURE;
14726   }
14727 
14728   *static_cast<int32_t *>(save) = static_cast<int32_t>(nice_val);
14729   return HA_EXIT_SUCCESS;
14730 }
14731 
14732 /*
  This function allows setting the rate limiter's bytes-per-second value,
  but only if the rate limiter is turned on, which has to be done at startup.
  If the rate is already 0 (turned off) or we are changing it to 0 (trying
  to turn it off), this function pushes a warning to the client and does
  nothing.
14738   This is similar to the code in innodb_doublewrite_update (found in
14739   storage/innobase/handler/ha_innodb.cc).
14740 */
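/*
  Illustrative usage (assuming a rate limiter was configured at startup):
    SET GLOBAL rocksdb_rate_limiter_bytes_per_sec = 104857600;  -- 100 MiB/s
*/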
14741 void rocksdb_set_rate_limiter_bytes_per_sec(
14742     my_core::THD *const thd,
14743     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14744     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14745   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14746   if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
14747     /*
      If a rate limiter was not enabled at startup we can't enable or change
      one now, nor can we disable a rate limiter that was created at startup.
14750     */
14751     push_warning_printf(thd, Sql_condition::SL_WARNING, ER_WRONG_ARGUMENTS,
14752                         "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
14753                         "be dynamically changed to or from 0.  Do a clean "
14754                         "shutdown if you want to change it from or to 0.");
14755   } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
14756     /* Apply the new value to the rate limiter and store it locally */
14757     assert(rocksdb_rate_limiter != nullptr);
14758     rocksdb_rate_limiter_bytes_per_sec = new_val;
14759     rocksdb_rate_limiter->SetBytesPerSecond(new_val);
14760   }
14761 }
14762 
14763 void rocksdb_set_sst_mgr_rate_bytes_per_sec(
14764     my_core::THD *const thd,
14765     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14766     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14767   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14768 
14769   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14770 
14771   if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
14772     rocksdb_sst_mgr_rate_bytes_per_sec = new_val;
14773 
14774     rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
14775         rocksdb_sst_mgr_rate_bytes_per_sec);
14776   }
14777 
14778   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14779 }
14780 
14781 void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
14782                                     void *var_ptr, const void *save) {
14783   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14784   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14785   if (rocksdb_delayed_write_rate != new_val) {
14786     rocksdb_delayed_write_rate = new_val;
14787     rocksdb::Status s =
14788         rdb->SetDBOptions({{"delayed_write_rate", std::to_string(new_val)}});
14789 
14790     if (!s.ok()) {
14791       /* NO_LINT_DEBUG */
14792       sql_print_warning(
14793           "MyRocks: failed to update delayed_write_rate. "
14794           "status code = %d, status = %s",
14795           s.code(), s.ToString().c_str());
14796     }
14797   }
14798   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14799 }
14800 
14801 void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
14802                                       void *var_ptr, const void *save) {
14803   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14804   const uint32_t new_val = *static_cast<const uint32_t *>(save);
14805   if (rocksdb_max_latest_deadlocks != new_val) {
14806     rocksdb_max_latest_deadlocks = new_val;
14807     rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
14808   }
14809   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14810 }
14811 
14812 void rdb_set_collation_exception_list(const char *const exception_list) {
14813   assert(rdb_collation_exceptions != nullptr);
14814 
14815   int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
14816   if (lower_case_table_names) flags |= MY_REG_ICASE;
14817   if (!rdb_collation_exceptions->compile(exception_list, flags,
14818                                          table_alias_charset)) {
14819     warn_about_bad_patterns(*rdb_collation_exceptions,
14820                             "strict_collation_exceptions");
14821   }
14822 }
14823 
14824 void rocksdb_set_collation_exception_list(THD *const thd,
14825                                           struct st_mysql_sys_var *const var,
14826                                           void *const var_ptr,
14827                                           const void *const save) {
14828   const char *const val = *static_cast<const char *const *>(save);
14829 
14830   rdb_set_collation_exception_list(val == nullptr ? "" : val);
14831 
14832   *static_cast<const char **>(var_ptr) = val;
14833 }
14834 
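// Convert a sys_var value given either as a string ("true"/"false"/"on"/"off")
// or as an integer (0/1) into a my_bool. Returns non-zero if the value cannot
// be interpreted as a boolean.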
14835 int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) {
14836   int new_value_type = value->value_type(value);
14837   if (new_value_type == MYSQL_VALUE_TYPE_STRING) {
14838     char buf[16];
14839     int len = sizeof(buf);
14840     const char *str = value->val_str(value, buf, &len);
14841     if (str && (my_strcasecmp(system_charset_info, "true", str) == 0 ||
14842                 my_strcasecmp(system_charset_info, "on", str) == 0)) {
14843       *return_value = true;
14844     } else if (str && (my_strcasecmp(system_charset_info, "false", str) == 0 ||
14845                        my_strcasecmp(system_charset_info, "off", str) == 0)) {
14846       *return_value = false;
14847     } else {
14848       return 1;
14849     }
14850   } else if (new_value_type == MYSQL_VALUE_TYPE_INT) {
14851     long long intbuf;
14852     value->val_int(value, &intbuf);
14853     if (intbuf > 1) return 1;
14854     *return_value = intbuf > 0;
14855   } else {
14856     return 1;
14857   }
14858 
14859   return 0;
14860 }
14861 
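/*
  Validation callback for the rocksdb_bulk_load session variable. Before the
  value changes, any SST files accumulated by the current bulk load are
  finalized; a critical error during finalization rejects the change and turns
  bulk load off. Typical (illustrative) usage from SQL:
    SET SESSION rocksdb_bulk_load = 1;
    ... bulk insert ...
    SET SESSION rocksdb_bulk_load = 0;
*/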
14862 int rocksdb_check_bulk_load(
14863     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
14864     void *save, struct st_mysql_value *value) {
14865   my_bool new_value;
14866   if (mysql_value_to_bool(value, &new_value) != 0) {
14867     return 1;
14868   }
14869 
14870   Rdb_transaction *&tx = get_tx_from_thd(thd);
14871   if (tx != nullptr) {
14872     bool is_critical_error;
14873     const int rc = tx->finish_bulk_load(&is_critical_error);
14874     if (rc != 0 && is_critical_error) {
14875       // NO_LINT_DEBUG
14876       sql_print_error(
14877           "RocksDB: Error %d finalizing last SST file while "
14878           "setting bulk loading variable",
14879           rc);
14880       THDVAR(thd, bulk_load) = 0;
14881       return 1;
14882     }
14883   }
14884 
14885   *static_cast<bool *>(save) = new_value;
14886   return 0;
14887 }
14888 
14889 int rocksdb_check_bulk_load_allow_unsorted(
14890     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
14891     void *save, struct st_mysql_value *value) {
14892   my_bool new_value;
14893   if (mysql_value_to_bool(value, &new_value) != 0) {
14894     return 1;
14895   }
14896 
14897   if (THDVAR(thd, bulk_load)) {
14898     sql_print_error(
14899         "RocksDB: Cannot change this setting while bulk load is "
14900         "enabled");
14901 
14902     return 1;
14903   }
14904 
14905   *static_cast<bool *>(save) = new_value;
14906   return 0;
14907 }
14908 
14909 static void rocksdb_set_max_background_jobs(THD *thd,
14910                                             struct st_mysql_sys_var *const var,
14911                                             void *const var_ptr,
14912                                             const void *const save) {
14913   assert(save != nullptr);
14914   assert(rocksdb_db_options != nullptr);
14915   assert(rocksdb_db_options->env != nullptr);
14916 
14917   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14918 
14919   const int new_val = *static_cast<const int *>(save);
14920 
14921   if (rocksdb_db_options->max_background_jobs != new_val) {
14922     rocksdb_db_options->max_background_jobs = new_val;
14923     rocksdb::Status s =
14924         rdb->SetDBOptions({{"max_background_jobs", std::to_string(new_val)}});
14925 
14926     if (!s.ok()) {
14927       /* NO_LINT_DEBUG */
14928       sql_print_warning(
14929           "MyRocks: failed to update max_background_jobs. "
14930           "Status code = %d, status = %s.",
14931           s.code(), s.ToString().c_str());
14932     }
14933   }
14934 
14935   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14936 }
14937 
14938 static void rocksdb_set_max_background_compactions(THD *thd,
14939                                                    struct st_mysql_sys_var *const var,
14940                                                    void *const var_ptr,
14941                                                    const void *const save) {
14942   assert(save != nullptr);
14943   assert(rocksdb_db_options != nullptr);
14944   assert(rocksdb_db_options->env != nullptr);
14945 
14946   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14947 
14948   const int new_val = *static_cast<const int *>(save);
14949 
14950   if (rocksdb_db_options->max_background_compactions != new_val) {
14951     rocksdb_db_options->max_background_compactions = new_val;
14952     rocksdb::Status s =
14953         rdb->SetDBOptions({{"max_background_compactions", std::to_string(new_val)}});
14954 
14955     if (!s.ok()) {
14956       /* NO_LINT_DEBUG */
14957       sql_print_warning(
14958           "MyRocks: failed to update max_background_compactions. "
14959           "Status code = %d, status = %s.",
14960           s.code(), s.ToString().c_str());
14961     }
14962   }
14963 
14964   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14965 }
14966 
14967 /**
   rocksdb_set_max_bottom_pri_background_compactions_internal() changes
   the number of RocksDB background threads.
   Creating new threads may take up to a few seconds, so instead of calling
   the function in the sys_var::update path, where a global mutex is held,
   it is called in the sys_var::check path so that other queries are not
   blocked. The same optimization is done for rocksdb_block_cache_size.
14974 */
14975 static int rocksdb_validate_max_bottom_pri_background_compactions(
14976     THD *thd MY_ATTRIBUTE((__unused__)),
14977     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14978     void *var_ptr, struct st_mysql_value *value) {
14979   assert(value != nullptr);
14980 
14981   long long new_value;
14982 
14983   /* value is NULL */
14984   if (value->val_int(value, &new_value)) {
14985     return HA_EXIT_FAILURE;
14986   }
14987   if (new_value < 0 ||
14988       new_value > ROCKSDB_MAX_BOTTOM_PRI_BACKGROUND_COMPACTIONS) {
14989     return HA_EXIT_FAILURE;
14990   }
14991   RDB_MUTEX_LOCK_CHECK(rdb_bottom_pri_background_compactions_resize_mutex);
14992   if (rocksdb_max_bottom_pri_background_compactions != new_value) {
14993     if (new_value == 0) {
14994       my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET",
14995                "max_bottom_pri_background_compactions can't be changed to 0 "
14996                "online.");
14997       RDB_MUTEX_UNLOCK_CHECK(
14998           rdb_bottom_pri_background_compactions_resize_mutex);
14999       return HA_EXIT_FAILURE;
15000     }
15001     rocksdb_set_max_bottom_pri_background_compactions_internal(new_value);
15002   }
15003   *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
15004   RDB_MUTEX_UNLOCK_CHECK(rdb_bottom_pri_background_compactions_resize_mutex);
15005   return HA_EXIT_SUCCESS;
15006 }
15007 
15008 static void rocksdb_set_bytes_per_sync(
15009     THD *thd MY_ATTRIBUTE((__unused__)),
15010     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
15011     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
15012   assert(save != nullptr);
15013   assert(rocksdb_db_options != nullptr);
15014   assert(rocksdb_db_options->env != nullptr);
15015 
15016   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
15017 
15018   const ulonglong new_val = *static_cast<const ulonglong *>(save);
15019 
15020   if (rocksdb_db_options->bytes_per_sync != new_val) {
15021     rocksdb_db_options->bytes_per_sync = new_val;
15022     rocksdb::Status s =
15023         rdb->SetDBOptions({{"bytes_per_sync", std::to_string(new_val)}});
15024 
15025     if (!s.ok()) {
15026       /* NO_LINT_DEBUG */
15027       sql_print_warning(
          "MyRocks: failed to update bytes_per_sync. "
15029           "Status code = %d, status = %s.",
15030           s.code(), s.ToString().c_str());
15031     }
15032   }
15033 
15034   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
15035 }
15036 
15037 static void rocksdb_set_wal_bytes_per_sync(
15038     THD *thd MY_ATTRIBUTE((__unused__)),
15039     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
15040     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
15041   assert(save != nullptr);
15042   assert(rocksdb_db_options != nullptr);
15043   assert(rocksdb_db_options->env != nullptr);
15044 
15045   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
15046 
15047   const ulonglong new_val = *static_cast<const ulonglong *>(save);
15048 
15049   if (rocksdb_db_options->wal_bytes_per_sync != new_val) {
15050     rocksdb_db_options->wal_bytes_per_sync = new_val;
15051     rocksdb::Status s =
15052         rdb->SetDBOptions({{"wal_bytes_per_sync", std::to_string(new_val)}});
15053 
15054     if (!s.ok()) {
15055       /* NO_LINT_DEBUG */
15056       sql_print_warning(
          "MyRocks: failed to update wal_bytes_per_sync. "
15058           "Status code = %d, status = %s.",
15059           s.code(), s.ToString().c_str());
15060     }
15061   }
15062 
15063   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
15064 }
15065 
15066 /*
  Validates and updates the block cache size via the sys_var::check path.
  SetCapacity may take seconds when shrinking the block cache, and
  sys_var::update holds the LOCK_global_system_variables mutex, so
  updating the block cache size is done in the check path instead.
15071 */
15072 static int rocksdb_validate_set_block_cache_size(
15073     THD *thd MY_ATTRIBUTE((__unused__)),
15074     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
15075     void *var_ptr, struct st_mysql_value *value) {
15076   assert(value != nullptr);
15077 
15078   long long new_value;
15079 
15080   /* value is NULL */
15081   if (value->val_int(value, &new_value)) {
15082     return HA_EXIT_FAILURE;
15083   }
15084 
15085   if (new_value < RDB_MIN_BLOCK_CACHE_SIZE ||
15086       (uint64_t)new_value > (uint64_t)LLONG_MAX) {
15087     return HA_EXIT_FAILURE;
15088   }
15089 
15090   RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex);
15091   const rocksdb::BlockBasedTableOptions &table_options =
15092       rdb_get_table_options();
15093 
15094   if (rocksdb_block_cache_size != new_value && table_options.block_cache) {
15095     table_options.block_cache->SetCapacity(new_value);
15096   }
15097   *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
15098   RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex);
15099   return HA_EXIT_SUCCESS;
15100 }
15101 
15102 static int rocksdb_validate_update_cf_options(
15103     THD *thd MY_ATTRIBUTE((__unused__)),
15104     struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
15105     struct st_mysql_value *value) {
15106   char buff[STRING_BUFFER_USUAL_SIZE];
15107   const char *str;
15108   int length;
15109   length = sizeof(buff);
15110   str = value->val_str(value, buff, &length);
15111   *static_cast<const char **>(save) = str;
15112 
15113   if (str == nullptr) {
15114     return HA_EXIT_SUCCESS;
15115   }
15116 
15117   Rdb_cf_options::Name_to_config_t option_map;
15118 
  // Basic sanity checking and parsing of the options into a map. If this
  // fails then there's no point in proceeding.
15121   if (!Rdb_cf_options::parse_cf_options(str, &option_map)) {
15122     my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str);
15123     return HA_EXIT_FAILURE;
15124   }
15125   // Loop through option_map and check if all specified CFs exist.
15126   std::vector<const std::string *> unknown_cfs;
15127   for (const auto &option : option_map) {
15128     if (!cf_manager.get_cf(option.first)) {
15129       unknown_cfs.push_back(&(option.first));
15130     }
15131   }
15132 
15133   if (!unknown_cfs.empty()) {
15134     std::string err(str);
15135     err.append(" Unknown CF: ");
15136     bool first = true;
15137     for (const auto cf : unknown_cfs) {
15138       if (first)
15139         first = false;
15140       else
15141         err.append(", ");
15142       err.append(*cf);
15143     }
15144     my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options",
15145              err.c_str());
15146     return HA_EXIT_FAILURE;
15147   }
15148   return HA_EXIT_SUCCESS;
15149 }
15150 
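/*
  Update handler for rocksdb_update_cf_options. Applies the per-column-family
  option string that already passed rocksdb_validate_update_cf_options above
  (for example 'cf1={write_buffer_size=8m};' -- format as accepted by
  Rdb_cf_options::parse_cf_options) and refreshes the cached options so that
  INFORMATION_SCHEMA.ROCKSDB_CF_OPTIONS reflects the new values.
*/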
15151 static void rocksdb_set_update_cf_options(
15152     THD *const thd MY_ATTRIBUTE((__unused__)),
15153     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
15154     void *const var_ptr, const void *const save) {
15155   const char *const val = *static_cast<const char *const *>(save);
15156 
15157   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
15158 
15159   if (!val) {
15160     *reinterpret_cast<char **>(var_ptr) = nullptr;
15161     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
15162     return;
15163   }
15164 
15165   assert(val != nullptr);
15166 
  // Reset the pointers regardless of how much success we had with updating
  // the CF options. This results in consistent behavior and avoids dealing
  // with cases where only a subset of CFs was successfully updated.
15170   *static_cast<const char **>(var_ptr) =
15171       *static_cast<const char *const *>(save);
15172 
15173   // Do the real work of applying the changes.
15174   Rdb_cf_options::Name_to_config_t option_map;
15175 
15176   // This should never fail, because of rocksdb_validate_update_cf_options
15177   if (!Rdb_cf_options::parse_cf_options(val, &option_map)) {
15178     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
15179     return;
15180   }
15181 
15182   // For each CF we have, see if we need to update any settings.
15183   for (const auto &cf_name : cf_manager.get_cf_names()) {
15184     assert(!cf_name.empty());
15185 
15186     std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
15187         cf_manager.get_cf(cf_name);
15188 
15189     if (!cfh) {
15190       // NO_LINT_DEBUG
15191       sql_print_information(
15192           "Skip updating options for cf %s because the cf has been dropped.",
15193           cf_name.c_str());
15194       continue;
15195     }
15196 
15197     const auto it = option_map.find(cf_name);
15198     std::string per_cf_options = (it != option_map.end()) ? it->second : "";
15199 
15200     if (!per_cf_options.empty()) {
15201       Rdb_cf_options::Name_to_config_t opt_map;
15202       rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map);
15203 
15204       if (s != rocksdb::Status::OK()) {
15205         // NO_LINT_DEBUG
15206         sql_print_warning(
15207             "MyRocks: failed to convert the options for column "
15208             "family '%s' to a map. %s",
15209             cf_name.c_str(), s.ToString().c_str());
15210       } else {
15211         assert(rdb != nullptr);
15212 
15213         // Finally we can apply the options.
15214         // If cf_manager.drop_cf() has been called at this point, SetOptions()
15215         // will still succeed. The options data will only be cleared when
15216         // the CF handle object is destroyed.
15217         s = rdb->SetOptions(cfh.get(), opt_map);
15218 
15219         if (s != rocksdb::Status::OK()) {
15220           // NO_LINT_DEBUG
15221           sql_print_warning(
15222               "MyRocks: failed to apply the options for column "
15223               "family '%s'. %s",
15224               cf_name.c_str(), s.ToString().c_str());
15225         } else {
15226           // NO_LINT_DEBUG
15227           sql_print_information(
15228               "MyRocks: options for column family '%s' "
15229               "have been successfully updated.",
15230               cf_name.c_str());
15231 
          // Make sure that the in-memory copy of the options is consistent as
          // well by updating the stored CF options. This also ensures that the
          // CF options are correctly reflected in the
          // INFORMATION_SCHEMA.ROCKSDB_CF_OPTIONS table.
15236           rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh.get());
15237           std::string updated_options;
15238 
15239           s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options,
15240                                                         cf_options);
15241 
15242           assert(s == rocksdb::Status::OK());
15243           assert(!updated_options.empty());
15244 
15245           cf_manager.update_options_map(cf_name, updated_options);
15246         }
15247       }
15248     }
15249   }
15250 
  // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to
  // release the previously allocated value.
15253 
15254   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
15255 }
15256 
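/* Ask the MyRocks background thread to persist statistics. */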
15257 void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
15258 
15259 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
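/*
  Replication applier hooks. They are called around the application of
  row-based delete/update events; the m_in_rpl_* flags let the rest of the
  handler know that it is executing inside such an event, which is where
  read-free replication may skip the usual row lookup.
*/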
15260 void ha_rocksdb::rpl_before_delete_rows() {
15261   DBUG_ENTER_FUNC();
15262 
15263   m_in_rpl_delete_rows = true;
15264 
15265   DBUG_VOID_RETURN;
15266 }
15267 
15268 void ha_rocksdb::rpl_after_delete_rows() {
15269   DBUG_ENTER_FUNC();
15270 
15271   m_in_rpl_delete_rows = false;
15272 
15273   DBUG_VOID_RETURN;
15274 }
15275 
15276 void ha_rocksdb::rpl_before_update_rows() {
15277   DBUG_ENTER_FUNC();
15278 
15279   m_in_rpl_update_rows = true;
15280 
15281   DBUG_VOID_RETURN;
15282 }
15283 
15284 void ha_rocksdb::rpl_after_update_rows() {
15285   DBUG_ENTER_FUNC();
15286 
15287   m_in_rpl_update_rows = false;
15288 
15289   DBUG_VOID_RETURN;
15290 }
15291 
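/*
  Tell the replication applier whether it must look up the existing row before
  applying an update/delete event. With read-free replication that lookup is
  skipped.
*/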
15292 bool ha_rocksdb::rpl_lookup_rows() { return !use_read_free_rpl(); }
15293 
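/*
  Whether this table is eligible for read-free replication. In this build the
  per-table check is compiled out (see the #if below), so every table is
  treated as eligible.
*/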
15294 bool ha_rocksdb::is_read_free_rpl_table() const {
15295 #if 1  // Percona Server disabled rocksdb_read_free_rpl_tables as it's dangerous to use
15296   return true;
15297 #else
15298   return table->s && m_tbl_def->m_is_read_free_rpl_table;
15299 #endif
15300 }
15301 
/**
  @brief
  Determine whether Read Free Replication can be used. Returning true means
  Read Free Replication can be used.
*/
15307 bool ha_rocksdb::use_read_free_rpl() const {
15308   DBUG_ENTER_FUNC();
15309 
15310   if (!ha_thd()->rli_slave || table->triggers || !is_read_free_rpl_table()) {
15311     DBUG_RETURN(false);
15312   }
15313 
15314   switch (rocksdb_read_free_rpl) {
15315     case read_free_rpl_type::OFF:
15316       DBUG_RETURN(false);
15317     case read_free_rpl_type::PK_ONLY:
15318       DBUG_RETURN(!has_hidden_pk(table) && table->s->keys == 1);
15319     case read_free_rpl_type::PK_SK:
15320       DBUG_RETURN(!has_hidden_pk(table));
15321   }
15322 
15323   assert(false);
15324   DBUG_RETURN(false);
15325 }
15326 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
15327 
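/*
  Estimate the cost of reading 'rows' rows in 'ranges' ranges through the given
  index. Primary key range scans are covering in MyRocks (the row data is
  stored together with the key), so a cheap heuristic of rows / 20 + 1 is used;
  scans through any other index fall back to the default handler estimate.
*/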
15328 double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
15329   DBUG_ENTER_FUNC();
15330 
15331   if (index != table->s->primary_key) {
    /* Non-covering index range scan */
15333     DBUG_RETURN(handler::read_time(index, ranges, rows));
15334   }
15335 
15336   DBUG_RETURN((rows / 20.0) + 1);
15337 }
15338 
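/*
  Convert RocksDB's "busy" status into a lock deadlock error before reporting
  it, so the client receives a standard, retryable error rather than an
  engine-specific one.
*/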
15339 void ha_rocksdb::print_error(int error, myf errflag) {
15340   if (error == HA_ERR_ROCKSDB_STATUS_BUSY) {
15341     error = HA_ERR_LOCK_DEADLOCK;
15342   }
15343   handler::print_error(error, errflag);
15344 }
15345 
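/*
  Full path of the marker file that flags detected data corruption inside the
  RocksDB data directory.
*/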
15346 std::string rdb_corruption_marker_file_name() {
15347   std::string ret(rocksdb_datadir);
15348   ret.append("/ROCKSDB_CORRUPTED");
15349   return ret;
15350 }
15351 
15352 }  // namespace myrocks
15353 
15354 /*
15355   Register the storage engine plugin outside of myrocks namespace
15356   so that mysql_declare_plugin does not get confused when it does
15357   its name generation.
15358 */
15359 
15360 struct st_mysql_storage_engine rocksdb_storage_engine = {
15361     MYSQL_HANDLERTON_INTERFACE_VERSION};
15362 
15363 mysql_declare_plugin(rocksdb_se){
15364     MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
15365     &rocksdb_storage_engine,           /* Plugin Descriptor */
15366     "ROCKSDB",                         /* Plugin Name */
15367     "Monty Program Ab",                /* Plugin Author */
15368     "RocksDB storage engine",          /* Plugin Description */
15369     PLUGIN_LICENSE_GPL,                /* Plugin Licence */
15370     myrocks::rocksdb_init_func,        /* Plugin Entry Point */
15371     myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
15372     0x0001,                            /* version number (0.1) */
15373     myrocks::rocksdb_status_vars,      /* status variables */
15374     myrocks::rocksdb_system_variables, /* system variables */
15375     nullptr,                           /* config options */
15376     0,                                 /* flags */
15377 },
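    /* INFORMATION_SCHEMA tables provided by the MyRocks plugin */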
15378     myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
15379     myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
15380     myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
15381     myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
15382     myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map,
15383     myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info,
15384     myrocks::rdb_i_s_deadlock_info mysql_declare_plugin_end;
15385