1 /*
2    Copyright (c) 2012, Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
16 
17 #ifdef USE_PRAGMA_IMPLEMENTATION
18 #pragma implementation  // gcc: Class implementation
19 #endif
20 
21 #define MYSQL_SERVER 1
22 
23 /* For use of 'PRIu64': */
24 #define __STDC_FORMAT_MACROS
25 
26 #include <my_global.h>
27 
28 #include <inttypes.h>
29 
30 /* The C++ file's header */
31 #include "./ha_rocksdb.h"
32 
33 /* C++ standard header files */
34 #include <inttypes.h>
35 #include <algorithm>
36 #include <limits>
37 #include <map>
38 #include <queue>
39 #include <set>
40 #include <string>
41 #include <vector>
42 
43 /* MySQL includes */
44 #include <mysql/psi/mysql_table.h>
45 // #include <thread_pool_priv.h>
46 #include <mysys_err.h>
47 #include "./debug_sync.h"
48 #include "./my_bit.h"
49 #include "./my_stacktrace.h"
50 #include "./my_sys.h"
51 #include "./sql_audit.h"
52 #include "./sql_table.h"
53 #include "./sql_hset.h"
54 #ifdef MARIAROCKS_NOT_YET
55 #endif
56 
57 /* RocksDB includes */
58 #include "monitoring/histogram.h"
59 #include "rocksdb/compaction_filter.h"
60 #include "rocksdb/env.h"
61 #include "rocksdb/memory_allocator.h"
62 #include "rocksdb/persistent_cache.h"
63 #include "rocksdb/rate_limiter.h"
64 #include "rocksdb/slice_transform.h"
65 #include "rocksdb/thread_status.h"
66 #include "rocksdb/utilities/checkpoint.h"
67 #include "rocksdb/utilities/convenience.h"
68 #include "rocksdb/utilities/memory_util.h"
69 #include "rocksdb/utilities/sim_cache.h"
70 #include "rocksdb/utilities/write_batch_with_index.h"
71 #include "util/stop_watch.h"
72 #include "./rdb_source_revision.h"
73 
74 // MariaRocks: this is needed to access RocksDB debug syncpoints:
75 #include "test_util/sync_point.h"
76 
77 /* MyRocks includes */
78 #include "./event_listener.h"
79 #include "./ha_rocksdb_proto.h"
80 #include "./logger.h"
81 #include "./nosql_access.h"
82 #include "./rdb_cf_manager.h"
83 #include "./rdb_cf_options.h"
84 #include "./rdb_converter.h"
85 #include "./rdb_datadic.h"
86 #include "./rdb_i_s.h"
87 #include "./rdb_index_merge.h"
88 #include "./rdb_mutex_wrapper.h"
89 #include "./rdb_psi.h"
90 #include "./rdb_threads.h"
91 #include "./rdb_mariadb_server_port.h"
92 
93 // Internal MySQL APIs not exposed in any header.
94 extern "C" {
95 /**
96   Mark transaction to rollback and mark error as fatal to a sub-statement.
97   @param  thd   Thread handle
98   @param  all   TRUE <=> rollback main transaction.
99 */
100 void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
101 
102 /**
103  *   Get the user thread's binary logging format
104  *   @param thd  user thread
105  *   @return Value to be used as index into the binlog_format_names array
106  */
107 int thd_binlog_format(const MYSQL_THD thd);
108 
109 /**
110  *   Check if binary logging is filtered for thread's current db.
111  *   @param  thd   Thread handle
112  *   @retval 1 the query is not filtered, 0 otherwise.
113  */
114 bool thd_binlog_filter_ok(const MYSQL_THD thd);
115 }
116 
117 extern my_bool opt_core_file;
118 
119 // Needed in rocksdb_init_func
120 void ignore_db_dirs_append(const char *dirname_arg);
121 
122 
123 namespace myrocks {
124 
125 static st_global_stats global_stats;
126 static st_export_stats export_stats;
127 static st_memory_stats memory_stats;
128 static st_io_stall_stats io_stall_stats;
129 
130 const std::string DEFAULT_CF_NAME("default");
131 const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
132 const std::string PER_INDEX_CF_NAME("$per_index_cf");
133 
134 static std::vector<GL_INDEX_ID> rdb_indexes_to_recalc;
135 
136 #ifdef MARIADB_NOT_YET
/*
  An explicit snapshot created by the user (e.g. for backup tooling).
  Instances register themselves in a process-wide id -> weak_ptr map so a
  snapshot can be looked up by id later; the map holds weak references only,
  so the snapshot disappears from the map when the last shared_ptr owner
  releases it (the destructor erases the entry).
  All accesses to the shared map/counter are serialized by
  explicit_snapshot_mutex.
*/
class Rdb_explicit_snapshot : public explicit_snapshot {
 public:
  /*
    Wrap an existing RocksDB snapshot, assign it a fresh id (written back
    into *ss_info) and register it in the global map.
    @return the new snapshot object, or nullptr on failure
  */
  static std::shared_ptr<Rdb_explicit_snapshot> create(
      snapshot_info_st *ss_info, rocksdb::DB *db,
      const rocksdb::Snapshot *snapshot) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto s = std::unique_ptr<rocksdb::ManagedSnapshot>(
        new rocksdb::ManagedSnapshot(db, snapshot));
    if (!s) {
      return nullptr;
    }
    ss_info->snapshot_id = ++explicit_snapshot_counter;
    auto ret = std::make_shared<Rdb_explicit_snapshot>(*ss_info, std::move(s));
    if (!ret) {
      return nullptr;
    }
    explicit_snapshots[ss_info->snapshot_id] = ret;
    return ret;
  }

  /* Human-readable dump of all live explicit snapshots (for diagnostics). */
  static std::string dump_snapshots() {
    std::string str;
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    for (const auto &elem : explicit_snapshots) {
      // lock() must succeed: the destructor removes the entry under the
      // same mutex before the object dies.
      const auto &ss = elem.second.lock();
      DBUG_ASSERT(ss != nullptr);
      const auto &info = ss->ss_info;
      str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) +
             "\nBinlog File: " + info.binlog_file +
             "\nBinlog Pos: " + std::to_string(info.binlog_pos) +
             "\nGtid Executed: " + info.gtid_executed + "\n";
    }

    return str;
  }

  /* Look up a snapshot by id; returns nullptr if the id is unknown. */
  static std::shared_ptr<Rdb_explicit_snapshot> get(
      const ulonglong snapshot_id) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto elem = explicit_snapshots.find(snapshot_id);
    if (elem == explicit_snapshots.end()) {
      return nullptr;
    }
    return elem->second.lock();
  }

  /* Underlying RocksDB snapshot wrapper (non-owning pointer). */
  rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); }

  Rdb_explicit_snapshot(snapshot_info_st ss_info,
                        std::unique_ptr<rocksdb::ManagedSnapshot> &&snapshot)
      : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {}

  /* Deregister from the global map under the mutex before destruction. */
  virtual ~Rdb_explicit_snapshot() {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    explicit_snapshots.erase(ss_info.snapshot_id);
  }

 private:
  std::unique_ptr<rocksdb::ManagedSnapshot> snapshot;

  /* Guards the counter and the map below. */
  static std::mutex explicit_snapshot_mutex;
  static ulonglong explicit_snapshot_counter;
  static std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
      explicit_snapshots;
};

std::mutex Rdb_explicit_snapshot::explicit_snapshot_mutex;
ulonglong Rdb_explicit_snapshot::explicit_snapshot_counter = 0;
std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
    Rdb_explicit_snapshot::explicit_snapshots;
207 #endif
208 
209 /**
210   Updates row counters based on the table type and operation type.
211 */
update_row_stats(const operation_type & type)212 void ha_rocksdb::update_row_stats(const operation_type &type) {
213   DBUG_ASSERT(type < ROWS_MAX);
214   // Find if we are modifying system databases.
215   if (table->s && m_tbl_def->m_is_mysql_system_table) {
216     global_stats.system_rows[type].inc();
217   } else {
218     global_stats.rows[type].inc();
219   }
220 }
221 
222 void dbug_dump_database(rocksdb::DB *db);
223 static handler *rocksdb_create_handler(my_core::handlerton *hton,
224                                        my_core::TABLE_SHARE *table_arg,
225                                        my_core::MEM_ROOT *mem_root);
226 
getCompactRangeOptions(int concurrency=0)227 static rocksdb::CompactRangeOptions getCompactRangeOptions(
228     int concurrency = 0) {
229   rocksdb::CompactRangeOptions compact_range_options;
230   compact_range_options.bottommost_level_compaction =
231       rocksdb::BottommostLevelCompaction::kForce;
232   compact_range_options.exclusive_manual_compaction = false;
233   if (concurrency > 0) {
234     compact_range_options.max_subcompactions = concurrency;
235   }
236   return compact_range_options;
237 }
238 
239 ///////////////////////////////////////////////////////////
240 // Parameters and settings
241 ///////////////////////////////////////////////////////////
242 static char *rocksdb_default_cf_options = nullptr;
243 static char *rocksdb_override_cf_options = nullptr;
244 static char *rocksdb_update_cf_options = nullptr;
245 
246 ///////////////////////////////////////////////////////////
247 // Globals
248 ///////////////////////////////////////////////////////////
249 handlerton *rocksdb_hton;
250 
251 rocksdb::TransactionDB *rdb = nullptr;
252 rocksdb::HistogramImpl *commit_latency_stats = nullptr;
253 
254 static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
255 static std::unique_ptr<rocksdb::Env> flashcache_aware_env;
256 static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;
257 
258 Rdb_dict_manager dict_manager;
259 Rdb_cf_manager cf_manager;
260 Rdb_ddl_manager ddl_manager;
261 Rdb_binlog_manager binlog_manager;
262 
263 #if !defined(_WIN32) && !defined(__APPLE__)
264 Rdb_io_watchdog *io_watchdog = nullptr;
265 #endif
266 /**
267   MyRocks background thread control
268   N.B. This is besides RocksDB's own background threads
269        (@see rocksdb::CancelAllBackgroundWork())
270 */
271 
272 static Rdb_background_thread rdb_bg_thread;
273 
274 static Rdb_manual_compaction_thread rdb_mc_thread;
275 
276 // List of table names (using regex) that are exceptions to the strict
277 // collation check requirement.
278 Regex_list_handler *rdb_collation_exceptions;
279 
280 static const char **rdb_get_error_messages(int nr);
281 
rocksdb_flush_all_memtables()282 static void rocksdb_flush_all_memtables() {
283   const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
284   for (const auto &cf_handle : cf_manager.get_all_cf()) {
285     rdb->Flush(rocksdb::FlushOptions(), cf_handle);
286   }
287 }
288 
/* No-op "update" callback for the delete-column-family sysvar; the actual
   work (currently disabled) lives in rocksdb_delete_column_family(). */
static void rocksdb_delete_column_family_stub(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, const void *const /* save */) {}
292 
/*
  "Check" callback for the delete-column-family sysvar.
  @return HA_EXIT_FAILURE unconditionally for now (see comment below); the
          code after the early return is the original implementation, kept
          so it can be restored once CF create/drop is race-free.
*/
static int rocksdb_delete_column_family(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, struct st_mysql_value *const value) {
  // Return failure for now until the race condition between creating
  // CF and deleting CF is resolved
  return HA_EXIT_FAILURE;

  // NOTE: everything below is intentionally unreachable (see above).
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  // val_str() may use buff as scratch space; cf is the CF name to drop.
  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto &cf_manager = rdb_get_cf_manager();
    auto ret = cf_manager.drop_cf(cf);
    if (ret == HA_EXIT_SUCCESS) {
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Dropped column family: %s\n", cf);
    } else {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Failed to drop column family: %s, error: %d\n",
                      cf, ret);
    }

    return ret;
  }

  return HA_EXIT_SUCCESS;
}
322 
323 ///////////////////////////////////////////////////////////
324 // Hash map: table name => open table handler
325 ///////////////////////////////////////////////////////////
326 
327 namespace  // anonymous namespace = not visible outside this source file
328 {
329 
330 typedef Hash_set<Rdb_table_handler> Rdb_table_set;
331 
/*
  Tracks the Rdb_table_handler of every currently open table, keyed by
  table name.  Lookups/updates go through get_table_handler() /
  release_table_handler() (defined elsewhere in this file), which are
  expected to take m_mutex.
*/
class Rdb_open_tables_map {
 private:
  /* Hash table used to track the handlers of open tables */
  std::unordered_map<std::string, Rdb_table_handler *> m_table_map;

  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

 public:
  /* Initialize the (empty) map and its mutex; called at plugin startup. */
  void init() {
    m_table_map.clear();
    mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &m_mutex, MY_MUTEX_INIT_FAST);
  }

  /* Drop all entries and destroy the mutex; called at plugin shutdown. */
  void free() {
    m_table_map.clear();
    mysql_mutex_destroy(&m_mutex);
  }
  // Number of tracked handlers.  NOTE(review): reads the map without holding
  // m_mutex — presumably only used where an approximate value is acceptable;
  // confirm against callers.
  size_t count() { return m_table_map.size(); }

  Rdb_table_handler *get_table_handler(const char *const table_name);
  void release_table_handler(Rdb_table_handler *const table_handler);

  std::vector<std::string> get_table_names(void) const;
};
357 
358 }  // anonymous namespace
359 
360 static Rdb_open_tables_map rdb_open_tables;
361 
/* Strip any trailing '/' separators from a directory path.
   "/a/b///" -> "/a/b"; a string of only slashes (or "") becomes "". */
static std::string rdb_normalize_dir(std::string dir) {
  const auto last_keep = dir.find_last_not_of('/');
  dir.erase(last_keep == std::string::npos ? 0 : last_keep + 1);
  return dir;
}
368 
/*
  "Check" callback for the rocksdb_create_checkpoint sysvar: creates a
  RocksDB checkpoint (a hard-linked copy of the database) in the directory
  supplied as the new variable value.
  @return HA_EXIT_SUCCESS if the checkpoint was created,
          HA_EXIT_FAILURE otherwise (no value given, DB not open, or a
          RocksDB error — which is also reported via my_error()).
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      // Strip trailing '/' so RocksDB receives a canonical path.
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propogate up to the client.
      int rc __attribute__((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        // The Checkpoint object is owned by us and no longer needed once
        // CreateCheckpoint has run, regardless of success.
        delete checkpoint;
        if (status.ok()) {
          // NO_LINT_DEBUG
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
408 
409 /* This method is needed to indicate that the
410    ROCKSDB_CREATE_CHECKPOINT command is not read-only */
/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only.
   The real work is done in rocksdb_create_checkpoint(); this update
   callback intentionally does nothing. */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
415 
/* No-op update callback paired with rocksdb_force_flush_memtable_now(),
   which performs the flush from the sysvar's check phase. */
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
419 
/*
  "Check" callback for rocksdb_force_flush_memtable_now: flushes the
  memtables of all column families to SST files.
  @return HA_EXIT_SUCCESS always
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
428 
/* No-op update callback paired with
   rocksdb_force_flush_memtable_and_lzero_now(), which does the work in the
   sysvar's check phase. */
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
432 
/*
  "Check" callback for rocksdb_force_flush_memtable_and_lzero_now: flushes
  all memtables, then uses CompactFiles() to push every L0 file of every
  column family down to L1.
  @return HA_EXIT_SUCCESS if all column families were processed,
          HA_EXIT_FAILURE if any CF still failed after all retries or a
          non-retriable error occurred.
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  // i is tested against max_attempts after the loop, so it is declared here.
  int i, max_attempts = 3, num_errors = 0;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    for (i = 0; i < max_attempts; i++) {
      // Re-read the metadata on every attempt: the L0 file set can change
      // if an automatic compaction runs concurrently.
      rdb->GetColumnFamilyMetaData(cf_handle, &metadata);
      cf_handle->GetDescriptor(&cf_descr);
      c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

      DBUG_ASSERT(metadata.levels[0].level == 0);
      std::vector<std::string> file_names;
      for (auto &file : metadata.levels[0].files) {
        file_names.emplace_back(file.db_path + file.name);
      }

      // No L0 files left for this CF — nothing to compact.
      if (file_names.empty()) {
        break;
      }

      rocksdb::Status s;
      // Output level 1: move the collected L0 files down one level.
      s = rdb->CompactFiles(c_options, cf_handle, file_names, 1);

      // Due to a race, it's possible for CompactFiles to collide
      // with auto compaction, causing an error to return
      // regarding file not found. In that case, retry.
      if (s.IsInvalidArgument()) {
        continue;
      }

      if (!s.ok() && !s.IsAborted()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
        return HA_EXIT_FAILURE;
      }
      break;
    }
    // i == max_attempts means every retry hit the race above.
    if (i == max_attempts) {
      num_errors++;
    }
  }

  return num_errors == 0 ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
}
486 
487 static void rocksdb_drop_index_wakeup_thread(
488     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
489     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
490     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save);
491 
492 static my_bool rocksdb_pause_background_work = 0;
493 static mysql_mutex_t rdb_sysvars_mutex;
494 static mysql_mutex_t rdb_block_cache_resize_mutex;
495 
/*
  Update callback for rocksdb_pause_background_work.
  Pauses or resumes RocksDB's background work when the requested value
  differs from the cached one; rdb_sysvars_mutex serializes concurrent SETs
  so the cached flag and the RocksDB state cannot diverge.
*/
static void rocksdb_set_pause_background_work(
    my_core::THD *const,
    struct st_mysql_sys_var *const,
    void *const, const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const my_bool pause_requested = *static_cast<const my_bool *>(save);
  // Only act on actual transitions; Pause/Continue calls are refcounted by
  // RocksDB, so issuing them unconditionally would unbalance the count.
  if (rocksdb_pause_background_work != pause_requested) {
    if (pause_requested) {
      rdb->PauseBackgroundWork();
    } else {
      rdb->ContinueBackgroundWork();
    }
    rocksdb_pause_background_work = pause_requested;
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
512 
513 static void rocksdb_set_compaction_options(THD *thd,
514                                            struct st_mysql_sys_var *var,
515                                            void *var_ptr, const void *save);
516 
517 static void rocksdb_set_table_stats_sampling_pct(THD *thd,
518                                                  struct st_mysql_sys_var *var,
519                                                  void *var_ptr,
520                                                  const void *save);
521 
522 static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
523                                                    struct st_mysql_sys_var *var,
524                                                    void *var_ptr,
525                                                    const void *save);
526 
527 static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd,
528                                                    struct st_mysql_sys_var *var,
529                                                    void *var_ptr,
530                                                    const void *save);
531 
532 static void rocksdb_set_delayed_write_rate(THD *thd,
533                                            struct st_mysql_sys_var *var,
534                                            void *var_ptr, const void *save);
535 
536 static void rocksdb_set_max_latest_deadlocks(THD *thd,
537                                              struct st_mysql_sys_var *var,
538                                              void *var_ptr, const void *save);
539 
540 static void rdb_set_collation_exception_list(const char *exception_list);
541 static void rocksdb_set_collation_exception_list(THD *thd,
542                                                  struct st_mysql_sys_var *var,
543                                                  void *var_ptr,
544                                                  const void *save);
545 
546 static int rocksdb_validate_update_cf_options(THD *thd,
547                                               struct st_mysql_sys_var *var,
548                                               void *save,
549                                               st_mysql_value *value);
550 
551 static void rocksdb_set_update_cf_options(THD *thd,
552                                           struct st_mysql_sys_var *var,
553                                           void *var_ptr, const void *save);
554 
555 static int rocksdb_check_bulk_load(
556     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
557     void *save, struct st_mysql_value *value);
558 
559 static int rocksdb_check_bulk_load_allow_unsorted(
560     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
561     void *save, struct st_mysql_value *value);
562 
563 static void rocksdb_set_max_background_jobs(THD *thd,
564                                             struct st_mysql_sys_var *const var,
565                                             void *const var_ptr,
566                                             const void *const save);
567 static void rocksdb_set_bytes_per_sync(THD *thd,
568                                        struct st_mysql_sys_var *const var,
569                                        void *const var_ptr,
570                                        const void *const save);
571 static void rocksdb_set_wal_bytes_per_sync(THD *thd,
572                                            struct st_mysql_sys_var *const var,
573                                            void *const var_ptr,
574                                            const void *const save);
575 static int rocksdb_validate_set_block_cache_size(
576     THD *thd, struct st_mysql_sys_var *const var, void *var_ptr,
577     struct st_mysql_value *value);
578 //////////////////////////////////////////////////////////////////////////////
579 // Options definitions
580 //////////////////////////////////////////////////////////////////////////////
581 static long long rocksdb_block_cache_size;
582 static long long rocksdb_sim_cache_size;
583 static my_bool rocksdb_use_clock_cache;
584 static double rocksdb_cache_high_pri_pool_ratio;
585 static my_bool rocksdb_cache_dump;
586 /* Use unsigned long long instead of uint64_t because of MySQL compatibility */
587 static unsigned long long  // NOLINT(runtime/int)
588     rocksdb_rate_limiter_bytes_per_sec;
589 static unsigned long long  // NOLINT(runtime/int)
590     rocksdb_sst_mgr_rate_bytes_per_sec;
591 static unsigned long long rocksdb_delayed_write_rate;
592 static uint32_t rocksdb_max_latest_deadlocks;
593 static unsigned long  // NOLINT(runtime/int)
594     rocksdb_persistent_cache_size_mb;
595 static ulong rocksdb_info_log_level;
596 static char *rocksdb_wal_dir;
597 static char *rocksdb_persistent_cache_path;
598 static ulong rocksdb_index_type;
599 static uint32_t rocksdb_flush_log_at_trx_commit;
600 static uint32_t rocksdb_debug_optimizer_n_rows;
601 static my_bool rocksdb_force_compute_memtable_stats;
602 static uint32_t rocksdb_force_compute_memtable_stats_cachetime;
603 static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
604 static uint32_t rocksdb_wal_recovery_mode;
605 static uint32_t rocksdb_stats_level;
606 static uint32_t rocksdb_access_hint_on_compaction_start;
607 static char *rocksdb_compact_cf_name;
608 static char *rocksdb_delete_cf_name;
609 static char *rocksdb_checkpoint_name;
610 static my_bool rocksdb_signal_drop_index_thread;
611 static my_bool rocksdb_signal_remove_mariabackup_checkpoint;
612 static my_bool rocksdb_strict_collation_check = 1;
613 static my_bool rocksdb_ignore_unknown_options = 1;
614 static my_bool rocksdb_enable_2pc = 0;
615 static char *rocksdb_strict_collation_exceptions;
616 static my_bool rocksdb_collect_sst_properties = 1;
617 static my_bool rocksdb_force_flush_memtable_now_var = 0;
618 static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = 0;
619 static my_bool rocksdb_enable_ttl = 1;
620 static my_bool rocksdb_enable_ttl_read_filtering = 1;
621 static int rocksdb_debug_ttl_rec_ts = 0;
622 static int rocksdb_debug_ttl_snapshot_ts = 0;
623 static int rocksdb_debug_ttl_read_filter_ts = 0;
624 static my_bool rocksdb_debug_ttl_ignore_pk = 0;
625 static my_bool rocksdb_reset_stats = 0;
626 static uint32_t rocksdb_io_write_timeout_secs = 0;
627 static uint32_t rocksdb_seconds_between_stat_computes = 3600;
628 static long long rocksdb_compaction_sequential_deletes = 0l;
629 static long long rocksdb_compaction_sequential_deletes_window = 0l;
630 static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
631 static uint32_t rocksdb_validate_tables = 1;
632 static char *rocksdb_datadir;
633 static uint32_t rocksdb_table_stats_sampling_pct;
634 static my_bool rocksdb_enable_bulk_load_api = 1;
635 static my_bool rocksdb_print_snapshot_conflict_queries = 0;
636 static my_bool rocksdb_large_prefix = 0;
637 static my_bool rocksdb_allow_to_start_after_corruption = 0;
638 static char* rocksdb_git_hash;
639 
640 uint32_t rocksdb_ignore_datadic_errors = 0;
641 
642 char *compression_types_val=
643   const_cast<char*>(get_rocksdb_supported_compression_types());
644 static unsigned long rocksdb_write_policy =
645     rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;
646 
647 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
648 char *rocksdb_read_free_rpl_tables;
649 std::mutex rocksdb_read_free_rpl_tables_mutex;
650 #if defined(HAVE_PSI_INTERFACE)
651 Regex_list_handler rdb_read_free_regex_handler(key_rwlock_read_free_rpl_tables);
652 #else
653 Regex_list_handler rdb_read_free_regex_handler;
654 #endif
655 enum read_free_rpl_type { OFF = 0, PK_ONLY, PK_SK };
656 static unsigned long rocksdb_read_free_rpl = read_free_rpl_type::OFF;
657 #endif
658 
659 static my_bool rocksdb_error_on_suboptimal_collation = 1;
660 static uint32_t rocksdb_stats_recalc_rate = 0;
661 static uint32_t rocksdb_debug_manual_compaction_delay = 0;
662 static uint32_t rocksdb_max_manual_compactions = 0;
663 static my_bool rocksdb_rollback_on_timeout = FALSE;
664 static my_bool rocksdb_enable_insert_with_update_caching = TRUE;
665 
666 std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
667 std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
668 std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
669 std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
670 std::atomic<uint64_t> rocksdb_manual_compactions_processed(0);
671 std::atomic<uint64_t> rocksdb_manual_compactions_running(0);
672 #ifndef DBUG_OFF
673 std::atomic<uint64_t> rocksdb_num_get_for_update_calls(0);
674 #endif
675 
676 
677 
678 /*
679   Remove directory with files in it.
680   Used to remove checkpoint created by mariabackup.
681 */
682 #ifdef _WIN32
683 #include <direct.h> /* unlink*/
684 #ifndef F_OK
685 #define F_OK 0
686 #endif
687 #endif
688 
rmdir_force(const char * dir)689 static int rmdir_force(const char *dir) {
690   if (access(dir, F_OK))
691     return true;
692 
693   char path[FN_REFLEN];
694   char sep[] = {FN_LIBCHAR, 0};
695   int err = 0;
696 
697   MY_DIR *dir_info = my_dir(dir, MYF(MY_DONT_SORT | MY_WANT_STAT));
698   if (!dir_info)
699     return 1;
700 
701   for (uint i = 0; i < dir_info->number_of_files; i++) {
702     FILEINFO *file = dir_info->dir_entry + i;
703 
704     strxnmov(path, sizeof(path), dir, sep, file->name, NULL);
705 
706     err = my_delete(path, 0);
707 
708     if (err) {
709       break;
710     }
711   }
712 
713   my_dirend(dir_info);
714 
715   if (!err)
716     err = rmdir(dir);
717 
718   return (err == 0) ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
719 }
720 
721 
/*
  Update callback for the sysvar that removes a leftover mariabackup
  checkpoint: deletes "<rocksdb_datadir>/mariabackup-checkpoint".
  The assigned value itself is ignored; setting the variable just triggers
  the cleanup.
*/
static void rocksdb_remove_mariabackup_checkpoint(
    my_core::THD *const,
    struct st_mysql_sys_var *const ,
    void *const var_ptr, const void *const) {
  std::string mariabackup_checkpoint_dir(rocksdb_datadir);

  mariabackup_checkpoint_dir.append("/mariabackup-checkpoint");

  // Fast path: if the path is a plain file, unlink() removes it and we
  // are done; for a directory it fails and we fall through.
  if (unlink(mariabackup_checkpoint_dir.c_str())  == 0)
    return;

  // Remove the directory with everything inside it (best effort — the
  // return value is intentionally ignored).
  rmdir_force(mariabackup_checkpoint_dir.c_str());
}
735 
736 
/*
  Build the default rocksdb::DBOptions MyRocks starts from (before any
  user-supplied option strings are applied).
*/
static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
  auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());

  o->create_if_missing = true;
  // Forward RocksDB events (flush/compaction etc.) to the MyRocks listener.
  o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
  o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
  o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
  o->max_open_files = -2;  // auto-tune to 50% open_files_limit

  // Separate WAL write queue and manual WAL flushing.  NOTE(review):
  // presumably required by the commit-ordering/2PC scheme used elsewhere in
  // this file — confirm before changing.
  o->two_write_queues = true;
  o->manual_wal_flush = true;
  return o;
}
750 
/* DBOptions contains Statistics and needs to be destructed last */
static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options =
    std::unique_ptr<rocksdb::BlockBasedTableOptions>(
        new rocksdb::BlockBasedTableOptions());
// Initialized via rdb_init_rocksdb_db_options(); many SYSVAR definitions
// below alias fields of this object directly, so it must outlive them.
static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options =
    rdb_init_rocksdb_db_options();

// Shared rate limiter installed into DBOptions when a byte/sec cap is set.
static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;

/* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */
static const char *write_policy_names[] = {"write_committed", "write_prepared",
                                           "write_unprepared", NullS};

static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1,
                                       "write_policy_typelib",
                                       write_policy_names, nullptr};

#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
/* This array needs to be kept up to date with myrocks::read_free_rpl_type */
static const char *read_free_rpl_names[] = {"OFF", "PK_ONLY", "PK_SK", NullS};

static TYPELIB read_free_rpl_typelib = {array_elements(read_free_rpl_names) - 1,
                                        "read_free_rpl_typelib",
                                        read_free_rpl_names, nullptr};
#endif

/* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
static const char *info_log_level_names[] = {"debug_level", "info_level",
                                             "warn_level",  "error_level",
                                             "fatal_level", NullS};

static TYPELIB info_log_level_typelib = {
    array_elements(info_log_level_names) - 1, "info_log_level_typelib",
    info_log_level_names, nullptr};
785 
rocksdb_set_rocksdb_info_log_level(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)786 static void rocksdb_set_rocksdb_info_log_level(
787     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
788     const void *const save) {
789   DBUG_ASSERT(save != nullptr);
790 
791   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
792   rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
793   rocksdb_db_options->info_log->SetInfoLogLevel(
794       static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
795   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
796 }
797 
rocksdb_set_rocksdb_stats_level(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)798 static void rocksdb_set_rocksdb_stats_level(THD *const thd,
799                                             struct st_mysql_sys_var *const var,
800                                             void *const var_ptr,
801                                             const void *const save) {
802   DBUG_ASSERT(save != nullptr);
803 
804   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
805   rocksdb_db_options->statistics->set_stats_level(
806       static_cast<rocksdb::StatsLevel>(
807           *static_cast<const uint64_t *>(save)));
808   // Actual stats level is defined at rocksdb dbopt::statistics::stats_level_
809   // so adjusting rocksdb_stats_level here to make sure it points to
810   // the correct stats level.
811   rocksdb_stats_level = rocksdb_db_options->statistics->get_stats_level();
812   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
813 }
814 
/*
  Update handler for SET GLOBAL rocksdb_reset_stats. When the new value is
  true, resets both the DB-internal stats and the Statistics object.
  NOTE(review): the var_ptr write below appears to feed the
  rocksdb_reset_stats global tested right after it (statement order matters)
  -- confirm against the sysvar declaration before reordering.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
  DBUG_ASSERT(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // Publish the new value before acting on it.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    DBUG_ASSERT(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    DBUG_ASSERT(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
840 
rocksdb_set_io_write_timeout(my_core::THD * const thd MY_ATTRIBUTE ((__unused__)),my_core::st_mysql_sys_var * const var MY_ATTRIBUTE ((__unused__)),void * const var_ptr MY_ATTRIBUTE ((__unused__)),const void * const save)841 static void rocksdb_set_io_write_timeout(
842     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
843     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
844     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
845   DBUG_ASSERT(save != nullptr);
846   DBUG_ASSERT(rdb != nullptr);
847 #if !defined(_WIN32) && !defined(__APPLE__)
848   DBUG_ASSERT(io_watchdog != nullptr);
849 #endif
850 
851   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
852 
853   const uint32_t new_val = *static_cast<const uint32_t *>(save);
854 
855   rocksdb_io_write_timeout_secs = new_val;
856 #if !defined(_WIN32) && !defined(__APPLE__)
857   io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
858 #endif
859   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
860 }
861 
/*
  Legal values for the rocksdb_flush_log_at_trx_commit variable.
  FLUSH_LOG_MAX is a sentinel for range checks, not a settable value.
*/
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,
  FLUSH_LOG_SYNC,
  FLUSH_LOG_BACKGROUND,
  FLUSH_LOG_MAX /* must be last */
};
868 
rocksdb_validate_flush_log_at_trx_commit(THD * const thd,struct st_mysql_sys_var * const var,void * var_ptr,struct st_mysql_value * const value)869 static int rocksdb_validate_flush_log_at_trx_commit(
870     THD *const thd,
871     struct st_mysql_sys_var *const var, /* in: pointer to system variable */
872     void *var_ptr, /* out: immediate result for update function */
873     struct st_mysql_value *const value /* in: incoming value */) {
874   long long new_value;
875 
876   /* value is NULL */
877   if (value->val_int(value, &new_value)) {
878     return HA_EXIT_FAILURE;
879   }
880 
881   if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) {
882     return HA_EXIT_FAILURE;
883   }
884 
885   *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value);
886   return HA_EXIT_SUCCESS;
887 }
rocksdb_compact_column_family_stub(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)888 static void rocksdb_compact_column_family_stub(
889     THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
890     const void *const save) {}
891 
// Forward declaration; the definition appears later in this file.
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value);

// Index lookup strategies for BlockBasedTableOptions::index_type.
static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};

static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
                                     "index_type_typelib", index_type_names,
                                     nullptr};
902 
/* Bounds and defaults for the tuning variables defined below. */
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;  // 2^30
const ulong RDB_DEFAULT_MAX_ROW_LOCKS = 1024 * 1024;         // 2^20
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;          // 2^30
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;     // 2^30
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;  // 64 MB
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;                   // bytes
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024;  // 1 GB
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;          // bytes
const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;   // ms; 0 = none
const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;  // 512 MB
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;                 // 1 KB
const int RDB_MAX_CHECKSUMS_PCT = 100;
const ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
918 
/*
  Session-scoped (MYSQL_THDVAR_*) and global (MYSQL_SYSVAR_*) variable
  definitions. The inline /*default*/ /*min*/ /*max*/ comments label the
  numeric macro arguments.
*/

// TODO: 0 means don't wait at all, and we don't support it yet?
static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
                          "Number of seconds to wait for lock", nullptr,
                          nullptr, /*default*/ 1, /*min*/ 1,
                          /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);

static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
                         "Enables deadlock detection", nullptr, nullptr, FALSE);

static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG,
                          "Number of transactions deadlock detection will "
                          "traverse through before assuming deadlock",
                          nullptr, nullptr,
                          /*default*/ RDB_DEADLOCK_DETECT_DEPTH,
                          /*min*/ 2,
                          /*max*/ ULONG_MAX, 0);

static MYSQL_THDVAR_BOOL(
    commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG,
    "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr,
    nullptr, TRUE);

static MYSQL_THDVAR_BOOL(
    trace_sst_api, PLUGIN_VAR_RQCMDARG,
    "Generate trace output in the log for each call to the SstFileWriter",
    nullptr, nullptr, FALSE);

// Guarded by a check function so it cannot be toggled mid-bulk-load.
static MYSQL_THDVAR_BOOL(
    bulk_load, PLUGIN_VAR_RQCMDARG,
    "Use bulk-load mode for inserts. This disables "
    "unique_checks and enables rocksdb_commit_in_the_middle.",
    rocksdb_check_bulk_load, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG,
                         "Allow bulk loading of sk keys during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         /* Intentionally reuse unsorted's check function */
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         FALSE);

static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG,
                         "Allow unsorted input during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         FALSE);

// PLUGIN_VAR_READONLY: settable only at server startup.
static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables using SstFileWriter for bulk loading",
                         nullptr, nullptr, rocksdb_enable_bulk_load_api);

static MYSQL_SYSVAR_STR(git_hash, rocksdb_git_hash,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "Git revision of the RocksDB library used by MyRocks",
                        nullptr, nullptr, ROCKSDB_GIT_HASH);

static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Directory for temporary files during DDL operations.",
                        nullptr, nullptr, "");

// ".*" matches every table name, i.e. skip unique checks everywhere.
#define DEFAULT_SKIP_UNIQUE_CHECK_TABLES ".*"
static MYSQL_THDVAR_STR(
    skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Skip unique constraint checking for the specified tables", nullptr,
    nullptr, DEFAULT_SKIP_UNIQUE_CHECK_TABLES);

static MYSQL_THDVAR_BOOL(
    commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
    "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
    "update and delete",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    blind_delete_primary_key, PLUGIN_VAR_RQCMDARG,
    "Deleting rows by primary key lookup, without reading rows (Blind Deletes)."
    " Blind delete is disabled if the table has secondary key",
    nullptr, nullptr, FALSE);
996 
/*
  Read-free replication support, compiled out in MariaDB (see MARIAROCKS
  comment below). Kept for reference against upstream MyRocks.
*/
#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported

static const char *DEFAULT_READ_FREE_RPL_TABLES = ".*";

// Validates the table-name regex list; on success stores a heap copy of the
// accepted pattern string into *save for the update function.
static int rocksdb_validate_read_free_rpl_tables(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
    struct st_mysql_value *value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int length = sizeof(buff);
  const char *wlist_buf = value->val_str(value, buff, &length);
  // A NULL incoming value falls back to the default pattern list.
  const auto wlist = wlist_buf ? wlist_buf : DEFAULT_READ_FREE_RPL_TABLES;

#if defined(HAVE_PSI_INTERFACE)
  Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables);
#else
  Regex_list_handler regex_handler;
#endif

  if (!regex_handler.set_patterns(wlist)) {
    warn_about_bad_patterns(&regex_handler, "rocksdb_read_free_rpl_tables");
    return HA_EXIT_FAILURE;
  }

  *static_cast<const char **>(save) = my_strdup(wlist, MYF(MY_WME));
  return HA_EXIT_SUCCESS;
}

// Applies a validated pattern list: updates the global regex handler and
// re-evaluates the read-free flag on every known table definition.
static void rocksdb_update_read_free_rpl_tables(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr,
    const void *save) {
  const auto wlist = *static_cast<const char *const *>(save);
  DBUG_ASSERT(wlist != nullptr);

  // This is bound to succeed since we've already checked for bad patterns in
  // rocksdb_validate_read_free_rpl_tables
  rdb_read_free_regex_handler.set_patterns(wlist);

  // update all table defs
  struct Rdb_read_free_rpl_updater : public Rdb_tables_scanner {
    int add_table(Rdb_tbl_def *tdef) override {
      tdef->check_and_set_read_free_rpl_table();
      return HA_EXIT_SUCCESS;
    }
  } updater;
  ddl_manager.scan_for_tables(&updater);

  if (wlist == DEFAULT_READ_FREE_RPL_TABLES) {
    // If running SET var = DEFAULT, then rocksdb_validate_read_free_rpl_tables
    // isn't called, and memory is never allocated for the value. Allocate it
    // here.
    *static_cast<const char **>(var_ptr) = my_strdup(wlist, MYF(MY_WME));
  } else {
    // Otherwise, we just reuse the value allocated from
    // rocksdb_validate_read_free_rpl_tables.
    *static_cast<const char **>(var_ptr) = wlist;
  }
}

static MYSQL_SYSVAR_STR(
    read_free_rpl_tables, rocksdb_read_free_rpl_tables,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC /*| PLUGIN_VAR_ALLOCATED*/,
    "List of tables that will use read-free replication on the slave "
    "(i.e. not lookup a row during replication)",
    rocksdb_validate_read_free_rpl_tables, rocksdb_update_read_free_rpl_tables,
    DEFAULT_READ_FREE_RPL_TABLES);

static MYSQL_SYSVAR_ENUM(
    read_free_rpl, rocksdb_read_free_rpl,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Use read-free replication on the slave (i.e. no row lookup during "
    "replication). Default is OFF, PK_SK will enable it on all tables with "
    "primary key. PK_ONLY will enable it on tables where the only key is the "
    "primary key (i.e. no secondary keys).",
    nullptr, nullptr, read_free_rpl_type::OFF, &read_free_rpl_typelib);
#endif
1074 
/* Per-session variables controlling reads, locking, and online DDL. */

static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
                         "Skip using bloom filter for reads", nullptr, nullptr,
                         FALSE);

static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have",
                          nullptr, nullptr,
                          /*default*/ RDB_DEFAULT_MAX_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_ULONGLONG(
    write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
    "Maximum size of write batch in bytes. 0 means no limit.", nullptr, nullptr,
    /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated", nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode", nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation.",
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of "
    "external sort during fast index creation.",
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
    "Fast index creation creates a large tmp file on disk during index "
    "creation.  Removing this large file all at once when index creation is "
    "complete can cause trim stalls on Flash.  This variable specifies a "
    "duration to sleep (in milliseconds) between calling chsize() to truncate "
    "the file in chunks.  The chunk size is  the same as merge_buf_size.",
    nullptr, nullptr,
    /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_INT(
    manual_compaction_threads, PLUGIN_VAR_RQCMDARG,
    "How many rocksdb threads to run for manual compactions", nullptr, nullptr,
    /* default rocksdb.dboption max_subcompactions */ 0,
    /* min */ 0, /* max */ 128, 0);
1139 
/*
  Global variables mapped directly onto rocksdb::DBOptions fields.
  The *reinterpret_cast<my_bool *> aliasing lets the plugin framework write
  straight into the bool member of the live DBOptions object -- it assumes
  my_bool and bool have identical size/representation (TODO confirm per
  platform).
*/

static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_if_missing);

static MYSQL_SYSVAR_BOOL(
    two_write_queues,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::two_write_queues for RocksDB", nullptr, nullptr,
    rocksdb_db_options->two_write_queues);

static MYSQL_SYSVAR_BOOL(
    manual_wal_flush,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr,
    rocksdb_db_options->manual_wal_flush);

static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::write_policy for RocksDB", nullptr,
                         nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED,
                         &write_policy_typelib);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
    rocksdb_db_options->error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
    rocksdb_db_options->paranoid_checks);

static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ULONGLONG(
    sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB", nullptr,
    rocksdb_set_sst_mgr_rate_bytes_per_sec,
    /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
    /* min */ 0L, /* max */ UINT64_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
                              PLUGIN_VAR_RQCMDARG,
                              "DBOptions::delayed_write_rate", nullptr,
                              rocksdb_set_delayed_write_rate,
                              rocksdb_db_options->delayed_write_rate, 0,
                              UINT64_MAX, 0);

static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
                         PLUGIN_VAR_RQCMDARG,
                         "Maximum number of recent "
                         "deadlocks to store",
                         nullptr, rocksdb_set_max_latest_deadlocks,
                         rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
1216 
1217 static MYSQL_SYSVAR_ENUM(
1218     info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
1219     "Filter level for info logs to be written mysqld error log. "
1220     "Valid values include 'debug_level', 'info_level', 'warn_level'"
1221     "'error_level' and 'fatal_level'.",
1222     nullptr, rocksdb_set_rocksdb_info_log_level,
1223     rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);
1224 
1225 static MYSQL_THDVAR_INT(
1226     perf_context_level, PLUGIN_VAR_RQCMDARG,
1227     "Perf Context Level for rocksdb internal timer stat collection", nullptr,
1228     nullptr,
1229     /* default */ rocksdb::PerfLevel::kUninitialized,
1230     /* min */ rocksdb::PerfLevel::kUninitialized,
1231     /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);
1232 
1233 static MYSQL_SYSVAR_UINT(
1234     wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
1235     "DBOptions::wal_recovery_mode for RocksDB. Default is kAbsoluteConsistency",
1236     nullptr, nullptr,
1237     /* default */ (uint)rocksdb::WALRecoveryMode::kAbsoluteConsistency,
1238     /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
1239     /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);
1240 
1241 static MYSQL_SYSVAR_UINT(
1242     stats_level, rocksdb_stats_level, PLUGIN_VAR_RQCMDARG,
1243     "Statistics Level for RocksDB. Default is 0 (kExceptHistogramOrTimers)",
1244     nullptr, rocksdb_set_rocksdb_stats_level,
1245     /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
1246     /* min */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
1247     /* max */ (uint)rocksdb::StatsLevel::kAll, 0);
1248 
1249 static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size,
1250                           rocksdb_db_options->compaction_readahead_size,
1251                           PLUGIN_VAR_RQCMDARG,
1252                           "DBOptions::compaction_readahead_size for RocksDB",
1253                           nullptr, nullptr,
1254                           rocksdb_db_options->compaction_readahead_size,
1255                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1256 
1257 static MYSQL_SYSVAR_BOOL(
1258     new_table_reader_for_compaction_inputs,
1259     *reinterpret_cast<my_bool *>(
1260         &rocksdb_db_options->new_table_reader_for_compaction_inputs),
1261     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1262     "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
1263     nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);
1264 
1265 static MYSQL_SYSVAR_UINT(
1266     access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
1267     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1268     "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
1269     /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
1270     /* min */ (uint)rocksdb::Options::AccessHint::NONE,
1271     /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);
1272 
1273 static MYSQL_SYSVAR_BOOL(
1274     allow_concurrent_memtable_write,
1275     *reinterpret_cast<my_bool *>(
1276         &rocksdb_db_options->allow_concurrent_memtable_write),
1277     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1278     "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
1279     false);
1280 
1281 static MYSQL_SYSVAR_BOOL(
1282     enable_write_thread_adaptive_yield,
1283     *reinterpret_cast<my_bool *>(
1284         &rocksdb_db_options->enable_write_thread_adaptive_yield),
1285     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1286     "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
1287     nullptr, false);
1288 
1289 static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
1290                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1291                         "DBOptions::max_open_files for RocksDB", nullptr,
1292                         nullptr, rocksdb_db_options->max_open_files,
1293                         /* min */ -2, /* max */ INT_MAX, 0);
1294 
1295 static MYSQL_SYSVAR_UINT64_T(max_total_wal_size,
1296                           rocksdb_db_options->max_total_wal_size,
1297                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1298                           "DBOptions::max_total_wal_size for RocksDB", nullptr,
1299                           nullptr, rocksdb_db_options->max_total_wal_size,
1300                           /* min */ 0, /* max */ LONGLONG_MAX, 0);
1301 
1302 static MYSQL_SYSVAR_BOOL(
1303     use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
1304     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1305     "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
1306     rocksdb_db_options->use_fsync);
1307 
1308 static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
1309                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1310                         "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
1311                         rocksdb_db_options->wal_dir.c_str());
1312 
1313 static MYSQL_SYSVAR_STR(
1314     persistent_cache_path, rocksdb_persistent_cache_path,
1315     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1316     "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
1317     nullptr, "");
1318 
1319 static MYSQL_SYSVAR_ULONG(
1320     persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
1321     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1322     "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
1323     "for RocksDB",
1324     nullptr, nullptr, rocksdb_persistent_cache_size_mb,
1325     /* min */ 0L, /* max */ ULONG_MAX, 0);
1326 
1327 static MYSQL_SYSVAR_UINT64_T(
1328     delete_obsolete_files_period_micros,
1329     rocksdb_db_options->delete_obsolete_files_period_micros,
1330     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1331     "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
1332     nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
1333   /* min */ 0, /* max */ LONGLONG_MAX, 0);
1334 
1335 static MYSQL_SYSVAR_INT(max_background_jobs,
1336                         rocksdb_db_options->max_background_jobs,
1337                         PLUGIN_VAR_RQCMDARG,
1338                         "DBOptions::max_background_jobs for RocksDB", nullptr,
1339                         rocksdb_set_max_background_jobs,
1340                         rocksdb_db_options->max_background_jobs,
1341                         /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);
1342 
1343 static MYSQL_SYSVAR_UINT(max_subcompactions,
1344                          rocksdb_db_options->max_subcompactions,
1345                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1346                          "DBOptions::max_subcompactions for RocksDB", nullptr,
1347                          nullptr, rocksdb_db_options->max_subcompactions,
1348                          /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);
1349 
1350 static MYSQL_SYSVAR_SIZE_T(max_log_file_size,
1351                           rocksdb_db_options->max_log_file_size,
1352                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1353                           "DBOptions::max_log_file_size for RocksDB", nullptr,
1354                           nullptr, rocksdb_db_options->max_log_file_size,
1355                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1356 
1357 static MYSQL_SYSVAR_SIZE_T(log_file_time_to_roll,
1358                           rocksdb_db_options->log_file_time_to_roll,
1359                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1360                           "DBOptions::log_file_time_to_roll for RocksDB",
1361                           nullptr, nullptr,
1362                           rocksdb_db_options->log_file_time_to_roll,
1363                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1364 
// --- Read-only pass-throughs to RocksDB DBOptions (log / manifest / WAL
// sizing).  Each sysvar stores directly into the corresponding
// rocksdb_db_options field, and the default presented to MySQL is whatever
// value that field already holds (RocksDB's built-in default).
// PLUGIN_VAR_READONLY plus a null update callback means the value is fixed
// at server startup.
static MYSQL_SYSVAR_SIZE_T(keep_log_file_num,
                          rocksdb_db_options->keep_log_file_num,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::keep_log_file_num for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->keep_log_file_num,
                          /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_manifest_file_size,
                          rocksdb_db_options->max_manifest_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_manifest_file_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->max_manifest_file_size,
                          /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options->table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options->table_cache_numshardbits,
                        // LRUCache limits this to 19 bits, anything greater
                        // fails to create a cache and returns a nullptr
                        /* min */ 0, /* max */ 19, 0);

// Note: max is LONGLONG_MAX (not ULONGLONG_MAX) for the two WAL settings
// below, halving the nominal uint64 range — presumably to stay within what
// the option parser/storage handles safely; confirm before widening.
static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->WAL_ttl_seconds,
                          /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_size_limit_mb,
                          rocksdb_db_options->WAL_size_limit_MB,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->WAL_size_limit_MB,
                          /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(manifest_preallocation_size,
                          rocksdb_db_options->manifest_preallocation_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::manifest_preallocation_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->manifest_preallocation_size,
                          /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1410 
// --- Boolean DBOptions pass-throughs (read-only, fixed at startup) ---
// MYSQL_SYSVAR_BOOL requires a my_bool lvalue, while the RocksDB options
// struct holds C++ bool; the reinterpret_cast lets the sysvar write the
// options field in place.  This assumes sizeof(my_bool) == sizeof(bool),
// a long-standing convention throughout this file.
static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_io_for_flush_and_compaction,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_io_for_flush_and_compaction),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_io_for_flush_and_compaction);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
    rocksdb_db_options->is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options->stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB",
                         nullptr, nullptr,
                         rocksdb_db_options->stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
    rocksdb_db_options->advise_random_on_open);
1460 
static MYSQL_SYSVAR_SIZE_T(db_write_buffer_size,
                          rocksdb_db_options->db_write_buffer_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::db_write_buffer_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->db_write_buffer_size,
                          /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_adaptive_mutex);

// The two *_bytes_per_sync variables below are writable at runtime (no
// PLUGIN_VAR_READONLY); their update callbacks propagate the new value to
// the running DB instance.
static MYSQL_SYSVAR_UINT64_T(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::bytes_per_sync for RocksDB", nullptr,
                          rocksdb_set_bytes_per_sync,
                          rocksdb_db_options->bytes_per_sync,
                          /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_bytes_per_sync,
                          rocksdb_db_options->wal_bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
                          rocksdb_set_wal_bytes_per_sync,
                          rocksdb_db_options->wal_bytes_per_sync,
                          /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

// NOTE: unlike its neighbours, the default here is hard-coded to true
// rather than taken from rocksdb_db_options->enable_thread_tracking.
static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true);

// Dynamic: validated and applied by rocksdb_validate_set_block_cache_size
// (check callback).  The final macro argument is the sysvar "blocksize",
// so assigned values are rounded to a multiple of RDB_MIN_BLOCK_CACHE_SIZE.
static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG,
                             "block_cache size for RocksDB",
                             rocksdb_validate_set_block_cache_size, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);
1505 
// Simulated-cache size; 0 (the default) disables the simulated cache.
static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "Simulated cache size for RocksDB", nullptr,
                             nullptr,
                             /* default */ 0,
                             /* min */ 0,
                             /* max */ LLONG_MAX,
                             /* Block size */ 0);

static MYSQL_SYSVAR_BOOL(
    use_clock_cache, rocksdb_use_clock_cache,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Use ClockCache instead of default LRUCache for RocksDB", nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(cache_dump, rocksdb_cache_dump,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Include RocksDB block cache content in core dump.",
                         nullptr, nullptr, true);

// Fraction (0.0 .. 1.0) of the block cache reserved for the high-pri pool.
static MYSQL_SYSVAR_DOUBLE(cache_high_pri_pool_ratio,
                           rocksdb_cache_high_pri_pool_ratio,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "Specify the size of block cache high-pri pool",
                           nullptr, nullptr, /* default */ 0.0, /* min */ 0.0,
                           /* max */ 1.0, 0);

// The next two sysvars write into the BlockBasedTableOptions struct and
// both force a default of true (overriding the struct's initial value).
static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
    nullptr, nullptr, true);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_with_high_priority,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks_with_high_priority),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "cache_index_and_filter_blocks_with_high_priority for RocksDB", nullptr,
    nullptr, true);
1548 
// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index blocks' handles checked
// out (i.e. it won't call ShardedLRUCache::Release), along with the parsed-out
// objects; the LRU cache will never flush them out, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
// Default is forced to true here (not taken from rocksdb_tbl_options).
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);
1564 
// --- BlockBasedTableOptions pass-throughs (read-only, fixed at startup) ---
// Enum sysvar: the allowed names come from index_type_typelib; the stored
// value is the ordinal cast from the RocksDB enum.
static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB",
                         nullptr, nullptr,
                         (ulong)rocksdb_tbl_options->index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->no_block_cache);

// Minimum is 1 (a zero block size would be meaningless).
static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "BlockBasedTableOptions::block_size for RocksDB",
                          nullptr, nullptr, rocksdb_tbl_options->block_size,
                          /* min */ 1L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options->block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options->block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->whole_key_filtering);
1613 
// --- Column-family option strings ---
// default_cf_options / override_cf_options are read-only startup strings.
static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");

// update_cf_options is the only writable one of the three: input is checked
// by rocksdb_validate_update_cf_options and applied by
// rocksdb_set_update_cf_options.  Default is nullptr (no pending updates);
// PLUGIN_VAR_MEMALLOC makes the server own/free the string storage.
static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC
                        /* psergey-merge: need this? :  PLUGIN_VAR_ALLOCATED*/,
                        "Option updates per column family for RocksDB",
                        rocksdb_validate_update_cf_options,
                        rocksdb_set_update_cf_options, nullptr);
1629 
// Durability knob, analogous to innodb_flush_log_at_trx_commit; range is
// bounded by the FLUSH_LOG_* constants and input is validated by
// rocksdb_validate_flush_log_at_trx_commit.
static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
                         rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
                         "Sync on transaction commit. Similar to "
                         "innodb_flush_log_at_trx_commit. 1: sync on commit, "
                         "0,2: not sync on commit",
                         rocksdb_validate_flush_log_at_trx_commit, nullptr,
                         /* default */ FLUSH_LOG_SYNC,
                         /* min */ FLUSH_LOG_NEVER,
                         /* max */ FLUSH_LOG_BACKGROUND, 0);

// --- Session-scoped (THDVAR) settings; defaults for the write options come
// from a default-constructed rocksdb::WriteOptions.
static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB", nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests", nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency",
    nullptr, nullptr, FALSE);

// 0 (default) means "do not override"; any positive value replaces the
// engine's records_in_range() estimate.
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used.",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);
1669 
// Test-only override; PLUGIN_VAR_NOSYSVAR keeps it out of SHOW VARIABLES.
static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable",
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
                         rocksdb_force_compute_memtable_stats,
                         PLUGIN_VAR_RQCMDARG,
                         "Force to always compute memtable stats", nullptr,
                         nullptr, TRUE);

// Default cache lifetime: 60 seconds, expressed in microseconds.
static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime,
                         rocksdb_force_compute_memtable_stats_cachetime,
                         PLUGIN_VAR_RQCMDARG,
                         "Time in usecs to cache memtable estimates", nullptr,
                         nullptr, /* default */ 60 * 1000 * 1000,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value", nullptr,
    nullptr, TRUE);

// --- "Command" string sysvars: assigning a value triggers the check
// callback, which performs the action; the *_stub update callbacks suggest
// the assigned value itself is not retained (see the callback definitions).
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family",
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(delete_cf, rocksdb_delete_cf_name, PLUGIN_VAR_RQCMDARG,
                        "Delete column family", rocksdb_delete_column_family,
                        rocksdb_delete_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(remove_mariabackup_checkpoint,
                         rocksdb_signal_remove_mariabackup_checkpoint,
                         PLUGIN_VAR_RQCMDARG, "Remove mariabackup checkpoint",
                         nullptr, rocksdb_remove_mariabackup_checkpoint, FALSE);

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread", nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations", nullptr,
                         rocksdb_set_pause_background_work, FALSE);

static MYSQL_SYSVAR_BOOL(
    enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
    "Enable expired TTL records to be dropped during compaction.", nullptr,
    nullptr, TRUE);
1728 
static MYSQL_SYSVAR_BOOL(
    enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
    PLUGIN_VAR_RQCMDARG,
    "For tables with TTL, expired records are skipped/filtered out during "
    "processing and in query results. Disabling this will allow these records "
    "to be seen, but as a result rows may disappear in the middle of "
    "transactions as they are dropped during compaction. Use with caution.",
    nullptr, nullptr, TRUE);

// --- Debug-build-only TTL clock-skew knobs; all clamped to +/- one hour.
static MYSQL_SYSVAR_INT(
    debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only.  Overrides the TTL of records to "
    "now() + debug_ttl_rec_ts.  The value can be +/- to simulate "
    "a record inserted in the past vs a record inserted in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only.  Sets the snapshot during compaction to "
    "now() + debug_set_ttl_snapshot_ts.  The value can be +/- to simulate "
    "a snapshot in the past vs a snapshot created in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only.  Overrides the TTL read filtering time to "
    "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
    "is not set. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_BOOL(
    debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. If true, compaction filtering will not occur "
    "on PK TTL data. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_UINT(
    max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG,
    "Maximum number of pending + ongoing number of manual compactions.",
    nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    rollback_on_timeout, rocksdb_rollback_on_timeout, PLUGIN_VAR_OPCMDARG,
    "Whether to roll back the complete transaction or a single statement on "
    "lock wait timeout (a single statement by default)",
    NULL, NULL, FALSE);

static MYSQL_SYSVAR_UINT(
    debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sleeping specified seconds "
    "for simulating long running compactions.",
    nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0);

// Command-style boolean: setting it fires rocksdb_set_reset_stats.
static MYSQL_SYSVAR_BOOL(
    reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
    "Reset the RocksDB internal statistics without restarting the DB.", nullptr,
    rocksdb_set_reset_stats, FALSE);

// 0 (default) disables the I/O watchdog.
static MYSQL_SYSVAR_UINT(io_write_timeout, rocksdb_io_write_timeout_secs,
                         PLUGIN_VAR_RQCMDARG,
                         "Timeout for experimental I/O watchdog.", nullptr,
                         rocksdb_set_io_write_timeout, /* default */ 0,
                         /* min */ 0L,
                         /* max */ UINT_MAX, 0);
1799 
static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks", nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Enable ignoring unknown options passed to RocksDB",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes",
                         nullptr, nullptr, TRUE);

// Dynamic regex list; the update callback re-parses the exception list.
static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "List of tables (using regex) that are excluded "
                        "from the case sensitive collation enforcement",
                        nullptr, rocksdb_set_collation_exception_list, "");

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush",
                         nullptr, nullptr, rocksdb_collect_sst_properties);

// --- Command-style booleans: the check callback performs the flush, the
// *_stub update callback then resets/ignores the stored value.
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful",
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
    "Acts similar to force_flush_memtable_now, but also compacts all L0 files.",
    rocksdb_force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_stub, FALSE);

static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed.",
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);
1847 
// --- Sequential-delete driven compaction knobs; updates are applied by
// the shared rocksdb_set_compaction_options callback.
static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window",
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes",
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

// Minimum is -1, i.e. a sentinel beyond "no minimum file size" (0).
static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes", nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log", nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

// --- Session-scoped row checksum / locking knobs.
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed",
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG,
                         "Skipping holding any lock on row access. "
                         "Not effective on slave.",
                         nullptr, nullptr, false);
1903 
1904 static MYSQL_SYSVAR_UINT(
1905     validate_tables, rocksdb_validate_tables,
1906     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1907     "Verify all .frm files match all RocksDB tables (0 means no verification, "
1908     "1 means verify and fail on error, and 2 means verify but continue",
1909     nullptr, nullptr, 1 /* default value */, 0 /* min value */,
1910     2 /* max value */, 0);
1911 
// Repair-mode escape hatch: 0 (default) = fail on data-dictionary errors,
// 1 = ignore them so the server can start for manual repair.
static MYSQL_SYSVAR_UINT(
    ignore_datadic_errors, rocksdb_ignore_datadic_errors,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Ignore MyRocks' data directory errors. "
    "(CAUTION: Use only to start the server and perform repairs. Do NOT use "
    "for regular operation)",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    1 /* max value */, 0);

static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./#rocksdb");

// Informational only (PLUGIN_VAR_NOCMDOPT): exposes the value computed into
// compression_types_val; both the storage and the default are that buffer.
static MYSQL_SYSVAR_STR(supported_compression_types,
  compression_types_val,
  PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
  "Compression algorithms supported by RocksDB",
  nullptr, nullptr,
  compression_types_val);
1932 
1933 static MYSQL_SYSVAR_UINT(
1934     table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
1935     PLUGIN_VAR_RQCMDARG,
1936     "Percentage of entries to sample when collecting statistics about table "
1937     "properties. Specify either 0 to sample everything or percentage "
1938     "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG(
1939         RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
1940                                       "By default " STRINGIFY_ARG(
1941                                           RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% "
1942                                                                             "of"
1943                                                                             " e"
1944                                                                             "nt"
1945                                                                             "ri"
1946                                                                             "es"
1947                                                                             " a"
1948                                                                             "re"
1949                                                                             " "
1950                                                                             "sa"
1951                                                                             "mp"
1952                                                                             "le"
1953                                                                             "d"
1954                                                                             ".",
1955     nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
1956     RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
1957     /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);
1958 
1959 static MYSQL_SYSVAR_UINT(
1960     stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG,
1961     "The number of indexes per second to recalculate statistics for. 0 to "
1962     "disable background recalculation.",
1963     nullptr, nullptr, 0 /* default value */, 0 /* min value */,
1964     UINT_MAX /* max value */, 0);
1965 
1966 static MYSQL_SYSVAR_BOOL(
1967     large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
1968     "Support large index prefix length of 3072 bytes. If off, the maximum "
1969     "index prefix length is 767.",
1970     nullptr, nullptr, FALSE);
1971 
1972 static MYSQL_SYSVAR_BOOL(
1973     allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption,
1974     PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
1975     "Allow server still to start successfully even if RocksDB corruption is "
1976     "detected.",
1977     nullptr, nullptr, FALSE);
1978 
/* Read-only startup switch: escalate the "sub-optimal collation" warning to a
   hard error. Defaults to ON. */
static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation,
                         rocksdb_error_on_suboptimal_collation,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Raise an error instead of warning if a sub-optimal "
                         "collation is used",
                         nullptr, nullptr, TRUE);
1985 
/* Optimization toggle for INSERT ... ON DUPLICATE KEY UPDATE: cache the row
   read by the failed insert so the following update can reuse it. */
static MYSQL_SYSVAR_BOOL(
    enable_insert_with_update_caching,
    rocksdb_enable_insert_with_update_caching, PLUGIN_VAR_OPCMDARG,
    "Whether to enable optimization where we cache the read from a failed "
    "insertion attempt in INSERT ON DUPLICATE KEY UPDATE",
    nullptr, nullptr, TRUE);
1992 
/* NOTE(review): assumed average on-disk size (bytes) of one key/value pair;
   presumably used for size/cost estimates — confirm against its uses. */
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
1994 
1995 static struct st_mysql_sys_var *rocksdb_system_variables[] = {
1996     MYSQL_SYSVAR(lock_wait_timeout),
1997     MYSQL_SYSVAR(deadlock_detect),
1998     MYSQL_SYSVAR(deadlock_detect_depth),
1999     MYSQL_SYSVAR(commit_time_batch_for_recovery),
2000     MYSQL_SYSVAR(max_row_locks),
2001     MYSQL_SYSVAR(write_batch_max_bytes),
2002     MYSQL_SYSVAR(lock_scanned_rows),
2003     MYSQL_SYSVAR(bulk_load),
2004     MYSQL_SYSVAR(bulk_load_allow_sk),
2005     MYSQL_SYSVAR(bulk_load_allow_unsorted),
2006     MYSQL_SYSVAR(skip_unique_check_tables),
2007     MYSQL_SYSVAR(trace_sst_api),
2008     MYSQL_SYSVAR(commit_in_the_middle),
2009     MYSQL_SYSVAR(blind_delete_primary_key),
2010 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
2011     MYSQL_SYSVAR(read_free_rpl_tables),
2012     MYSQL_SYSVAR(read_free_rpl),
2013 #endif
2014     MYSQL_SYSVAR(bulk_load_size),
2015     MYSQL_SYSVAR(merge_buf_size),
2016     MYSQL_SYSVAR(enable_bulk_load_api),
2017     MYSQL_SYSVAR(tmpdir),
2018     MYSQL_SYSVAR(merge_combine_read_size),
2019     MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
2020     MYSQL_SYSVAR(skip_bloom_filter_on_read),
2021 
2022     MYSQL_SYSVAR(create_if_missing),
2023     MYSQL_SYSVAR(two_write_queues),
2024     MYSQL_SYSVAR(manual_wal_flush),
2025     MYSQL_SYSVAR(write_policy),
2026     MYSQL_SYSVAR(create_missing_column_families),
2027     MYSQL_SYSVAR(error_if_exists),
2028     MYSQL_SYSVAR(paranoid_checks),
2029     MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
2030     MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
2031     MYSQL_SYSVAR(delayed_write_rate),
2032     MYSQL_SYSVAR(max_latest_deadlocks),
2033     MYSQL_SYSVAR(info_log_level),
2034     MYSQL_SYSVAR(max_open_files),
2035     MYSQL_SYSVAR(max_total_wal_size),
2036     MYSQL_SYSVAR(use_fsync),
2037     MYSQL_SYSVAR(wal_dir),
2038     MYSQL_SYSVAR(persistent_cache_path),
2039     MYSQL_SYSVAR(persistent_cache_size_mb),
2040     MYSQL_SYSVAR(delete_obsolete_files_period_micros),
2041     MYSQL_SYSVAR(max_background_jobs),
2042     MYSQL_SYSVAR(max_log_file_size),
2043     MYSQL_SYSVAR(max_subcompactions),
2044     MYSQL_SYSVAR(log_file_time_to_roll),
2045     MYSQL_SYSVAR(keep_log_file_num),
2046     MYSQL_SYSVAR(max_manifest_file_size),
2047     MYSQL_SYSVAR(table_cache_numshardbits),
2048     MYSQL_SYSVAR(wal_ttl_seconds),
2049     MYSQL_SYSVAR(wal_size_limit_mb),
2050     MYSQL_SYSVAR(manifest_preallocation_size),
2051     MYSQL_SYSVAR(use_direct_reads),
2052     MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
2053     MYSQL_SYSVAR(allow_mmap_reads),
2054     MYSQL_SYSVAR(allow_mmap_writes),
2055     MYSQL_SYSVAR(is_fd_close_on_exec),
2056     MYSQL_SYSVAR(stats_dump_period_sec),
2057     MYSQL_SYSVAR(advise_random_on_open),
2058     MYSQL_SYSVAR(db_write_buffer_size),
2059     MYSQL_SYSVAR(use_adaptive_mutex),
2060     MYSQL_SYSVAR(bytes_per_sync),
2061     MYSQL_SYSVAR(wal_bytes_per_sync),
2062     MYSQL_SYSVAR(enable_thread_tracking),
2063     MYSQL_SYSVAR(perf_context_level),
2064     MYSQL_SYSVAR(wal_recovery_mode),
2065     MYSQL_SYSVAR(stats_level),
2066     MYSQL_SYSVAR(access_hint_on_compaction_start),
2067     MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
2068     MYSQL_SYSVAR(compaction_readahead_size),
2069     MYSQL_SYSVAR(allow_concurrent_memtable_write),
2070     MYSQL_SYSVAR(enable_write_thread_adaptive_yield),
2071 
2072     MYSQL_SYSVAR(block_cache_size),
2073     MYSQL_SYSVAR(sim_cache_size),
2074     MYSQL_SYSVAR(use_clock_cache),
2075     MYSQL_SYSVAR(cache_high_pri_pool_ratio),
2076     MYSQL_SYSVAR(cache_dump),
2077     MYSQL_SYSVAR(cache_index_and_filter_blocks),
2078     MYSQL_SYSVAR(cache_index_and_filter_with_high_priority),
2079     MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
2080     MYSQL_SYSVAR(index_type),
2081     MYSQL_SYSVAR(hash_index_allow_collision),
2082     MYSQL_SYSVAR(no_block_cache),
2083     MYSQL_SYSVAR(block_size),
2084     MYSQL_SYSVAR(block_size_deviation),
2085     MYSQL_SYSVAR(block_restart_interval),
2086     MYSQL_SYSVAR(whole_key_filtering),
2087 
2088     MYSQL_SYSVAR(default_cf_options),
2089     MYSQL_SYSVAR(override_cf_options),
2090     MYSQL_SYSVAR(update_cf_options),
2091 
2092     MYSQL_SYSVAR(flush_log_at_trx_commit),
2093     MYSQL_SYSVAR(write_disable_wal),
2094     MYSQL_SYSVAR(write_ignore_missing_column_families),
2095 
2096     MYSQL_SYSVAR(skip_fill_cache),
2097     MYSQL_SYSVAR(unsafe_for_binlog),
2098 
2099     MYSQL_SYSVAR(records_in_range),
2100     MYSQL_SYSVAR(force_index_records_in_range),
2101     MYSQL_SYSVAR(debug_optimizer_n_rows),
2102     MYSQL_SYSVAR(force_compute_memtable_stats),
2103     MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
2104     MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),
2105 
2106     MYSQL_SYSVAR(compact_cf),
2107     MYSQL_SYSVAR(delete_cf),
2108     MYSQL_SYSVAR(signal_drop_index_thread),
2109     MYSQL_SYSVAR(pause_background_work),
2110     MYSQL_SYSVAR(enable_2pc),
2111     MYSQL_SYSVAR(ignore_unknown_options),
2112     MYSQL_SYSVAR(strict_collation_check),
2113     MYSQL_SYSVAR(strict_collation_exceptions),
2114     MYSQL_SYSVAR(collect_sst_properties),
2115     MYSQL_SYSVAR(force_flush_memtable_now),
2116     MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
2117     MYSQL_SYSVAR(enable_ttl),
2118     MYSQL_SYSVAR(enable_ttl_read_filtering),
2119     MYSQL_SYSVAR(debug_ttl_rec_ts),
2120     MYSQL_SYSVAR(debug_ttl_snapshot_ts),
2121     MYSQL_SYSVAR(debug_ttl_read_filter_ts),
2122     MYSQL_SYSVAR(debug_ttl_ignore_pk),
2123     MYSQL_SYSVAR(reset_stats),
2124     MYSQL_SYSVAR(io_write_timeout),
2125     MYSQL_SYSVAR(seconds_between_stat_computes),
2126 
2127     MYSQL_SYSVAR(compaction_sequential_deletes),
2128     MYSQL_SYSVAR(compaction_sequential_deletes_window),
2129     MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
2130     MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
2131     MYSQL_SYSVAR(print_snapshot_conflict_queries),
2132 
2133     MYSQL_SYSVAR(datadir),
2134   MYSQL_SYSVAR(supported_compression_types),
2135     MYSQL_SYSVAR(create_checkpoint),
2136     MYSQL_SYSVAR(remove_mariabackup_checkpoint),
2137     MYSQL_SYSVAR(checksums_pct),
2138     MYSQL_SYSVAR(store_row_debug_checksums),
2139     MYSQL_SYSVAR(verify_row_debug_checksums),
2140     MYSQL_SYSVAR(master_skip_tx_api),
2141 
2142     MYSQL_SYSVAR(validate_tables),
2143     MYSQL_SYSVAR(table_stats_sampling_pct),
2144 
2145     MYSQL_SYSVAR(large_prefix),
2146     MYSQL_SYSVAR(allow_to_start_after_corruption),
2147     MYSQL_SYSVAR(git_hash),
2148     MYSQL_SYSVAR(error_on_suboptimal_collation),
2149     MYSQL_SYSVAR(stats_recalc_rate),
2150     MYSQL_SYSVAR(debug_manual_compaction_delay),
2151     MYSQL_SYSVAR(max_manual_compactions),
2152     MYSQL_SYSVAR(manual_compaction_threads),
2153     MYSQL_SYSVAR(rollback_on_timeout),
2154 
2155     MYSQL_SYSVAR(enable_insert_with_update_caching),
2156 
2157     MYSQL_SYSVAR(ignore_datadic_errors),
2158     nullptr};
2159 
rdb_get_rocksdb_write_options(my_core::THD * const thd)2160 static rocksdb::WriteOptions rdb_get_rocksdb_write_options(
2161     my_core::THD *const thd) {
2162   rocksdb::WriteOptions opt;
2163 
2164   opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
2165   opt.disableWAL = THDVAR(thd, write_disable_wal);
2166   opt.ignore_missing_column_families =
2167       THDVAR(thd, write_ignore_missing_column_families);
2168 
2169   return opt;
2170 }
2171 
/*
  Update handler for the rocksdb_compact_cf system variable.
  Reads a column family name from 'value'; if the CF exists, schedules a full
  manual compaction of it and blocks, polling every 100ms, until the
  compaction finishes or the client thread is killed.
  @return HA_EXIT_SUCCESS, or HA_EXIT_FAILURE if the request was rejected.
*/
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto cfh = cf_manager.get_cf(cf);
    if (cfh != nullptr && rdb != nullptr) {
      // nullptr begin/end keys => compact the whole key range of the CF.
      int mc_id = rdb_mc_thread.request_manual_compaction(
          cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads));
      if (mc_id == -1) {
        // -1: the manual-compaction queue is full (see the message below).
        my_error(ER_INTERNAL_ERROR, MYF(0),
                 "Can't schedule more manual compactions. "
                 "Increase rocksdb_max_manual_compactions or stop issuing "
                 "more manual compactions.");
        return HA_EXIT_FAILURE;
      } else if (mc_id < 0) {
        // Any other negative id: the request failed for some other reason.
        return HA_EXIT_FAILURE;
      }
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Manual compaction of column family: %s\n",
                            cf);
      // Check the thd state every short cycle (100ms) so this function can
      // exit without waiting for CompactRange to finish.
      do {
        my_sleep(100000);
      } while (!thd->killed &&
               !rdb_mc_thread.is_manual_compaction_finished(mc_id));

      if (thd->killed) {
        // This cancels if requested compaction state is INITED.
        // TODO(yoshinorim): Cancel running compaction as well once
        // it is supported in RocksDB.
        rdb_mc_thread.clear_manual_compaction_request(mc_id, true);
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
2215 
2216 ///////////////////////////////////////////////////////////////////////////////////////////
2217 
2218 /*
2219   Drop index thread's control
2220 */
2221 
// Singleton drop-index background thread; woken up by
// rocksdb_drop_index_wakeup_thread() below.
static Rdb_drop_index_thread rdb_drop_idx_thread;
2223 
rocksdb_drop_index_wakeup_thread(my_core::THD * const thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * const var MY_ATTRIBUTE ((__unused__)),void * const var_ptr MY_ATTRIBUTE ((__unused__)),const void * const save)2224 static void rocksdb_drop_index_wakeup_thread(
2225     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
2226     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
2227     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
2228   if (*static_cast<const bool *>(save)) {
2229     rdb_drop_idx_thread.signal();
2230   }
2231 }
2232 
rocksdb_perf_context_level(THD * const thd)2233 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
2234   DBUG_ASSERT(thd != nullptr);
2235 
2236   const int session_perf_context_level = THDVAR(thd, perf_context_level);
2237   if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2238     return session_perf_context_level;
2239   }
2240 
2241   /*
2242     Fallback to global thdvar, if session specific one was not set to a valid
2243     value.
2244   */
2245 
2246   const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
2247   if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2248     return global_perf_context_level;
2249   }
2250 
2251   return rocksdb::PerfLevel::kDisable;
2252 }
2253 
2254 /*
2255   Very short (functor-like) interface to be passed to
2256   Rdb_transaction::walk_tx_list()
2257 */
2258 
interface Rdb_tx_list_walker {
  virtual ~Rdb_tx_list_walker() {}
  // Invoked once per live transaction by Rdb_transaction::walk_tx_list().
  virtual void process_tran(const Rdb_transaction *const) = 0;
};
2263 
2264 /*
2265   This is a helper class that is passed to RocksDB to get notifications when
2266   a snapshot gets created.
2267 */
2268 
class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
  // Transaction to notify; nullptr after detach().
  Rdb_transaction *m_owning_tx;

  // rocksdb::TransactionNotifier hook, fired when a snapshot is created
  // (implementation elsewhere in this file).
  void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;

 public:
  Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
  Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;

  explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
      : m_owning_tx(owning_tx) {}

  // If the owning Rdb_transaction gets destructed we need to not reference
  // it anymore.
  void detach() { m_owning_tx = nullptr; }
};
2285 
2286 
2287 #ifdef MARIAROCKS_NOT_YET
2288 // ER_LOCK_WAIT_TIMEOUT error also has a reason in facebook/mysql-5.6
2289 #endif
timeout_message(const char * command,const char * name1,const char * name2)2290 String timeout_message(const char *command, const char *name1,
2291                        const char *name2)
2292 {
2293     String msg;
2294     msg.append("Timeout on ");
2295     msg.append(command);
2296     msg.append(": ");
2297     msg.append(name1);
2298     if (name2 && name2[0])
2299     {
2300       msg.append(".");
2301       msg.append(name2);
2302     }
2303     return msg;
2304 }
2305 
2306 
2307 /* This is the base class for transactions when interacting with rocksdb.
2308  */
2309 class Rdb_transaction {
2310  protected:
  // Counters for operations performed in this transaction
  // (see the incr_*_count() / get_*_count() accessors below).
  ulonglong m_write_count = 0;
  ulonglong m_insert_count = 0;
  ulonglong m_update_count = 0;
  ulonglong m_delete_count = 0;
  ulonglong m_lock_count = 0;
  // Pending AUTO_INCREMENT values per index; written to the data dictionary
  // by merge_auto_incr_map().
  std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;

  bool m_is_delayed_snapshot = false;
  bool m_is_two_phase = false;

  // NOTE(review): tables modified by this transaction; presumably consumed
  // at commit time (e.g. for stats) — confirm against uses later in the file.
  std::unordered_set<Rdb_tbl_def*> modified_tables;
2322 
2323  private:
2324   /*
2325     Number of write operations this transaction had when we took the last
2326     savepoint (the idea is not to take another savepoint if we haven't made
2327     any changes)
2328   */
2329   ulonglong m_writes_at_last_savepoint;
2330 
2331  protected:
2332 
2333 protected:
2334   THD *m_thd = nullptr;
2335 
  // Registry of all live transactions, protected by s_tx_list_mutex
  // (see walk_tx_list()).
  static std::multiset<Rdb_transaction *> s_tx_list;
  static mysql_mutex_t s_tx_list_mutex;

  // The io-perf counters currently being aggregated for this transaction;
  // nullptr when no gathering is active (see io_perf_start()).
  Rdb_io_perf *m_tbl_io_perf;

  bool m_tx_read_only = false;

  int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */

  /* Maximum number of locks the transaction can have */
  ulonglong m_max_row_locks;

  bool m_is_tx_failed = false;
  bool m_rollback_only = false;

  // Notifier registered with RocksDB so snapshot_created() gets called.
  std::shared_ptr<Rdb_snapshot_notifier> m_notifier;

  // This should be used only when updating binlog information.
  virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
  virtual bool commit_no_binlog() = 0;
  virtual rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *column_family) = 0;

protected:
  /*
    The following two are helper functions to be overloaded by child classes.
    They should provide RocksDB's savepoint semantics.
  */
  virtual void do_set_savepoint() = 0;
  virtual void do_rollback_to_savepoint() = 0;
2367 
2368   /*
2369     @detail
2370       This function takes in the WriteBatch of the transaction to add
2371       all the AUTO_INCREMENT merges. It does so by iterating through
2372       m_auto_incr_map and then constructing key/value pairs to call merge upon.
2373 
2374     @param wb
2375    */
merge_auto_incr_map(rocksdb::WriteBatchBase * const wb)2376   rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) {
2377     DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", return rocksdb::Status::OK(););
2378 
2379     // Iterate through the merge map merging all keys into data dictionary.
2380     rocksdb::Status s;
2381     for (auto &it : m_auto_incr_map) {
2382       s = dict_manager.put_auto_incr_val(wb, it.first, it.second);
2383       if (!s.ok()) {
2384         return s;
2385       }
2386     }
2387     m_auto_incr_map.clear();
2388     return s;
2389   }
2390 
 public:
  // Read options (snapshot etc.) used for all reads in this transaction.
  rocksdb::ReadOptions m_read_opts;
  // NOTE(review): binlog coordinates; appear to be filled at commit time for
  // backup/restore purposes (see commit() below) — usage is partly disabled
  // under MARIAROCKS_NOT_YET.
  const char *m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
#ifdef MARIAROCKS_NOT_YET
  // TODO: MariaDB probably doesn't need these at all:
  const char *m_mysql_gtid;
  const char *m_mysql_max_gtid;
#endif
  // Extra detail for the last error (filled by set_status_error()).
  String m_detailed_error;
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;
#ifdef MARIAROCKS_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> m_explicit_snapshot;
#endif

  /*
    Tracks the number of tables in use through external_lock.
    This should not be reset during start_tx().
  */
  int64_t m_n_mysql_tables_in_use = 0;

  /*
    MariaDB's group commit:
  */
  bool commit_ordered_done;
  bool commit_ordered_res;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;
2424 
  // Initialize the mutex protecting the global transaction list.
  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  // Destroy the transaction-list mutex; the list must already be empty.
  static void term_mutex() {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }
2433 
walk_tx_list(Rdb_tx_list_walker * walker)2434   static void walk_tx_list(Rdb_tx_list_walker *walker) {
2435     DBUG_ASSERT(walker != nullptr);
2436 
2437     RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
2438 
2439     for (auto it : s_tx_list) {
2440       walker->process_tran(it);
2441     }
2442 
2443     RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
2444   }
2445 
  /*
    Translate a non-OK rocksdb::Status into a handler-level error code,
    updating per-table and global counters and filling m_detailed_error
    where applicable. Timeouts and deadlocks additionally mark the
    statement/transaction for rollback, as the SQL layer expects.

    @return HA_ERR_LOCK_WAIT_TIMEOUT, HA_ERR_LOCK_DEADLOCK,
            HA_ERR_ROCKSDB_STATUS_BUSY, or the generic mapping from
            ha_rocksdb::rdb_error_to_mysql().
  */
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
                       Rdb_table_handler *const table_handler) {
    DBUG_ASSERT(!s.ok());
    DBUG_ASSERT(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because it's
        an IGNORE command"!) but then will fail an assert, because "error code
        was returned, but no error happened".  Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
        */
      my_core::thd_mark_transaction_to_rollback(
          thd, static_cast<bool>(rocksdb_rollback_on_timeout));
      m_detailed_error.copy(timeout_message(
          "index", tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
      table_handler->m_lock_wait_timeout_counter.inc();
      rocksdb_row_lock_wait_timeouts++;

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      // Deadlock: the whole transaction must be rolled back.
      my_core::thd_mark_transaction_to_rollback(thd,
                                                true /* whole transaction */);
      m_detailed_error = String();
      table_handler->m_deadlock_counter.inc();
      rocksdb_row_lock_deadlocks++;
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      // Snapshot conflict: optionally log the offending user/query.
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd, user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning(
            "Got snapshot conflict errors: User: %s "
            "Query: %s",
            user_host_buff, thd->query());
      }
      m_detailed_error = String(" (snapshot conflict)", system_charset_info);
      table_handler->m_deadlock_counter.inc();
      return HA_ERR_ROCKSDB_STATUS_BUSY;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }

    return ha_rocksdb::rdb_error_to_mysql(s);
  }
2500 
  // Owning connection's THD.
  THD *get_thd() const { return m_thd; }

  /* Used for tracking io_perf counters */
  void io_perf_start(Rdb_io_perf *const io_perf) {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that hit
      multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    // Only the first table that starts gathering becomes the aggregation
    // target; later calls are no-ops until it is recorded.
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd))) {
      m_tbl_io_perf = io_perf;
    }
  }
2525 
  // Record and stop the currently-active io-perf gathering, if any.
  void io_perf_end_and_record(void) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
      m_tbl_io_perf = nullptr;
    }
  }

  // Same as above, but only if 'io_perf' is the one currently being tracked.
  void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
    if (m_tbl_io_perf == io_perf) {
      io_perf_end_and_record();
    }
  }

  // Attribute 'bytes_written' to the currently-active io-perf counters.
  void update_bytes_written(ulonglong bytes_written) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd),
                                          bytes_written);
    }
  }

  // Cache the lock-wait timeout and the row-lock limit, and push the timeout
  // down to the underlying transaction via set_lock_timeout().
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }
2551 
  virtual void set_lock_timeout(int timeout_sec_arg) = 0;

  /* Accessors/mutators for the per-transaction operation counters */
  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_insert_count() const { return m_insert_count; }

  ulonglong get_update_count() const { return m_update_count; }

  ulonglong get_delete_count() const { return m_delete_count; }

  void incr_insert_count() { ++m_insert_count; }

  void incr_update_count() { ++m_update_count; }

  void incr_delete_count() { ++m_delete_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  // Release the row lock held on 'rowkey' in the given column family.
  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;
2578 
commit_or_rollback()2579   bool commit_or_rollback() {
2580     bool res;
2581     if (m_is_tx_failed) {
2582       rollback();
2583       res = false;
2584     } else {
2585       res = commit();
2586     }
2587     return res;
2588   }
2589 
  /*
    Commit the transaction. An empty transaction (no writes) is rolled back
    instead; a rollback-only transaction raises ER_ROLLBACK_ONLY and is
    rolled back. Otherwise the commit is delegated to commit_no_binlog().
    @return commit_no_binlog()'s result on the normal path; false for the
            empty-transaction path; true for the rollback-only path.
  */
  bool commit() {
    if (get_write_count() == 0) {
      // Nothing was written - releasing resources via rollback is enough.
      rollback();
      return false;
    } else if (m_rollback_only) {
      /*
        Transactions marked as rollback_only are expected to be rolled back at
        prepare(). But there are some exceptions like below that prepare() is
        never called and commit() is called instead.
         1. Binlog is disabled
         2. No modification exists in binlog cache for the transaction (#195)
        In both cases, rolling back transaction is safe. Nothing is written to
        binlog.
       */
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      rollback();
      return true;
    } else {
#ifdef MARIAROCKS_NOT_YET
      /*
        Storing binlog position inside MyRocks is needed only for restoring
        MyRocks from backups. This feature is not supported yet.
      */
      mysql_bin_log_commit_pos(m_thd, &m_mysql_log_offset,
                               &m_mysql_log_file_name);
      binlog_manager.update(m_mysql_log_file_name, m_mysql_log_offset,
                            get_write_batch());
#endif
      return commit_no_binlog();
    }
  }
2621 
  virtual void rollback() = 0;

  // Called when RocksDB creates a snapshot for this transaction (presumably
  // via Rdb_snapshot_notifier::SnapshotCreated — confirm): cache it in the
  // read options and record the creation time.
  void snapshot_created(const rocksdb::Snapshot *const snapshot) {
    DBUG_ASSERT(snapshot != nullptr);

    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }

  virtual void acquire_snapshot(bool acquire_now) = 0;
  virtual void release_snapshot() = 0;

  // True when this transaction currently reads from a snapshot.
  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
2636 
 private:
  // The Rdb_sst_info structures we are currently loading.  In a partitioned
  // table this can have more than one entry
  std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
  // Name of the table the current bulk load is targeting.
  std::string m_curr_bulk_load_tablename;

  /* External merge sorts for bulk load: key ID -> merge sort instance */
  std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;
2645 
2646  public:
  /*
    Look up the external merge-sort instance for the given index, creating and
    initializing one (sized by the session's merge_* thdvars) on first use.
    @param kd_gl_id        global index id used as the map key
    @param cf              column family the merge belongs to
    @param[out] key_merge  set to the (possibly newly created) instance
    @return HA_EXIT_SUCCESS, or the error code from Rdb_index_merge::init()
  */
  int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf,
                    Rdb_index_merge **key_merge) {
    int res;
    auto it = m_key_merge.find(kd_gl_id);
    if (it == m_key_merge.end()) {
      // Construct the Rdb_index_merge in place, keyed by the index id.
      m_key_merge.emplace(
          std::piecewise_construct, std::make_tuple(kd_gl_id),
          std::make_tuple(
              get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size),
              THDVAR(get_thd(), merge_combine_read_size),
              THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf));
      it = m_key_merge.find(kd_gl_id);
      if ((res = it->second.init()) != 0) {
        return res;
      }
    }
    *key_merge = &it->second;
    return HA_EXIT_SUCCESS;
  }
2666 
2667   /* Finish bulk loading for all table handlers belongs to one connection */
finish_bulk_load(bool * is_critical_error=nullptr,int print_client_error=true)2668   int finish_bulk_load(bool *is_critical_error = nullptr,
2669                        int print_client_error = true) {
2670     Ensure_cleanup cleanup([&]() {
2671       // Always clear everything regardless of success/failure
2672       m_curr_bulk_load.clear();
2673       m_curr_bulk_load_tablename.clear();
2674       m_key_merge.clear();
2675     });
2676 
2677     int rc = 0;
2678     if (is_critical_error) {
2679       *is_critical_error = true;
2680     }
2681 
2682     // PREPARE phase: finish all on-going bulk loading Rdb_sst_info and
2683     // collect all Rdb_sst_commit_info containing (SST files, cf)
2684     int rc2 = 0;
2685     std::vector<Rdb_sst_info::Rdb_sst_commit_info> sst_commit_list;
2686     sst_commit_list.reserve(m_curr_bulk_load.size());
2687 
2688     for (auto &sst_info : m_curr_bulk_load) {
2689       Rdb_sst_info::Rdb_sst_commit_info commit_info;
2690 
2691       // Commit the list of SST files and move it to the end of
2692       // sst_commit_list, effectively transfer the ownership over
2693       rc2 = sst_info->finish(&commit_info, print_client_error);
2694       if (rc2 && rc == 0) {
2695         // Don't return yet - make sure we finish all the SST infos
2696         rc = rc2;
2697       }
2698 
2699       // Make sure we have work to do - we might be losing the race
2700       if (rc2 == 0 && commit_info.has_work()) {
2701         sst_commit_list.emplace_back(std::move(commit_info));
2702         DBUG_ASSERT(!commit_info.has_work());
2703       }
2704     }
2705 
2706     if (rc) {
2707       return rc;
2708     }
2709 
2710     // MERGING Phase: Flush the index_merge sort buffers into SST files in
2711     // Rdb_sst_info and collect all Rdb_sst_commit_info containing
2712     // (SST files, cf)
2713     if (!m_key_merge.empty()) {
2714       Ensure_cleanup malloc_cleanup([]() {
2715         /*
2716           Explicitly tell jemalloc to clean up any unused dirty pages at this
2717           point.
2718           See https://reviews.facebook.net/D63723 for more details.
2719         */
2720         purge_all_jemalloc_arenas();
2721       });
2722 
2723       rocksdb::Slice merge_key;
2724       rocksdb::Slice merge_val;
2725       for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
2726         GL_INDEX_ID index_id = it->first;
2727         std::shared_ptr<const Rdb_key_def> keydef =
2728             ddl_manager.safe_find(index_id);
2729         std::string table_name = ddl_manager.safe_get_table_name(index_id);
2730 
2731         // Unable to find key definition or table name since the
2732         // table could have been dropped.
2733         // TODO(herman): there is a race here between dropping the table
2734         // and detecting a drop here. If the table is dropped while bulk
2735         // loading is finishing, these keys being added here may
2736         // be missed by the compaction filter and not be marked for
2737         // removal. It is unclear how to lock the sql table from the storage
2738         // engine to prevent modifications to it while bulk load is occurring.
2739         if (keydef == nullptr) {
2740           if (is_critical_error) {
2741             // We used to set the error but simply ignores it. This follows
2742             // current behavior and we should revisit this later
2743             *is_critical_error = false;
2744           }
2745           return HA_ERR_KEY_NOT_FOUND;
2746         } else if (table_name.empty()) {
2747           if (is_critical_error) {
2748             // We used to set the error but simply ignores it. This follows
2749             // current behavior and we should revisit this later
2750             *is_critical_error = false;
2751           }
2752           return HA_ERR_NO_SUCH_TABLE;
2753         }
2754         const std::string &index_name = keydef->get_name();
2755         Rdb_index_merge &rdb_merge = it->second;
2756 
2757         // Rdb_sst_info expects a denormalized table name in the form of
2758         // "./database/table"
2759         std::replace(table_name.begin(), table_name.end(), '.', '/');
2760         table_name = "./" + table_name;
2761         auto sst_info = std::make_shared<Rdb_sst_info>(
2762             rdb, table_name, index_name, rdb_merge.get_cf(),
2763             *rocksdb_db_options, THDVAR(get_thd(), trace_sst_api));
2764 
2765         while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
2766           if ((rc2 = sst_info->put(merge_key, merge_val)) != 0) {
2767             rc = rc2;
2768 
2769             // Don't return yet - make sure we finish the sst_info
2770             break;
2771           }
2772         }
2773 
2774         // -1 => no more items
2775         if (rc2 != -1 && rc != 0) {
2776           rc = rc2;
2777         }
2778 
2779         Rdb_sst_info::Rdb_sst_commit_info commit_info;
2780         rc2 = sst_info->finish(&commit_info, print_client_error);
2781         if (rc2 != 0 && rc == 0) {
2782           // Only set the error from sst_info->finish if finish failed and we
2783           // didn't fail before. In other words, we don't have finish's
2784           // success mask earlier failures
2785           rc = rc2;
2786         }
2787 
2788         if (rc) {
2789           return rc;
2790         }
2791 
2792         if (commit_info.has_work()) {
2793           sst_commit_list.emplace_back(std::move(commit_info));
2794           DBUG_ASSERT(!commit_info.has_work());
2795         }
2796       }
2797     }
2798 
2799     // Early return in case we lost the race completely and end up with no
2800     // work at all
2801     if (sst_commit_list.size() == 0) {
2802       return rc;
2803     }
2804 
2805     // INGEST phase: Group all Rdb_sst_commit_info by cf (as they might
2806     // have the same cf across different indexes) and call out to RocksDB
2807     // to ingest all SST files in one atomic operation
2808     rocksdb::IngestExternalFileOptions options;
2809     options.move_files = true;
2810     options.snapshot_consistency = false;
2811     options.allow_global_seqno = false;
2812     options.allow_blocking_flush = false;
2813 
2814     std::map<rocksdb::ColumnFamilyHandle *, rocksdb::IngestExternalFileArg>
2815         arg_map;
2816 
2817     // Group by column_family
2818     for (auto &commit_info : sst_commit_list) {
2819       if (arg_map.find(commit_info.get_cf()) == arg_map.end()) {
2820         rocksdb::IngestExternalFileArg arg;
2821         arg.column_family = commit_info.get_cf(),
2822         arg.external_files = commit_info.get_committed_files(),
2823         arg.options = options;
2824 
2825         arg_map.emplace(commit_info.get_cf(), arg);
2826       } else {
2827         auto &files = arg_map[commit_info.get_cf()].external_files;
2828         files.insert(files.end(), commit_info.get_committed_files().begin(),
2829                      commit_info.get_committed_files().end());
2830       }
2831     }
2832 
2833     std::vector<rocksdb::IngestExternalFileArg> args;
2834     size_t file_count = 0;
2835     for (auto &cf_files_pair : arg_map) {
2836       args.push_back(cf_files_pair.second);
2837       file_count += cf_files_pair.second.external_files.size();
2838     }
2839 
2840     const rocksdb::Status s = rdb->IngestExternalFiles(args);
2841     if (THDVAR(m_thd, trace_sst_api)) {
2842       // NO_LINT_DEBUG
2843       sql_print_information(
2844           "SST Tracing: IngestExternalFile '%zu' files returned %s", file_count,
2845           s.ok() ? "ok" : "not ok");
2846     }
2847 
2848     if (!s.ok()) {
2849       if (print_client_error) {
2850         Rdb_sst_info::report_error_msg(s, nullptr);
2851       }
2852       return HA_ERR_ROCKSDB_BULK_LOAD;
2853     }
2854 
2855     // COMMIT phase: mark everything as completed. This avoids SST file
2856     // deletion kicking in. Otherwise SST files would get deleted if this
2857     // entire operation is aborted
2858     for (auto &commit_info : sst_commit_list) {
2859       commit_info.commit();
2860     }
2861 
2862     return rc;
2863   }
2864 
start_bulk_load(ha_rocksdb * const bulk_load,std::shared_ptr<Rdb_sst_info> sst_info)2865   int start_bulk_load(ha_rocksdb *const bulk_load,
2866                       std::shared_ptr<Rdb_sst_info> sst_info) {
2867     /*
2868      If we already have an open bulk load of a table and the name doesn't
2869      match the current one, close out the currently running one.  This allows
2870      multiple bulk loads to occur on a partitioned table, but then closes
2871      them all out when we switch to another table.
2872     */
2873     DBUG_ASSERT(bulk_load != nullptr);
2874 
2875     if (!m_curr_bulk_load.empty() &&
2876         bulk_load->get_table_basename() != m_curr_bulk_load_tablename) {
2877       const auto res = finish_bulk_load();
2878       if (res != HA_EXIT_SUCCESS) {
2879         return res;
2880       }
2881     }
2882 
2883     /*
2884      This used to track ha_rocksdb handler objects, but those can be
2885      freed by the table cache while this was referencing them. Instead
2886      of tracking ha_rocksdb handler objects, this now tracks the
2887      Rdb_sst_info allocated, and both the ha_rocksdb handler and the
2888      Rdb_transaction both have shared pointers to them.
2889 
2890      On transaction complete, it will commit each Rdb_sst_info structure found.
2891      If the ha_rocksdb object is freed, etc., it will also commit
2892      the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent.
2893     */
2894     m_curr_bulk_load.push_back(sst_info);
2895     m_curr_bulk_load_tablename = bulk_load->get_table_basename();
2896     return HA_EXIT_SUCCESS;
2897   }
2898 
num_ongoing_bulk_load() const2899   int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
2900 
get_rocksdb_tmpdir() const2901   const char *get_rocksdb_tmpdir() const {
2902     const char *tmp_dir = THDVAR(get_thd(), tmpdir);
2903 
2904     /*
2905       We want to treat an empty string as nullptr, in these cases DDL operations
2906       will use the default --tmpdir passed to mysql instead.
2907     */
2908     if (tmp_dir != nullptr && *tmp_dir == '\0') {
2909       tmp_dir = nullptr;
2910     }
2911     return (tmp_dir);
2912   }
2913 
2914   /*
2915     Flush the data accumulated so far. This assumes we're doing a bulk insert.
2916 
2917     @detail
2918       This should work like transaction commit, except that we don't
2919       synchronize with the binlog (there is no API that would allow to have
2920       binlog flush the changes accumulated so far and return its current
2921       position)
2922 
2923     @todo
2924       Add test coverage for what happens when somebody attempts to do bulk
2925       inserts while inside a multi-statement transaction.
2926   */
flush_batch()2927   bool flush_batch() {
2928     if (get_write_count() == 0) return false;
2929 
2930     /* Commit the current transaction */
2931     if (commit_no_binlog()) return true;
2932 
2933     /* Start another one */
2934     start_tx();
2935     return false;
2936   }
2937 
set_auto_incr(const GL_INDEX_ID & gl_index_id,ulonglong curr_id)2938   void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) {
2939     m_auto_incr_map[gl_index_id] =
2940         std::max(m_auto_incr_map[gl_index_id], curr_id);
2941   }
2942 
2943 #ifndef DBUG_OFF
get_auto_incr(const GL_INDEX_ID & gl_index_id)2944   ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) {
2945     if (m_auto_incr_map.count(gl_index_id) > 0) {
2946       return m_auto_incr_map[gl_index_id];
2947     }
2948     return 0;
2949   }
2950 #endif
2951 
  // Write a key/value pair through the transaction. assume_tracked tells
  // RocksDB the key's lock is already tracked by this transaction.
  virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value,
                              const bool assume_tracked) = 0;
  // Delete a key through the transaction.
  virtual rocksdb::Status delete_key(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;
  // SingleDelete a key (RocksDB semantics: only valid when the key was
  // written at most once since the previous deletion).
  virtual rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;

  // True if this transaction's write batch contains any entries.
  virtual bool has_modifications() const = 0;

  // Return a WriteBatch that one can write to; writes skip transaction
  // locking but WILL be visible to the transaction (contrast with
  // get_blind_write_batch()).
  virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
2966   /*
2967     Return a WriteBatch that one can write to. The writes will skip any
2968     transaction locking. The writes will NOT be visible to the transaction.
2969   */
get_blind_write_batch()2970   rocksdb::WriteBatchBase *get_blind_write_batch() {
2971     return get_indexed_write_batch()->GetWriteBatch();
2972   }
2973 
  // Point lookup through the transaction (result returned via PinnableSlice).
  virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              rocksdb::PinnableSlice *const value) const = 0;
  // Point lookup that also acquires a row lock (shared or exclusive);
  // do_validate controls snapshot validation of the read.
  virtual rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool exclusive, const bool do_validate) = 0;
2981 
get_iterator(rocksdb::ColumnFamilyHandle * const column_family,bool skip_bloom_filter,bool fill_cache,const rocksdb::Slice & eq_cond_lower_bound,const rocksdb::Slice & eq_cond_upper_bound,bool read_current=false,bool create_snapshot=true)2982   rocksdb::Iterator *get_iterator(
2983       rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
2984       bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound,
2985       const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
2986       bool create_snapshot = true) {
2987     // Make sure we are not doing both read_current (which implies we don't
2988     // want a snapshot) and create_snapshot which makes sure we create
2989     // a snapshot
2990     DBUG_ASSERT(column_family != nullptr);
2991     DBUG_ASSERT(!read_current || !create_snapshot);
2992 
2993     if (create_snapshot) acquire_snapshot(true);
2994 
2995     rocksdb::ReadOptions options = m_read_opts;
2996 
2997     if (skip_bloom_filter) {
2998       options.total_order_seek = true;
2999       options.iterate_lower_bound = &eq_cond_lower_bound;
3000       options.iterate_upper_bound = &eq_cond_upper_bound;
3001     } else {
3002       // With this option, Iterator::Valid() returns false if key
3003       // is outside of the prefix bloom filter range set at Seek().
3004       // Must not be set to true if not using bloom filter.
3005       options.prefix_same_as_start = true;
3006     }
3007     options.fill_cache = fill_cache;
3008     if (read_current) {
3009       options.snapshot = nullptr;
3010     }
3011     return get_iterator(options, column_family);
3012   }
3013 
  // True while an underlying transaction/batch object exists.
  virtual bool is_tx_started() const = 0;
  // Begin a new transaction.
  virtual void start_tx() = 0;
  // Hook invoked at the start of a statement inside the transaction.
  virtual void start_stmt() = 0;
3017 
3018  protected:
3019   // Non-virtual functions with actions to be done on transaction start and
3020   // commit.
on_commit()3021   void on_commit() {
3022     time_t tm;
3023     tm = time(nullptr);
3024     for (auto &it : modified_tables) {
3025       it->m_update_time = tm;
3026     }
3027     modified_tables.clear();
3028   }
  void on_rollback() {
    // Nothing was actually changed, so just forget the tracked tables.
    modified_tables.clear();
  }
3032  public:
3033   // Inform the transaction that this table was modified
  // Record that this transaction wrote to the given table; used by
  // on_commit() to set the table's update time.
  void log_table_write_op(Rdb_tbl_def *tbl) {
    modified_tables.insert(tbl);
  }
3037 
  void set_initial_savepoint() {
    /*
      Set the initial savepoint. If the first statement in the transaction
      fails, we need something to roll back to, without rolling back the
      entire transaction.
    */
    do_set_savepoint();
    // Remember the write count at the savepoint so later code can tell
    // whether any writes happened since.
    m_writes_at_last_savepoint = m_write_count;
  }
3047 
3048   /*
3049     Called when a "top-level" statement inside a transaction completes
3050     successfully and its changes become part of the transaction's changes.
3051   */
make_stmt_savepoint_permanent()3052   int make_stmt_savepoint_permanent() {
3053     // Take another RocksDB savepoint only if we had changes since the last
3054     // one. This is very important for long transactions doing lots of
3055     // SELECTs.
3056     if (m_writes_at_last_savepoint != m_write_count) {
3057       rocksdb::WriteBatchBase *batch = get_write_batch();
3058       rocksdb::Status status = rocksdb::Status::NotFound();
3059       while ((status = batch->PopSavePoint()) == rocksdb::Status::OK()) {
3060       }
3061 
3062       if (status != rocksdb::Status::NotFound()) {
3063         return HA_EXIT_FAILURE;
3064       }
3065 
3066       do_set_savepoint();
3067       m_writes_at_last_savepoint = m_write_count;
3068     }
3069 
3070     return HA_EXIT_SUCCESS;
3071   }
3072 
3073   /*
3074     Rollback to the savepoint we've set before the last statement
3075   */
rollback_to_stmt_savepoint()3076   void rollback_to_stmt_savepoint() {
3077     if (m_writes_at_last_savepoint != m_write_count) {
3078       do_rollback_to_savepoint();
3079       /*
3080         RollbackToSavePoint "removes the most recent SetSavePoint()", so
3081         we need to set it again so that next statement can roll back to this
3082         stage.
3083         It's ok to do it here at statement end (instead of doing it at next
3084         statement start) because setting a savepoint is cheap.
3085       */
3086       do_set_savepoint();
3087       m_writes_at_last_savepoint = m_write_count;
3088     }
3089   }
3090 
3091   virtual void rollback_stmt() = 0;
3092 
set_tx_failed(bool failed_arg)3093   void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
3094 
can_prepare() const3095   bool can_prepare() const {
3096     if (m_rollback_only) {
3097       my_error(ER_ROLLBACK_ONLY, MYF(0));
3098       return false;
3099     }
3100     return true;
3101   }
3102 
rollback_to_savepoint(void * const savepoint)3103   int rollback_to_savepoint(void *const savepoint) {
3104     if (has_modifications()) {
3105       my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0));
3106       m_rollback_only = true;
3107       return HA_EXIT_FAILURE;
3108     }
3109     return HA_EXIT_SUCCESS;
3110   }
3111 
3112   /*
3113     This is used by transactions started with "START TRANSACTION WITH "
3114     "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
3115     snapshot has to be created via DB::GetSnapshot(), not via Transaction
3116     API.
3117   */
  bool is_tx_read_only() const { return m_tx_read_only; }

  // Whether this transaction goes through a separate prepare step before
  // commit (two-phase commit).
  bool is_two_phase() const { return m_is_two_phase; }

  void set_tx_read_only(bool val) { m_tx_read_only = val; }
3123 
  explicit Rdb_transaction(THD *const thd)
      : m_thd(thd), m_tbl_io_perf(nullptr) {
    // Register in the global list of live transactions; s_tx_list is
    // shared across threads, hence the mutex.
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.insert(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
3130 
  virtual ~Rdb_transaction() {
    // Unregister from the global transaction list (mirror of the ctor).
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.erase(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
  // Whether the underlying transaction is in the prepared state; the base
  // implementation never is (overridden by Rdb_transaction_impl).
  virtual bool is_prepared()        { return false; };
  // Forget a prepared transaction without rolling it back (no-op here).
  virtual void detach_prepared_tx() {};
3138 };
3139 
3140 /*
3141   This is a rocksdb transaction. Its members represent the current transaction,
3142   which consists of:
3143   - the snapshot
3144   - the changes we've made but are not seeing yet.
3145 
3146   The changes are made to individual tables, which store them here and then
3147   this object commits them on commit.
3148 */
class Rdb_transaction_impl : public Rdb_transaction {
  // Active RocksDB transaction; nullptr when no transaction is in progress
  // (or after detach_prepared_tx()).
  rocksdb::Transaction *m_rocksdb_tx = nullptr;
  // Finished transaction object parked here so BeginTransaction() can
  // reuse it instead of allocating a new one.
  rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;

 public:
  void set_lock_timeout(int timeout_sec_arg) override {
    // NOTE(review): the argument is unused; this relies on m_timeout_sec
    // having been updated by the caller before invoking us - confirm.
    if (m_rocksdb_tx) {
      m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
    }
  }

  void set_sync(bool sync) override {
    if (m_rocksdb_tx)
      m_rocksdb_tx->GetWriteOptions()->sync = sync;
  }

  // Release the row lock taken by an earlier GetForUpdate() on rowkey,
  // unless the session asked to keep locks on scanned rows.
  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    if (!THDVAR(m_thd, lock_scanned_rows)) {
      m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
    }
  }

  virtual bool is_writebatch_trx() const override { return false; }

  bool is_prepared() override {
    return m_rocksdb_tx && rocksdb::Transaction::PREPARED == m_rocksdb_tx->GetState();
  }

  // Stop referencing a PREPARED transaction without rolling it back;
  // presumably it will be completed (commit/rollback) elsewhere - confirm.
  void detach_prepared_tx() override {
    DBUG_ASSERT(rocksdb::Transaction::PREPARED == m_rocksdb_tx->GetState());
    m_rocksdb_tx = nullptr;
  }

private:
  void release_tx(void) {
    // We are done with the current active transaction object.  Preserve it
    // for later reuse.
    DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }

  // XA prepare: name the transaction, merge the recorded auto-increment
  // values into its write batch, then Prepare(). Returns false (after
  // reporting an I/O error) on any failure.
  bool prepare(const rocksdb::TransactionName &name) override {
    rocksdb::Status s;
    s = m_rocksdb_tx->SetName(name);
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = m_rocksdb_tx->Prepare();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }
    return true;
  }

  // Commit without binlog coordination. Returns true on error. In all
  // cases the counters are reset and the RocksDB transaction object is
  // parked for reuse.
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    // Persist the latest auto-increment values as part of this commit.
    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();
    s = m_rocksdb_tx->Commit();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    on_commit();
  error:
    // on_rollback() only clears the modified-tables set; after a
    // successful commit on_commit() has already consumed it, so calling
    // it on both paths is safe.
    on_rollback();
    /* Save the transaction object to be reused */
    release_tx();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

 public:
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }

  // Obtain a read snapshot if none is held. With acquire_now == false the
  // snapshot is deferred to the next RocksDB operation
  // (SetSnapshotOnNextOperation + notifier callback).
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) {
#ifdef MARIAROCKS_NOT_YET
      const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
          m_thd->get_explicit_snapshot());
      if (thd_ss) {
        m_explicit_snapshot = thd_ss;
      }
      if (m_explicit_snapshot) {
        auto snapshot = m_explicit_snapshot->get_snapshot()->snapshot();
        snapshot_created(snapshot);
      } else
#endif
      // Read-only tx: the snapshot must come from DB::GetSnapshot(), not
      // from the Transaction API (see is_tx_read_only()).
      if (is_tx_read_only()) {
        snapshot_created(rdb->GetSnapshot());
      } else if (acquire_now) {
        m_rocksdb_tx->SetSnapshot();
        snapshot_created(m_rocksdb_tx->GetSnapshot());
      } else if (!m_is_delayed_snapshot) {
        m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
        m_is_delayed_snapshot = true;
      }
    }
  }

  // Release whatever snapshot is held, using the release mechanism that
  // matches how it was acquired (DB-level vs transaction-level).
  void release_snapshot() override {
    bool need_clear = m_is_delayed_snapshot;

    if (m_read_opts.snapshot != nullptr) {
      m_snapshot_timestamp = 0;
#ifdef MARIAROCKS_NOT_YET
      if (m_explicit_snapshot) {
        m_explicit_snapshot.reset();
        need_clear = false;
      } else
#endif
      // DB-level snapshot: hand it back to the DB directly.
      if (is_tx_read_only()) {
        rdb->ReleaseSnapshot(m_read_opts.snapshot);
        need_clear = false;
      } else {
        // Transaction-owned snapshot: must be cleared on the transaction.
        need_clear = true;
      }
      m_read_opts.snapshot = nullptr;
    }

    if (need_clear && m_rocksdb_tx != nullptr) m_rocksdb_tx->ClearSnapshot();
  }

  bool has_snapshot() { return m_read_opts.snapshot != nullptr; }

  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key, const rocksdb::Slice &value,
                      const bool assume_tracked) override {
    ++m_write_count;
    ++m_lock_count;
    // Enforce the per-transaction row-lock budget (m_max_row_locks).
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }
    return m_rocksdb_tx->Put(column_family, key, value, assume_tracked);
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key,
                             const bool assume_tracked) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }
    return m_rocksdb_tx->Delete(column_family, key, assume_tracked);
  }

  rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }
    return m_rocksdb_tx->SingleDelete(column_family, key, assume_tracked);
  }

  bool has_modifications() const override {
    return m_rocksdb_tx->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
  }

  rocksdb::WriteBatchBase *get_write_batch() override {
    // In two-phase mode writes added here go into the commit-time batch,
    // which RocksDB applies at commit rather than at prepare.
    if (is_two_phase()) {
      return m_rocksdb_tx->GetCommitTimeWriteBatch();
    }
    return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
  }

  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes WILL be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_rocksdb_tx->GetWriteBatch();
  }

  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    // Clean the PinnableSlice right before Get() for multiple gets per
    // statement; the resources after the last Get in a statement are
    // cleared in the handler::reset call.
    value->Reset();
    global_stats.queries[QUERIES_POINT].inc();
    return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
  }

  // Locking read. Fails with kLockLimit once the row-lock budget is
  // exceeded.
  rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool exclusive, const bool do_validate) override {
    if (++m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }

    if (value != nullptr) {
      value->Reset();
    }
    rocksdb::Status s;
    // If snapshot is null, pass it to GetForUpdate and snapshot is
    // initialized there. Snapshot validation is skipped in that case.
    if (m_read_opts.snapshot == nullptr || do_validate) {
      s = m_rocksdb_tx->GetForUpdate(
          m_read_opts, column_family, key, value, exclusive,
          m_read_opts.snapshot ? do_validate : false);
    } else {
      // If snapshot is set, and if skipping validation,
      // call GetForUpdate without validation and set back old snapshot
      auto saved_snapshot = m_read_opts.snapshot;
      m_read_opts.snapshot = nullptr;
      s = m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                     exclusive, false);
      m_read_opts.snapshot = saved_snapshot;
    }
    return s;
  }

  rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *const column_family) override {
    global_stats.queries[QUERIES_RANGE].inc();
    return m_rocksdb_tx->GetIterator(options, column_family);
  }

  const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }

  bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }

  // Begin a new RocksDB transaction, configuring it from session
  // variables and global settings; resets per-transaction read options
  // and lays down the initial savepoint.
  void start_tx() override {
    rocksdb::TransactionOptions tx_opts;
    rocksdb::WriteOptions write_opts;
    tx_opts.set_snapshot = false;
    tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
    tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
    tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
    // If this variable is set, this will write commit time write batch
    // information on recovery or memtable flush.
    tx_opts.use_only_the_last_commit_time_batch_for_recovery =
        THDVAR(m_thd, commit_time_batch_for_recovery);
    tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);

    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);
    m_is_two_phase = rocksdb_enable_2pc;

    commit_ordered_done= false;

    /*
      If m_rocksdb_reuse_tx is null this will create a new transaction object.
      Otherwise it will reuse the existing one.
    */
    m_rocksdb_tx =
        rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
    m_rocksdb_reuse_tx = nullptr;

    m_read_opts = rocksdb::ReadOptions();

    set_initial_savepoint();

    m_ddl_transaction = false;
  }

  /* Implementations of do_*savepoint based on rocksdb::Transaction
     savepoints */
  void do_set_savepoint() override { m_rocksdb_tx->SetSavePoint(); }

  void do_rollback_to_savepoint() override {
    m_rocksdb_tx->RollbackToSavePoint();
  }

  /*
    Start a statement inside a multi-statement transaction.

    @todo: are we sure this is called once (and not several times) per
    statement start?

    For hooking to start of statement that is its own transaction, see
    ha_rocksdb::external_lock().
  */
  void start_stmt() override {
    // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
    acquire_snapshot(false);
  }

  /*
    This must be called when last statement is rolled back, but the transaction
    continues
  */
  void rollback_stmt() override {
    /* TODO: here we must release the locks taken since the start_stmt() call */
    if (m_rocksdb_tx) {
      const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
      rollback_to_stmt_savepoint();

      // Rolling back may have changed/dropped the snapshot: keep our
      // cached read options and timestamp in sync with the transaction.
      const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
      if (org_snapshot != cur_snapshot) {
        if (org_snapshot != nullptr) m_snapshot_timestamp = 0;

        m_read_opts.snapshot = cur_snapshot;
        if (cur_snapshot != nullptr) {
          rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
        } else {
          m_is_delayed_snapshot = true;
        }
      }
    }
  }

  explicit Rdb_transaction_impl(THD *const thd)
      : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
    // Create a notifier that can be called when a snapshot gets generated.
    m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
  }

  virtual ~Rdb_transaction_impl() override {
    rollback();

    // Theoretically the notifier could outlive the Rdb_transaction_impl
    // (because of the shared_ptr), so let it know it can't reference
    // the transaction anymore.
    m_notifier->detach();

    // Free any transaction memory that is still hanging around.
    delete m_rocksdb_reuse_tx;
    DBUG_ASSERT(m_rocksdb_tx == nullptr);
  }
};
3530 
3531 /* This is a rocksdb write batch. This class doesn't hold or wait on any
3532    transaction locks (skips rocksdb transaction API) thus giving better
3533    performance.
3534 
   Currently this is only used for replication threads, which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through thoroughly.
3538 */
class Rdb_writebatch_impl : public Rdb_transaction {
  // Indexed write batch accumulating all changes made by this "transaction".
  rocksdb::WriteBatchWithIndex *m_batch;
  // Options for the single DB::Write() call issued at commit time.
  rocksdb::WriteOptions write_opts;
  // Called after commit/rollback.
  void reset() {
    m_batch->Clear();
    m_read_opts = rocksdb::ReadOptions();
    m_ddl_transaction = false;
  }

 private:
  // Write batches skip the 2PC prepare step; there is nothing to persist.
  bool prepare(const rocksdb::TransactionName &name) override { return true; }

  // Apply the whole accumulated batch with one Write() call.
  // Returns false on success, true on error.
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;
    rocksdb::TransactionDBWriteOptimizations optimize;
    // Skipping concurrency control relies on this class only being used
    // for guaranteed non-conflicting writes (e.g. replication threads).
    optimize.skip_concurrency_control = true;

    s = merge_auto_incr_map(m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();

    s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }
    on_commit();
  error:
    // NOTE(review): the success path falls through this label too, so
    // on_rollback() and reset() run after on_commit() as unconditional
    // cleanup -- confirm on_rollback() is harmless after a successful
    // commit.
    on_rollback();
    reset();

    // Clear per-transaction statement counters.
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

  /* Implementations of do_*savepoint based on rocksdB::WriteBatch savepoints */
  void do_set_savepoint() override { m_batch->SetSavePoint(); }

  void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); }


 public:
  bool is_writebatch_trx() const override { return true; }

  void set_lock_timeout(int timeout_sec_arg) override {
    // Nothing to do here.
  }

  void set_sync(bool sync) override { write_opts.sync = sync; }

  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    // Nothing to do here since we don't hold any row locks.
  }

  // Discard all pending changes and reset the batch for reuse.
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    release_snapshot();

    reset();
    set_tx_read_only(false);
    m_rollback_only = false;
  }

  // Take a snapshot immediately if we don't have one yet; the
  // 'acquire_now' hint is ignored for write batches.
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) snapshot_created(rdb->GetSnapshot());
  }

  void release_snapshot() override {
    if (m_read_opts.snapshot != nullptr) {
      rdb->ReleaseSnapshot(m_read_opts.snapshot);
      m_read_opts.snapshot = nullptr;
    }
  }

  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key, const rocksdb::Slice &value,
                      const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Put(column_family, key, value);
    // Note Put/Delete in write batch doesn't return any error code. We simply
    // return OK here.
    return rocksdb::Status::OK();
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key,
                             const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Delete(column_family, key);
    return rocksdb::Status::OK();
  }

  rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool /* assume_tracked */) override {
    ++m_write_count;
    m_batch->SingleDelete(column_family, key);
    return rocksdb::Status::OK();
  }

  bool has_modifications() const override {
    return m_batch->GetWriteBatch()->Count() > 0;
  }

  rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }

  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_batch;
  }

  // Read the key through the batch first, falling back to the DB.
  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    value->Reset();
    return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
                                      value);
  }

  // No locking is done here -- this is just a plain get().  When 'value'
  // is nullptr the result is read into a temporary and discarded.
  rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool /* exclusive */, const bool /* do_validate */) override {
    if (value == nullptr) {
      rocksdb::PinnableSlice pin_val;
      rocksdb::Status s = get(column_family, key, &pin_val);
      pin_val.Reset();
      return s;
    }

    return get(column_family, key, value);
  }

  // Iterator over the base DB overlaid with this batch's pending changes.
  rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *const /* column_family */) override {
    const auto it = rdb->NewIterator(options);
    return m_batch->NewIteratorWithBase(it);
  }

  bool is_tx_started() const override { return (m_batch != nullptr); }

  void start_tx() override {
    commit_ordered_done= false; // Do we need this here?
    reset();
    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);

    set_initial_savepoint();
  }

  void start_stmt() override {}

  void rollback_stmt() override {
    if (m_batch) rollback_to_stmt_savepoint();
  }

  explicit Rdb_writebatch_impl(THD *const thd)
      : Rdb_transaction(thd), m_batch(nullptr) {
    m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
                                               true);
  }

  virtual ~Rdb_writebatch_impl() override {
    rollback();
    delete m_batch;
  }
};
3728 
SnapshotCreated(const rocksdb::Snapshot * const snapshot)3729 void Rdb_snapshot_notifier::SnapshotCreated(
3730     const rocksdb::Snapshot *const snapshot) {
3731   if (m_owning_tx != nullptr) {
3732     m_owning_tx->snapshot_created(snapshot);
3733   }
3734 }
3735 
// Registry of all live transactions (see Rdb_tx_list_walker users below).
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
// Protects s_tx_list.
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3738 
get_tx_from_thd(THD * const thd)3739 static Rdb_transaction *get_tx_from_thd(THD *const thd) {
3740   return reinterpret_cast<Rdb_transaction *>(
3741       my_core::thd_get_ha_data(thd, rocksdb_hton));
3742 }
3743 
3744 namespace {
3745 
3746 class Rdb_perf_context_guard {
3747   Rdb_io_perf m_io_perf;
3748   Rdb_io_perf *m_io_perf_ptr;
3749   Rdb_transaction *m_tx;
3750   uint m_level;
3751 
3752  public:
3753   Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
3754   Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;
3755 
Rdb_perf_context_guard(Rdb_io_perf * io_perf,uint level)3756   explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
3757       : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
3758     m_io_perf_ptr->start(m_level);
3759   }
3760 
Rdb_perf_context_guard(Rdb_transaction * tx,uint level)3761   explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
3762       : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
3763     /*
3764       if perf_context information is already being recorded, this becomes a
3765       no-op
3766     */
3767     if (tx != nullptr) {
3768       tx->io_perf_start(&m_io_perf);
3769     }
3770   }
3771 
~Rdb_perf_context_guard()3772   ~Rdb_perf_context_guard() {
3773     if (m_tx != nullptr) {
3774       m_tx->io_perf_end_and_record();
3775     } else if (m_io_perf_ptr != nullptr) {
3776       m_io_perf_ptr->end_and_record(m_level);
3777     }
3778   }
3779 };
3780 
3781 }  // anonymous namespace
3782 
3783 /*
3784   TODO: maybe, call this in external_lock() and store in ha_rocksdb..
3785 */
3786 
get_or_create_tx(THD * const thd)3787 static Rdb_transaction *get_or_create_tx(THD *const thd) {
3788   Rdb_transaction *tx = get_tx_from_thd(thd);
3789   // TODO: this is called too many times.. O(#rows)
3790   if (tx == nullptr) {
3791     bool rpl_skip_tx_api= false; // MARIAROCKS_NOT_YET.
3792     if ((rpl_skip_tx_api && thd->rgi_slave) ||
3793         (THDVAR(thd, master_skip_tx_api) && !thd->rgi_slave))
3794     {
3795       tx = new Rdb_writebatch_impl(thd);
3796     } else {
3797       tx = new Rdb_transaction_impl(thd);
3798     }
3799     tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3800     tx->start_tx();
3801     my_core::thd_set_ha_data(thd, rocksdb_hton, tx);
3802   } else {
3803     tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3804     if (!tx->is_tx_started()) {
3805       tx->start_tx();
3806     }
3807   }
3808 
3809   return tx;
3810 }
3811 
rocksdb_close_connection(handlerton * const hton,THD * const thd)3812 static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
3813   Rdb_transaction *tx = get_tx_from_thd(thd);
3814   if (tx != nullptr) {
3815     bool is_critical_error;
3816     int rc = tx->finish_bulk_load(&is_critical_error, false);
3817     if (rc != 0 && is_critical_error) {
3818       // NO_LINT_DEBUG
3819       sql_print_error(
3820           "RocksDB: Error %d finalizing last SST file while "
3821           "disconnecting",
3822           rc);
3823     }
3824     if (tx->is_prepared())
3825       tx->detach_prepared_tx();
3826     delete tx;
3827   }
3828   return HA_EXIT_SUCCESS;
3829 }
3830 
3831 /*
3832  * Serializes an xid to a string so that it can
3833  * be used as a rocksdb transaction name
3834  */
rdb_xid_to_string(const XID & src)3835 static std::string rdb_xid_to_string(const XID &src) {
3836   DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
3837   DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);
3838 
3839   std::string buf;
3840   buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);
3841 
3842   /*
3843    * expand formatID to fill 8 bytes if it doesn't already
3844    * then reinterpret bit pattern as unsigned and store in network order
3845    */
3846   uchar fidbuf[RDB_FORMATID_SZ];
3847   int64 signed_fid8 = src.formatID;
3848   const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
3849   rdb_netbuf_store_uint64(fidbuf, raw_fid8);
3850   buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);
3851 
3852   buf.push_back(src.gtrid_length);
3853   buf.push_back(src.bqual_length);
3854   buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
3855   return buf;
3856 }
3857 
3858 #if 0
3859 // MARIAROCKS: MariaDB doesn't have flush_wal method
3860 /**
3861   Called by hton->flush_logs after MySQL group commit prepares a set of
3862   transactions.
3863 */
3864 static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__)))
3865   DBUG_ASSERT(rdb != nullptr);
3866 
3867   rocksdb::Status s;
3868   /*
3869     target_lsn is set to 0 when MySQL wants to sync the wal files
3870   */
3871   if ((target_lsn == 0 && !rocksdb_db_options->allow_mmap_writes) ||
3872       rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
3873     rocksdb_wal_group_syncs++;
3874     s = rdb->FlushWAL(target_lsn == 0 ||
3875                       rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3876   }
3877 
3878   if (!s.ok()) {
3879     rdb_log_status_error(s);
3880     return HA_EXIT_FAILURE;
3881   }
3882   return HA_EXIT_SUCCESS;
3883 }
3884 #endif
3885 
3886 /**
3887   For a slave, prepare() updates the slave_gtid_info table which tracks the
3888   replication progress.
3889 */
/*
  Handlerton prepare callback (first phase of 2PC).

  @param prepare_tx  TRUE  - prepare the whole transaction
                     FALSE - an SQL statement within a transaction ended

  NOTE(review): assumes the connection already has a transaction object
  (tx is not null-checked) -- presumably prepare is only invoked after the
  handlerton was registered for this statement; confirm against callers.
*/
static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
{
  bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql

  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (!tx->can_prepare()) {
    return HA_EXIT_FAILURE;
  }
  if (prepare_tx ||
      (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
    /* We were instructed to prepare the whole transaction, or
    this is an SQL statement end and autocommit is on */

#ifdef MARIAROCKS_NOT_YET
    /*
      Storing binlog position inside MyRocks is needed only for restoring
      MyRocks from backups. This feature is not supported yet.
    */
    std::vector<st_slave_gtid_info> slave_gtid_info;
    my_core::thd_slave_gtid_info(thd, &slave_gtid_info);
    for (const auto &it : slave_gtid_info) {
      rocksdb::WriteBatchBase *const write_batch = tx->get_blind_write_batch();
      binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch);
    }
#endif

    // Only true 2PC transactions need a durable PREPARE record;
    // write batches return true from prepare() without doing anything.
    if (tx->is_two_phase()) {

      /*
        MariaDB: the following branch is never taken.
        We always flush at Prepare and rely on RocksDB's internal Group Commit
        to do some grouping.
      */
      if (thd->durability_property == HA_IGNORE_DURABILITY || async) {
        tx->set_sync(false);
      }

      /*
        MariaDB: do not flush logs if we are running in a non-crash-safe mode.
      */
      if (!rocksdb_flush_log_at_trx_commit)
        tx->set_sync(false);

      // Register the prepared transaction under its serialized XID so it
      // can be found again by rocksdb_recover()/commit_by_xid().
      XID xid;
      thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
      if (!tx->prepare(rdb_xid_to_string(xid))) {
        return HA_EXIT_FAILURE;
      }

      /*
        MariaDB: our Group Commit implementation does not use the
        hton->flush_logs call (at least currently) so the following is not
        needed (TODO: will we need this for binlog rotation?)
      */
#ifdef MARIAROCKS_NOT_YET
      if (thd->durability_property == HA_IGNORE_DURABILITY )
          (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER))
          &&
          THDVAR(thd, flush_log_at_trx_commit))
#endif
#ifdef MARIAROCKS_NOT_YET
      {
        // MariaRocks: disable the
        //   "write/sync redo log before flushing binlog cache to file"
        //  feature. See a869c56d361bb44f46c0efeb11a8f03561676247
        /**
          we set the log sequence as '1' just to trigger hton->flush_logs
        */
        thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB);
      }
#endif
    }

    DEBUG_SYNC(thd, "rocksdb.prepared");
  } else {
    // Statement end inside a bigger transaction: just make the statement's
    // savepoint permanent.
    tx->make_stmt_savepoint_permanent();
  }
  return HA_EXIT_SUCCESS;
}
3969 
3970 /**
3971  do nothing for prepare/commit by xid
3972  this is needed to avoid crashes in XA scenarios
3973 */
rocksdb_commit_by_xid(handlerton * const hton,XID * const xid)3974 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
3975   DBUG_ENTER_FUNC();
3976 
3977   DBUG_ASSERT(hton != nullptr);
3978   DBUG_ASSERT(xid != nullptr);
3979   DBUG_ASSERT(commit_latency_stats != nullptr);
3980 
3981   rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
3982 
3983   const auto name = rdb_xid_to_string(*xid);
3984   DBUG_ASSERT(!name.empty());
3985 
3986   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
3987 
3988   if (trx == nullptr) {
3989     DBUG_RETURN(HA_EXIT_FAILURE);
3990   }
3991 
3992   const rocksdb::Status s = trx->Commit();
3993 
3994   if (!s.ok()) {
3995     rdb_log_status_error(s);
3996     DBUG_RETURN(HA_EXIT_FAILURE);
3997   }
3998 
3999   delete trx;
4000 
4001   // `Add()` is implemented in a thread-safe manner.
4002   commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
4003 
4004   DBUG_RETURN(HA_EXIT_SUCCESS);
4005 }
4006 
rocksdb_rollback_by_xid(handlerton * const hton MY_ATTRIBUTE ((__unused__)),XID * const xid)4007 static int rocksdb_rollback_by_xid(
4008     handlerton *const hton MY_ATTRIBUTE((__unused__)), XID *const xid) {
4009   DBUG_ENTER_FUNC();
4010 
4011   DBUG_ASSERT(hton != nullptr);
4012   DBUG_ASSERT(xid != nullptr);
4013   DBUG_ASSERT(rdb != nullptr);
4014 
4015   const auto name = rdb_xid_to_string(*xid);
4016 
4017   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4018 
4019   if (trx == nullptr) {
4020     DBUG_RETURN(HA_EXIT_FAILURE);
4021   }
4022 
4023   const rocksdb::Status s = trx->Rollback();
4024 
4025   if (!s.ok()) {
4026     rdb_log_status_error(s);
4027     DBUG_RETURN(HA_EXIT_FAILURE);
4028   }
4029 
4030   delete trx;
4031 
4032   DBUG_RETURN(HA_EXIT_SUCCESS);
4033 }
4034 
4035 /**
4036   Rebuilds an XID from a serialized version stored in a string.
4037 */
/*
  Expected layout (must mirror rdb_xid_to_string()):
    [0..7]  formatID, stored in network order (big-endian)
    [8]     gtrid_length (1 byte)
    [9]     bqual_length (1 byte)
    [10..]  gtrid bytes followed by bqual bytes
*/
static void rdb_xid_from_string(const std::string &src, XID *const dst) {
  DBUG_ASSERT(dst != nullptr);
  uint offset = 0;
  // Decode the big-endian formatID and reinterpret its bits as signed.
  uint64 raw_fid8 =
      rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
  const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
  dst->formatID = signed_fid8;
  offset += RDB_FORMATID_SZ;
  dst->gtrid_length = src.at(offset);
  offset += RDB_GTRID_SZ;
  dst->bqual_length = src.at(offset);
  offset += RDB_BQUAL_SZ;

  DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);

  // Zero-fill the payload area, then copy gtrid+bqual from past the header.
  memset(dst->data, 0, XIDDATASIZE);
  src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length),
           RDB_XIDHDR_LEN);
}
4058 
4059 /**
4060   Reading last committed binary log info from RocksDB system row.
4061   The info is needed for crash safe slave/master to work.
4062 */
/*
  XA crash recovery: report the XIDs of all transactions that RocksDB has
  in PREPARED state.  Fills at most 'len' entries of 'xid_list' and returns
  the number of entries filled (0 when there is nothing to report or no
  output buffer was supplied).
*/
static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len)
#ifdef MARIAROCKS_NOT_YET
                           char* const binlog_file,
                           my_off_t *const binlog_pos,
                           Gtid *const binlog_max_gtid) {
#endif
{
#ifdef MARIAROCKS_NOT_YET
  if (binlog_file && binlog_pos) {
    char file_buf[FN_REFLEN + 1] = {0};
    my_off_t pos;
    char gtid_buf[FN_REFLEN + 1] = {0};
    if (binlog_manager.read(file_buf, &pos, gtid_buf)) {
      if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) {
        memcpy(binlog_file, file_buf, FN_REFLEN + 1);
        *binlog_pos = pos;
        // NO_LINT_DEBUG
        fprintf(stderr,
                "RocksDB: Last binlog file position %llu,"
                " file name %s\n",
                pos, file_buf);
        if (*gtid_buf) {
          global_sid_lock->rdlock();
          binlog_max_gtid->parse(global_sid_map, gtid_buf);
          global_sid_lock->unlock();
          // NO_LINT_DEBUG
          fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n", gtid_buf);
        }
      }
    }
  }
#endif

  if (len == 0 || xid_list == nullptr) {
    return HA_EXIT_SUCCESS;
  }

  // Ask RocksDB for every transaction still sitting in PREPARED state and
  // convert each registered name back into an XID for the server.
  std::vector<rocksdb::Transaction *> trans_list;
  rdb->GetAllPreparedTransactions(&trans_list);

  uint count = 0;
  for (auto &trans : trans_list) {
    if (count >= len) {
      break;
    }
    auto name = trans->GetName();
    rdb_xid_from_string(name, &xid_list[count]);
    count++;
  }
  return count;
}
4114 
4115 
4116 /*
4117   Handle a commit checkpoint request from server layer.
4118 
4119   InnoDB does this:
4120     We put the request in a queue, so that we can notify upper layer about
4121     checkpoint complete when we have flushed the redo log.
4122     If we have already flushed all relevant redo log, we notify immediately.
4123 
4124   MariaRocks just flushes everything right away ATM
4125 */
4126 
4127 static void rocksdb_checkpoint_request(void *cookie)
4128 {
4129   const rocksdb::Status s= rdb->FlushWAL(true);
4130   //TODO: what to do on error?
4131   if (s.ok())
4132   {
4133     rocksdb_wal_group_syncs++;
4134     commit_checkpoint_notify_ha(cookie);
4135   }
4136 }
4137 
4138 /*
4139   @param all:   TRUE - commit the transaction
4140                 FALSE - SQL statement ended
4141 */
4142 static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all)
4143 {
4144   // Same assert as InnoDB has
4145   DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
4146                                              OPTION_BEGIN)));
4147   Rdb_transaction *tx = get_tx_from_thd(thd);
4148   if (!tx->is_two_phase()) {
4149     /*
4150       ordered_commit is supposedly slower as it is done sequentially
4151       in order to preserve commit order.
4152 
4153       if we are not required do 2-phase commit with the binlog, do not do
4154       anything here.
4155     */
4156     return;
4157   }
4158 
4159   tx->set_sync(false);
4160 
4161   /* This will note the master position also */
4162   tx->commit_ordered_res= tx->commit();
4163   tx->commit_ordered_done= true;
4164 
4165 }
4166 
4167 
/*
  Handlerton commit callback.

  @param commit_tx  TRUE  - commit the whole transaction
                    FALSE - an SQL statement within a transaction ended
*/
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
{
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(hton != nullptr);
  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(commit_latency_stats != nullptr);

  rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);

  /* note: h->external_lock(F_UNLCK) is called after this function is called) */
  Rdb_transaction *tx = get_tx_from_thd(thd);

  /* this will trigger saving of perf_context information */
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (commit_tx || (!my_core::thd_test_options(
                         thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
      /*
        This will not add anything to commit_latency_stats, and this is correct
        right?
      */
      // rocksdb_commit_ordered() may already have done the actual commit
      // (group commit path); just report the result it saved.
      if (tx->commit_ordered_done)
      {
        thd_wakeup_subsequent_commits(thd, 0);
        DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
      }

      /*
        We get here
         - For a COMMIT statement that finishes a multi-statement transaction
         - For a statement that has its own transaction
      */
      if (thd->slave_thread)
      {
        // An attempt to make parallel slave performant (not fully successful,
        // see MDEV-15372):

        //  First, commit without syncing. This establishes the commit order
        tx->set_sync(false);
        bool tx_had_writes = tx->get_write_count()? true : false ;
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
        thd_wakeup_subsequent_commits(thd, 0);

        // Then sync the WAL explicitly, but only if this transaction wrote
        // anything and the server is configured to sync on commit.
        if (tx_had_writes && rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC)
        {
          rocksdb::Status s= rdb->FlushWAL(true);
          if (!s.ok())
            DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
        }
      }
      else
      {
        /* Not a slave thread */
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
      }
    } else {
      /*
        We get here when committing a statement within a transaction.
      */
      tx->make_stmt_savepoint_permanent();
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }

  // `Add()` is implemented in a thread-safe manner.
  commit_latency_stats->Add(timer.ElapsedNanos() / 1000);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
4248 
4249 
4250 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
4251                             bool rollback_tx) {
4252   Rdb_transaction *tx = get_tx_from_thd(thd);
4253   Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4254 
4255   if (tx != nullptr) {
4256     if (rollback_tx) {
4257       /*
4258         We get here, when
4259         - ROLLBACK statement is issued.
4260 
4261         Discard the changes made by the transaction
4262       */
4263       tx->rollback();
4264     } else {
4265       /*
4266         We get here when
4267         - a statement with AUTOCOMMIT=1 is being rolled back (because of some
4268           error)
4269         - a statement inside a transaction is rolled back
4270       */
4271 
4272       tx->rollback_stmt();
4273       tx->set_tx_failed(true);
4274     }
4275 
4276     if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4277       // For READ_COMMITTED, we release any existing snapshot so that we will
4278       // see any changes that occurred since the last statement.
4279       tx->release_snapshot();
4280     }
4281   }
4282   return HA_EXIT_SUCCESS;
4283 }
4284 
4285 static bool print_stats(THD *const thd, std::string const &type,
4286                         std::string const &name, std::string const &status,
4287                         stat_print_fn *stat_print) {
4288   return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
4289                     status.c_str(), status.size());
4290 }
4291 
4292 static std::string format_string(const char *const format, ...) {
4293   std::string res;
4294   va_list args;
4295   va_list args_copy;
4296   char static_buff[256];
4297 
4298   DBUG_ASSERT(format != nullptr);
4299 
4300   va_start(args, format);
4301   va_copy(args_copy, args);
4302 
4303   // Calculate how much space we will need
4304   int len = vsnprintf(nullptr, 0, format, args);
4305   va_end(args);
4306 
4307   if (len < 0) {
4308     res = std::string("<format error>");
4309   } else if (len == 0) {
4310     // Shortcut for an empty string
4311     res = std::string("");
4312   } else {
4313     // For short enough output use a static buffer
4314     char *buff = static_buff;
4315     std::unique_ptr<char[]> dynamic_buff = nullptr;
4316 
4317     len++;  // Add one for null terminator
4318 
4319     // for longer output use an allocated buffer
4320     if (static_cast<uint>(len) > sizeof(static_buff)) {
4321       dynamic_buff.reset(new char[len]);
4322       buff = dynamic_buff.get();
4323     }
4324 
4325     // Now re-do the vsnprintf with the buffer which is now large enough
4326     (void)vsnprintf(buff, len, format, args_copy);
4327 
4328     // Convert to a std::string.  Note we could have created a std::string
4329     // large enough and then converted the buffer to a 'char*' and created
4330     // the output in place.  This would probably work but feels like a hack.
4331     // Since this isn't code that needs to be super-performant we are going
4332     // with this 'safer' method.
4333     res = std::string(buff);
4334   }
4335 
4336   va_end(args_copy);
4337 
4338   return res;
4339 }
4340 
4341 class Rdb_snapshot_status : public Rdb_tx_list_walker {
4342  private:
4343   std::string m_data;
4344 
4345   static std::string current_timestamp(void) {
4346     static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
4347     time_t currtime;
4348     struct tm currtm;
4349 
4350     time(&currtime);
4351 
4352     localtime_r(&currtime, &currtm);
4353 
4354     return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
4355                          currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
4356                          currtm.tm_sec);
4357   }
4358 
4359   static std::string get_header(void) {
4360     return "\n============================================================\n" +
4361            current_timestamp() +
4362            " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4363            "============================================================\n"
4364            "---------\n"
4365            "SNAPSHOTS\n"
4366            "---------\n"
4367            "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
4368   }
4369 
4370   static std::string get_footer(void) {
4371     return "-----------------------------------------\n"
4372            "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4373            "=========================================\n";
4374   }
4375 
4376   static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
4377       const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
4378     Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
4379 
4380     txn_data.trx_id = txn.m_txn_id;
4381 
4382     txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id);
4383     if (txn_data.table_name.empty()) {
4384       txn_data.table_name =
4385           "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4386     }
4387 
4388     auto kd = ddl_manager.safe_find(gl_index_id);
4389     txn_data.index_name =
4390         (kd) ? kd->get_name()
4391              : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4392 
4393     rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id);
4394     txn_data.cf_name = cfh->GetName();
4395 
4396     txn_data.waiting_key =
4397         rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
4398 
4399     txn_data.exclusive_lock = txn.m_exclusive;
4400 
4401     return txn_data;
4402   }
4403 
  /*
    Convert one recorded deadlock cycle into an Rdb_deadlock_info.  The
    first 4 bytes of each participant's waiting key are decoded as the
    index number so table/index names can be resolved.
  */
  static Rdb_deadlock_info get_dl_path_trx_info(
      const rocksdb::DeadlockPath &path_entry) {
    Rdb_deadlock_info deadlock_info;

    for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
      const auto &txn = *it;
      const GL_INDEX_ID gl_index_id = {
          txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
                           txn.m_waiting_key.c_str()))};
      deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
    }
    DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
    /* print the first txn in the path to display the full deadlock cycle */
    // NOTE(review): despite the comment above, this selects the LAST
    // element of the path (path.end() - 1) as the victim -- confirm which
    // end is intended.
    if (!path_entry.path.empty() && !path_entry.limit_exceeded) {
      const auto &deadlocking_txn = *(path_entry.path.end() - 1);
      deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id;
      deadlock_info.deadlock_time = path_entry.deadlock_time;
    }
    return deadlock_info;
  }
4424 
 public:
  /* Start the report with the fixed header text. */
  Rdb_snapshot_status() : m_data(get_header()) {}

  /* Return the accumulated report, terminated by the footer text. */
  std::string getResult() { return m_data + get_footer(); }
4429 
4430   /* Implement Rdb_transaction interface */
4431   /* Create one row in the snapshot status table */
4432   void process_tran(const Rdb_transaction *const tx) override {
4433     DBUG_ASSERT(tx != nullptr);
4434 
4435     /* Calculate the duration the snapshot has existed */
4436     int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
4437     if (snapshot_timestamp != 0) {
4438       int64_t curr_time;
4439       rdb->GetEnv()->GetCurrentTime(&curr_time);
4440 
4441       char buffer[1024];
4442 #ifdef MARIAROCKS_NOT_YET
4443       thd_security_context(tx->get_thd(), buffer, sizeof buffer, 0);
4444 #endif
4445       m_data += format_string(
4446           "---SNAPSHOT, ACTIVE %lld sec\n"
4447           "%s\n"
4448           "lock count %llu, write count %llu\n"
4449           "insert count %llu, update count %llu, delete count %llu\n",
4450           (longlong)(curr_time - snapshot_timestamp), buffer, tx->get_lock_count(),
4451           tx->get_write_count(), tx->get_insert_count(), tx->get_update_count(),
4452           tx->get_delete_count());
4453     }
4454   }
4455 
4456   void populate_deadlock_buffer() {
4457     auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4458     m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
4459 
4460     for (const auto &path_entry : dlock_buffer) {
4461       std::string path_data;
4462       if (path_entry.limit_exceeded) {
4463         path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n";
4464       } else {
4465         path_data +=
4466             "\n*** DEADLOCK PATH\n"
4467             "=========================================\n";
4468         const auto dl_info = get_dl_path_trx_info(path_entry);
4469         const auto deadlock_time = dl_info.deadlock_time;
4470         for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) {
4471           const auto &trx_info = *it;
4472           path_data += format_string(
4473               "TIMESTAMP: %" PRId64
4474               "\n"
4475               "TRANSACTION ID: %u\n"
4476               "COLUMN FAMILY NAME: %s\n"
4477               "WAITING KEY: %s\n"
4478               "LOCK TYPE: %s\n"
4479               "INDEX NAME: %s\n"
4480               "TABLE NAME: %s\n",
4481               deadlock_time, trx_info.trx_id, trx_info.cf_name.c_str(),
4482               trx_info.waiting_key.c_str(),
4483               trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED",
4484               trx_info.index_name.c_str(), trx_info.table_name.c_str());
4485           if (it != dl_info.path.end() - 1) {
4486             path_data += "---------------WAITING FOR---------------\n";
4487           }
4488         }
4489         path_data += format_string(
4490             "\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n",
4491             dl_info.victim_trx_id);
4492       }
4493       m_data += path_data;
4494     }
4495   }
4496 
4497   std::vector<Rdb_deadlock_info> get_deadlock_info() {
4498     std::vector<Rdb_deadlock_info> deadlock_info;
4499     auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4500     for (const auto &path_entry : dlock_buffer) {
4501       if (!path_entry.limit_exceeded) {
4502         deadlock_info.push_back(get_dl_path_trx_info(path_entry));
4503       }
4504     }
4505     return deadlock_info;
4506   }
4507 };
4508 
4509 /**
4510  * @brief
4511  * walks through all non-replication transactions and copies
4512  * out relevant information for information_schema.rocksdb_trx
4513  */
4514 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
4515  private:
4516   std::vector<Rdb_trx_info> *m_trx_info;
4517 
4518  public:
4519   explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
4520       : m_trx_info(trx_info) {}
4521 
4522   void process_tran(const Rdb_transaction *const tx) override {
4523     static const std::map<int, std::string> state_map = {
4524         {rocksdb::Transaction::STARTED, "STARTED"},
4525         {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
4526         {rocksdb::Transaction::PREPARED, "PREPARED"},
4527         {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
4528         {rocksdb::Transaction::COMMITED, "COMMITED"},
4529         {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
4530         {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
4531     };
4532 
4533     DBUG_ASSERT(tx != nullptr);
4534 
4535     THD *const thd = tx->get_thd();
4536     ulong thread_id = thd_get_thread_id(thd);
4537 
4538     if (tx->is_writebatch_trx()) {
4539       const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
4540       DBUG_ASSERT(wb_impl);
4541       m_trx_info->push_back(
4542           {"",                            /* name */
4543            0,                             /* trx_id */
4544            wb_impl->get_write_count(), 0, /* lock_count */
4545            0,                             /* timeout_sec */
4546            "",                            /* state */
4547            "",                            /* waiting_key */
4548            0,                             /* waiting_cf_id */
4549            1,                             /*is_replication */
4550            1,                             /* skip_trx_api */
4551            wb_impl->is_tx_read_only(), 0, /* deadlock detection */
4552            wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
4553     } else {
4554       const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
4555       DBUG_ASSERT(tx_impl);
4556       const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
4557 
4558       if (rdb_trx == nullptr) {
4559         return;
4560       }
4561 
4562       char query_buf[NAME_LEN+1];
4563       thd_query_safe(thd, query_buf, sizeof(query_buf));
4564       std::string query_str(query_buf);
4565 
4566       const auto state_it = state_map.find(rdb_trx->GetState());
4567       DBUG_ASSERT(state_it != state_map.end());
4568       const int is_replication = (thd->rgi_slave != nullptr);
4569       uint32_t waiting_cf_id;
4570       std::string waiting_key;
4571       rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key),
4572 
4573           m_trx_info->push_back(
4574               {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
4575                tx_impl->get_lock_count(), tx_impl->get_timeout_sec(),
4576                state_it->second, waiting_key, waiting_cf_id, is_replication,
4577                0, /* skip_trx_api */
4578                tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
4579                tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
4580     }
4581   }
4582 };
4583 
4584 /*
4585   returns a vector of info for all non-replication threads
4586   for use by information_schema.rocksdb_trx
4587 */
4588 std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
4589   std::vector<Rdb_trx_info> trx_info;
4590   Rdb_trx_info_aggregator trx_info_agg(&trx_info);
4591   Rdb_transaction::walk_tx_list(&trx_info_agg);
4592   return trx_info;
4593 }
4594 
4595 
4596 /*
4597   returns a vector of info of recent deadlocks
4598   for use by information_schema.rocksdb_deadlock
4599 */
4600 std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() {
4601   Rdb_snapshot_status showStatus;
4602   Rdb_transaction::walk_tx_list(&showStatus);
4603   return showStatus.get_deadlock_info();
4604 }
4605 
#ifdef MARIAROCKS_NOT_YET
/* Generate the snapshot status table */
static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd,
                                         stat_print_fn *const stat_print) {
  Rdb_snapshot_status snapshot_status;

  /* Collect one row per live transaction, then the deadlock section. */
  Rdb_transaction::walk_tx_list(&snapshot_status);
  snapshot_status.populate_deadlock_buffer();

  /* Send the result data back to MySQL */
  return print_stats(thd, "rocksdb", "", snapshot_status.getResult(),
                     stat_print);
}
#endif
4619 
4620 /*
4621   This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc.
4622 
4623   For now, produce info about live files (which gives an imprecise idea about
4624   what column families are there).
4625 */
4626 static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
4627                                 stat_print_fn *const stat_print,
4628                                 enum ha_stat_type stat_type) {
4629   DBUG_ASSERT(hton != nullptr);
4630   DBUG_ASSERT(thd != nullptr);
4631   DBUG_ASSERT(stat_print != nullptr);
4632 
4633   bool res = false;
4634   char buf[100] = {'\0'};
4635 
4636   if (stat_type == HA_ENGINE_STATUS) {
4637     DBUG_ASSERT(rdb != nullptr);
4638 
4639     std::string str;
4640 
4641     /* Global DB Statistics */
4642     if (rocksdb_stats) {
4643       str = rocksdb_stats->ToString();
4644 
4645       // Use the same format as internal RocksDB statistics entries to make
4646       // sure that output will look unified.
4647       DBUG_ASSERT(commit_latency_stats != nullptr);
4648 
4649       snprintf(buf, sizeof(buf),
4650                "rocksdb.commit_latency statistics "
4651                "Percentiles :=> 50 : %.2f 95 : %.2f "
4652                "99 : %.2f 100 : %.2f\n",
4653                commit_latency_stats->Percentile(50),
4654                commit_latency_stats->Percentile(95),
4655                commit_latency_stats->Percentile(99),
4656                commit_latency_stats->Percentile(100));
4657       str.append(buf);
4658 
4659       uint64_t v = 0;
4660 
4661       // Retrieve additional stalling related numbers from RocksDB and append
4662       // them to the buffer meant for displaying detailed statistics. The intent
4663       // here is to avoid adding another row to the query output because of
4664       // just two numbers.
4665       //
4666       // NB! We're replacing hyphens with underscores in output to better match
4667       // the existing naming convention.
4668       if (rdb->GetIntProperty("rocksdb.is-write-stopped", &v)) {
4669         snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %llu\n", (ulonglong)v);
4670         str.append(buf);
4671       }
4672 
4673       if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) {
4674         snprintf(buf, sizeof(buf),
4675                                    "COUNT : %llu\n",
4676                  (ulonglong)v);
4677         str.append(buf);
4678       }
4679 
4680       res |= print_stats(thd, "STATISTICS", "rocksdb", str, stat_print);
4681     }
4682 
4683     /* Per DB stats */
4684     if (rdb->GetProperty("rocksdb.dbstats", &str)) {
4685       res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
4686     }
4687 
4688     /* Per column family stats */
4689     for (const auto &cf_name : cf_manager.get_cf_names()) {
4690       rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
4691       if (cfh == nullptr) {
4692         continue;
4693       }
4694 
4695       if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str)) {
4696         continue;
4697       }
4698 
4699       res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
4700     }
4701 
4702     /* Memory Statistics */
4703     std::vector<rocksdb::DB *> dbs;
4704     std::unordered_set<const rocksdb::Cache *> cache_set;
4705     size_t internal_cache_count = 0;
4706     size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
4707 
4708     dbs.push_back(rdb);
4709     cache_set.insert(rocksdb_tbl_options->block_cache.get());
4710 
4711     for (const auto &cf_handle : cf_manager.get_all_cf()) {
4712       rocksdb::ColumnFamilyDescriptor cf_desc;
4713       cf_handle->GetDescriptor(&cf_desc);
4714       auto *const table_factory = cf_desc.options.table_factory.get();
4715 
4716       if (table_factory != nullptr) {
4717         std::string tf_name = table_factory->Name();
4718 
4719         if (tf_name.find("BlockBasedTable") != std::string::npos) {
4720           const rocksdb::BlockBasedTableOptions *const bbt_opt =
4721               reinterpret_cast<rocksdb::BlockBasedTableOptions *>(
4722                   table_factory->GetOptions());
4723 
4724           if (bbt_opt != nullptr) {
4725             if (bbt_opt->block_cache.get() != nullptr) {
4726               cache_set.insert(bbt_opt->block_cache.get());
4727             } else {
4728               internal_cache_count++;
4729             }
4730             cache_set.insert(bbt_opt->block_cache_compressed.get());
4731           }
4732         }
4733       }
4734     }
4735 
4736     std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
4737     str.clear();
4738     rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
4739                                                          &temp_usage_by_type);
4740     snprintf(buf, sizeof(buf), "\nMemTable Total: %llu",
4741              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
4742     str.append(buf);
4743     snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %llu",
4744              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
4745     str.append(buf);
4746     snprintf(buf, sizeof(buf), "\nTable Readers Total: %llu",
4747              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
4748     str.append(buf);
4749     snprintf(buf, sizeof(buf), "\nCache Total: %llu",
4750              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
4751     str.append(buf);
4752     snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %llu",
4753              (ulonglong)internal_cache_count * kDefaultInternalCacheSize);
4754     str.append(buf);
4755     res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print);
4756 
4757     /* Show the background thread status */
4758     std::vector<rocksdb::ThreadStatus> thread_list;
4759     rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list);
4760 
4761     if (!s.ok()) {
4762       // NO_LINT_DEBUG
4763       sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n",
4764                       s.ToString().c_str());
4765       res |= true;
4766     } else {
4767       /* For each background thread retrieved, print out its information */
4768       for (auto &it : thread_list) {
4769         /* Only look at background threads. Ignore user threads, if any. */
4770         if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) {
4771           continue;
4772         }
4773 
4774         str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) +
4775               "\ncf_name: " + it.cf_name +
4776               "\noperation_type: " + it.GetOperationName(it.operation_type) +
4777               "\noperation_stage: " +
4778               it.GetOperationStageName(it.operation_stage) +
4779               "\nelapsed_time_ms: " + it.MicrosToString(it.op_elapsed_micros);
4780 
4781         for (auto &it_props : it.InterpretOperationProperties(
4782                  it.operation_type, it.op_properties)) {
4783           str += "\n" + it_props.first + ": " + std::to_string(it_props.second);
4784         }
4785 
4786         str += "\nstate_type: " + it.GetStateName(it.state_type);
4787 
4788         res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), str,
4789                            stat_print);
4790       }
4791     }
4792 
4793 #ifdef MARIAROCKS_NOT_YET
4794     /* Explicit snapshot information */
4795     str = Rdb_explicit_snapshot::dump_snapshots();
4796 #endif
4797 
4798     if (!str.empty()) {
4799       res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print);
4800     }
4801 #ifdef MARIAROCKS_NOT_YET
4802   } else if (stat_type == HA_ENGINE_TRX) {
4803     /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */
4804     res |= rocksdb_show_snapshot_status(hton, thd, stat_print);
4805 #endif
4806   }
4807   return res;
4808 }
4809 
4810 static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
4811                                        Rdb_transaction *const tx) {
4812   DBUG_ASSERT(tx != nullptr);
4813 
4814   trans_register_ha(thd, FALSE, rocksdb_hton, 0);
4815   if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
4816     tx->start_stmt();
4817     trans_register_ha(thd, TRUE, rocksdb_hton, 0);
4818   }
4819 }
4820 
/* MyRocks tables have no per-table auxiliary files: the NullS-terminated
   extension list is empty. */
static const char *ha_rocksdb_exts[] = {NullS};
4822 
#ifdef MARIAROCKS_NOT_YET
/*
  Handle CREATE / ATTACH / RELEASE of an explicit snapshot on this
  connection. Returns true on failure, false on success.
*/
static bool rocksdb_explicit_snapshot(
    handlerton *const /* hton */, /*!< in: RocksDB handlerton */
    THD *const thd,               /*!< in: MySQL thread handle */
    snapshot_info_st *ss_info)    /*!< out: Snapshot information */
{
  switch (ss_info->op) {
    case snapshot_operation::SNAPSHOT_CREATE: {
      /* Hold binlog commits while taking the snapshot so the binlog
         position recorded into ss_info matches the snapshot. */
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_lock_commits(ss_info);
      }
      auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot());
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_unlock_commits(ss_info);
      }

      thd->set_explicit_snapshot(s);
      return s == nullptr; /* failure iff creation returned null */
    }
    case snapshot_operation::SNAPSHOT_ATTACH: {
      /* Attach this connection to an existing snapshot by id. */
      auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
      if (!s) {
        return true;
      }
      *ss_info = s->ss_info;
      thd->set_explicit_snapshot(s);
      return false;
    }
    case snapshot_operation::SNAPSHOT_RELEASE: {
      /* Fail if the connection holds no explicit snapshot. */
      if (!thd->get_explicit_snapshot()) {
        return true;
      }
      *ss_info = thd->get_explicit_snapshot()->ss_info;
      thd->set_explicit_snapshot(nullptr);
      return false;
    }
    default:
      DBUG_ASSERT(false);
      return true;
  }
  return true;
}
#endif
4866 
4867 /*
4868     Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT
4869 
4870     Features:
4871     1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
4872     2. Getting current binlog position in addition to #1.
4873 
4874     The second feature is done by START TRANSACTION WITH
4875     CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and
4876     it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT.
4877 
4878     - When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT
4879     takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
4880     participate in transaction. When executing COMMIT, both InnoDB and
4881     RocksDB modifications are committed. Remember that XA is not supported yet,
4882     so mixing engines is not recommended anyway.
4883 
4884     - When setting engine, START TRANSACTION WITH CONSISTENT.. takes
4885     snapshot for the specified engine only. But it starts both
4886     InnoDB and RocksDB transactions.
4887 */
/*
  Start a read-only transaction and acquire its read snapshot immediately,
  implementing START TRANSACTION WITH CONSISTENT SNAPSHOT.
  Returns HA_EXIT_SUCCESS / HA_EXIT_FAILURE.
*/
static int rocksdb_start_tx_and_assign_read_view(
    handlerton *const hton,    /*!< in: RocksDB handlerton */
    THD *const thd             /*!< in: MySQL thread handle of the
                               user for whom the transaction should
                               be committed */
)
#ifdef MARIAROCKS_NOT_YET
    snapshot_info_st *ss_info) /*!< in/out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  /* Consistent snapshots are only supported under REPEATABLE READ. */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);

  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

#ifdef MARIADB_NOT_YET
  /* NOTE(review): this disabled region opens a brace that is never closed
     within the same #ifdef; it would not compile if MARIADB_NOT_YET were
     defined. Needs rework before re-enabling. */
  if (ss_info) {
    if (mysql_bin_log_is_open()) {
      mysql_bin_log_lock_commits(ss_info);
    } else {
      return HA_EXIT_FAILURE;
    }
#endif

  /*
    MariaDB: there is no need to call mysql_bin_log_lock_commits and then
    unlock back.
    SQL layer calls start_consistent_snapshot() for all engines, including the
    binlog under LOCK_commit_ordered mutex.

    The mutex prevents binlog commits from happening (right?) while the storage
    engine(s) allocate read snapshots. That way, each storage engine is
    synchronized with current binlog position.
  */
  mysql_mutex_assert_owner(&LOCK_commit_ordered);

  Rdb_transaction *const tx = get_or_create_tx(thd);
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  /* A fresh consistent-snapshot tx must not already hold a snapshot. */
  DBUG_ASSERT(!tx->has_snapshot());
  tx->set_tx_read_only(true);
  rocksdb_register_tx(hton, thd, tx);
  tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
  if (ss_info) {
    mysql_bin_log_unlock_commits(ss_info);
  }
#endif
  return HA_EXIT_SUCCESS;
}
4942 
#ifdef MARIADB_NOT_YET
/*
  Start a read-only transaction that either creates a new shared (explicit)
  snapshot or attaches to an existing one. Entirely compiled out under
  MariaDB for now. Returns HA_EXIT_SUCCESS / HA_EXIT_FAILURE.
*/
static int rocksdb_start_tx_with_shared_read_view(
    handlerton *const hton,    /*!< in: RocksDB handlerton */
    THD *const thd)            /*!< in: MySQL thread handle of the
                               user for whom the transaction should
                               be committed */
#ifdef MARIADB_NOT_YET
    snapshot_info_st *ss_info) /*!< out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  DBUG_ASSERT(thd != nullptr);

  int error = HA_EXIT_SUCCESS;

  /* Shared snapshots are only supported under REPEATABLE READ. */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);
  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

  Rdb_transaction *tx = nullptr;
#ifdef MARIADB_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> explicit_snapshot;
  const auto op = ss_info->op;

  /* Only CREATE and ATTACH are valid here; RELEASE goes elsewhere. */
  DBUG_ASSERT(op == snapshot_operation::SNAPSHOT_CREATE ||
              op == snapshot_operation::SNAPSHOT_ATTACH);

  // case: if binlogs are available get binlog file/pos and gtid info
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_lock_commits(ss_info);
  }

  if (op == snapshot_operation::SNAPSHOT_ATTACH) {
    explicit_snapshot = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
    if (!explicit_snapshot) {
      my_printf_error(ER_UNKNOWN_ERROR, "Snapshot %llu does not exist", MYF(0),
                      ss_info->snapshot_id);
      error = HA_EXIT_FAILURE;
    }
  }
#endif

  // case: all good till now
  if (error == HA_EXIT_SUCCESS) {
    tx = get_or_create_tx(thd);
    Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

#ifdef MARIADB_NOT_YET
    if (explicit_snapshot) {
      tx->m_explicit_snapshot = explicit_snapshot;
    }
#endif

    /* Mark the tx read-only, register it, and pin its read snapshot now. */
    DBUG_ASSERT(!tx->has_snapshot());
    tx->set_tx_read_only(true);
    rocksdb_register_tx(hton, thd, tx);
    tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
    // case: an explicit snapshot was not assigned to this transaction
    if (!tx->m_explicit_snapshot) {
      tx->m_explicit_snapshot =
          Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot);
      if (!tx->m_explicit_snapshot) {
        my_printf_error(ER_UNKNOWN_ERROR, "Could not create snapshot", MYF(0));
        error = HA_EXIT_FAILURE;
      }
    }
#endif
  }

#ifdef MARIADB_NOT_YET
  // case: unlock the binlog
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_unlock_commits(ss_info);
  }

  DBUG_ASSERT(error == HA_EXIT_FAILURE || tx->m_explicit_snapshot);

  // copy over the snapshot details to pass to the upper layers
  if (tx->m_explicit_snapshot) {
    *ss_info = tx->m_explicit_snapshot->ss_info;
    ss_info->op = op;
  }
#endif

  return error;
}
#endif
5034 
/* Dummy SAVEPOINT support. This is needed for long running transactions
 * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
 * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
 * errors. This needs to be addressed in future versions (Issue#96).
 */
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
                             void *const savepoint) {
  /* Intentionally a no-op; see the comment above this function. */
  return HA_EXIT_SUCCESS;
}
5044 
5045 static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
5046                                          void *const savepoint) {
5047   Rdb_transaction *tx = get_tx_from_thd(thd);
5048   return tx->rollback_to_savepoint(savepoint);
5049 }
5050 
static bool rocksdb_rollback_to_savepoint_can_release_mdl(
    handlerton *const /* hton */, THD *const /* thd */) {
  /* MyRocks unconditionally permits releasing metadata locks after a
     rollback to savepoint. */
  return true;
}
5055 
#ifdef MARIAROCKS_NOT_YET
/*
  This is called for INFORMATION_SCHEMA
*/
/*
  Walk all open MyRocks tables and report per-table I/O, lock-wait and
  deadlock statistics through the supplied callback. Returns void, so
  internal failures can only be surfaced via SHIP_ASSERT.
*/
static void rocksdb_update_table_stats(
    /* per-table stats callback */
    void (*cb)(const char *db, const char *tbl, bool is_partition,
               my_io_perf_t *r, my_io_perf_t *w, my_io_perf_t *r_blob,
               my_io_perf_t *r_primary, my_io_perf_t *r_secondary,
               page_stats_t *page_stats, comp_stats_t *comp_stats,
               int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock,
               const char *engine)) {
  /* NOTE(review): io_perf_read is deliberately not memset below; every
     field read by the callback appears to be assigned in the loop, but
     confirm my_io_perf_t has no additional fields before relying on it. */
  my_io_perf_t io_perf_read;
  my_io_perf_t io_perf_write;
  my_io_perf_t io_perf;
  page_stats_t page_stats;
  comp_stats_t comp_stats;
  uint lock_wait_timeout_stats;
  uint deadlock_stats;
  uint lock_wait_stats;
  std::vector<std::string> tablenames;

  /*
    Most of these are for innodb, so setting them to 0.
    TODO: possibly separate out primary vs. secondary index reads
   */
  memset(&io_perf, 0, sizeof(io_perf));
  memset(&page_stats, 0, sizeof(page_stats));
  memset(&comp_stats, 0, sizeof(comp_stats));
  memset(&io_perf_write, 0, sizeof(io_perf_write));

  tablenames = rdb_open_tables.get_table_names();

  for (const auto &it : tablenames) {
    Rdb_table_handler *table_handler;
    std::string str, dbname, tablename, partname;
    char dbname_sys[NAME_LEN + 1];
    char tablename_sys[NAME_LEN + 1];
    bool is_partition;

    if (rdb_normalize_tablename(it, &str) != HA_EXIT_SUCCESS) {
      /* Function needs to return void because of the interface and we've
       * detected an error which shouldn't happen. There's no way to let
       * caller know that something failed.
       */
      SHIP_ASSERT(false);
      return;
    }

    if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname)) {
      continue;
    }

    is_partition = (partname.size() != 0);

    /* Acquire the table handler; released below once stats are copied. */
    table_handler = rdb_open_tables.get_table_handler(it.c_str());
    if (table_handler == nullptr) {
      continue;
    }

    io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load();
    io_perf_read.requests = table_handler->m_io_perf_read.requests.load();
    io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load();
    io_perf_write.requests = table_handler->m_io_perf_write.requests.load();
    lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load();
    deadlock_stats = table_handler->m_deadlock_counter.load();
    lock_wait_stats =
        table_handler->m_table_perf_context.m_value[PC_KEY_LOCK_WAIT_COUNT]
            .load();

    /*
      Convert from rocksdb timer to mysql timer. RocksDB values are
      in nanoseconds, but table statistics expect the value to be
      in my_timer format.
     */
    io_perf_read.svc_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time.load() / 1000);
    io_perf_read.svc_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time_max.load() / 1000);
    io_perf_read.wait_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time.load() / 1000);
    io_perf_read.wait_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time_max.load() / 1000);
    io_perf_read.slow_ios = table_handler->m_io_perf_read.slow_ios.load();
    rdb_open_tables.release_table_handler(table_handler);

    /*
      Table stats expects our database and table name to be in system encoding,
      not filename format. Convert before calling callback.
     */
    my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
                                   sizeof(dbname_sys));
    my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
                                   sizeof(tablename_sys));
    (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read,
          &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats,
          &comp_stats, lock_wait_stats, lock_wait_timeout_stats, deadlock_stats,
          rocksdb_hton_name);
  }
}
#endif
5157 static rocksdb::Status check_rocksdb_options_compatibility(
5158     const char *const dbpath, const rocksdb::Options &main_opts,
5159     const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
5160   DBUG_ASSERT(rocksdb_datadir != nullptr);
5161 
5162   rocksdb::DBOptions loaded_db_opt;
5163   std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
5164   rocksdb::Status status =
5165       LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt,
5166                         &loaded_cf_descs, rocksdb_ignore_unknown_options);
5167 
5168   // If we're starting from scratch and there are no options saved yet then this
5169   // is a valid case. Therefore we can't compare the current set of options to
5170   // anything.
5171   if (status.IsNotFound()) {
5172     return rocksdb::Status::OK();
5173   }
5174 
5175   if (!status.ok()) {
5176     return status;
5177   }
5178 
5179   if (loaded_cf_descs.size() != cf_descr.size()) {
5180     return rocksdb::Status::NotSupported(
5181         "Mismatched size of column family "
5182         "descriptors.");
5183   }
5184 
5185   // Please see RocksDB documentation for more context about why we need to set
5186   // user-defined functions and pointer-typed options manually.
5187   for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
5188     loaded_cf_descs[i].options.compaction_filter =
5189         cf_descr[i].options.compaction_filter;
5190     loaded_cf_descs[i].options.compaction_filter_factory =
5191         cf_descr[i].options.compaction_filter_factory;
5192     loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
5193     loaded_cf_descs[i].options.memtable_factory =
5194         cf_descr[i].options.memtable_factory;
5195     loaded_cf_descs[i].options.merge_operator =
5196         cf_descr[i].options.merge_operator;
5197     loaded_cf_descs[i].options.prefix_extractor =
5198         cf_descr[i].options.prefix_extractor;
5199     loaded_cf_descs[i].options.table_factory =
5200         cf_descr[i].options.table_factory;
5201   }
5202 
5203   // This is the essence of the function - determine if it's safe to open the
5204   // database or not.
5205   status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
5206                                      loaded_cf_descs,
5207                                      rocksdb_ignore_unknown_options);
5208 
5209   return status;
5210 }
5211 
/*
  Set to true by rocksdb_done_func() when the plugin is unloaded.
  rocksdb_init_func() checks it and refuses to initialize again, because
  re-loading MyRocks within the same server process is not supported;
  a mysqld restart is required.
*/
bool prevent_myrocks_loading= false;
5213 
5214 
5215 /*
5216   Storage Engine initialization function, invoked when plugin is loaded.
5217 */
5218 
static int rocksdb_init_func(void *const p) {

  DBUG_ENTER_FUNC();

  // Re-loading after an unload is not supported; see rocksdb_done_func().
  if (prevent_myrocks_loading)
  {
    my_error(ER_INTERNAL_ERROR, MYF(0),
             "Loading MyRocks plugin after it has been unloaded is not "
             "supported. Please restart mysqld");
    DBUG_RETURN(1);
  }

  if (rocksdb_ignore_datadic_errors)
  {
    sql_print_information(
        "CAUTION: Running with rocksdb_ignore_datadic_errors=1. "
        " This should only be used to perform repairs");
  }

  // If a corruption marker file was left by a previous run, refuse to start
  // unless the user explicitly opted in via
  // rocksdb_allow_to_start_after_corruption.
  if (rdb_check_rocksdb_corruption()) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: There was a corruption detected in RockDB files. "
        "Check error log emitted earlier for more details.");
    if (rocksdb_allow_to_start_after_corruption) {
      // NO_LINT_DEBUG
      sql_print_information(
          "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent "
          "server operating if RocksDB corruption is detected.");
    } else {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: The server will exit normally and stop restart "
          "attempts. Remove %s file from data directory and "
          "start mysqld manually.",
          rdb_corruption_marker_file_name().c_str());
      // Exit with status 0 so that supervisors do not keep restarting us.
      exit(0);
    }
  }

  // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
  static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");

  init_rocksdb_psi_keys();

  rocksdb_hton = (handlerton *)p;

  // Open-tables map must exist before any handler is created; the cleanup
  // guard frees it again on any early-failure return below.
  rdb_open_tables.init();
  Ensure_cleanup rdb_open_tables_cleanup([]() { rdb_open_tables.free(); });

  // Initialize the background / drop-index / manual-compaction thread
  // control structures (threads themselves are started further down).
#ifdef HAVE_PSI_INTERFACE
  rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
  rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
                           rdb_signal_drop_idx_psi_cond_key);
  rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key);
#else
  rdb_bg_thread.init();
  rdb_drop_idx_thread.init();
  rdb_mc_thread.init();
#endif
  mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
                   MY_MUTEX_INIT_FAST);
  mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
                   MY_MUTEX_INIT_FAST);

  // Hide the RocksDB datadir from SHOW DATABASES etc.; strip a leading "./"
  // so the path matches how ignore_db_dirs stores entries.
  const char* initial_rocksdb_datadir_for_ignore_dirs= rocksdb_datadir;
  if (!strncmp(rocksdb_datadir, "./", 2))
    initial_rocksdb_datadir_for_ignore_dirs += 2;
  ignore_db_dirs_append(initial_rocksdb_datadir_for_ignore_dirs);

#if defined(HAVE_PSI_INTERFACE)
  rdb_collation_exceptions =
      new Regex_list_handler(key_rwlock_collation_exception_list);
#else
  rdb_collation_exceptions = new Regex_list_handler();
#endif

  mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
                   MY_MUTEX_INIT_FAST);
  mysql_mutex_init(rdb_block_cache_resize_mutex_key,
                   &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST);
  Rdb_transaction::init_mutex();

  // Wire up the handlerton entry points the SQL layer calls into.
  rocksdb_hton->create = rocksdb_create_handler;
  rocksdb_hton->close_connection = rocksdb_close_connection;

  rocksdb_hton->prepare = rocksdb_prepare;
  rocksdb_hton->prepare_ordered = NULL; // Do not need it

  rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
  rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
  rocksdb_hton->recover = rocksdb_recover;

  rocksdb_hton->commit_ordered= rocksdb_commit_ordered;
  rocksdb_hton->commit = rocksdb_commit;

  rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request;

  rocksdb_hton->rollback = rocksdb_rollback;
  rocksdb_hton->show_status = rocksdb_show_status;
#ifdef MARIADB_NOT_YET
  rocksdb_hton->explicit_snapshot = rocksdb_explicit_snapshot;
#endif
  rocksdb_hton->start_consistent_snapshot =
      rocksdb_start_tx_and_assign_read_view;
#ifdef MARIADB_NOT_YET
  rocksdb_hton->start_shared_snapshot = rocksdb_start_tx_with_shared_read_view;
#endif
  rocksdb_hton->savepoint_set = rocksdb_savepoint;
  rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
  rocksdb_hton->savepoint_rollback_can_release_mdl =
      rocksdb_rollback_to_savepoint_can_release_mdl;
#ifdef MARIAROCKS_NOT_YET
  rocksdb_hton->update_table_stats = rocksdb_update_table_stats;
#endif // MARIAROCKS_NOT_YET

  /*
  Not needed in MariaDB:
  rocksdb_hton->flush_logs = rocksdb_flush_wal;
  rocksdb_hton->handle_single_table_select = rocksdb_handle_single_table_select;

  */

  rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
                        HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;

  rocksdb_hton->tablefile_extensions= ha_rocksdb_exts;
  DBUG_ASSERT(!mysqld_embedded);

  // Clamp max_open_files to the server-wide limit; -2 is the sysvar's
  // "auto" sentinel and also resolves to open_files_limit / 2.
  if (rocksdb_db_options->max_open_files > (long)open_files_limit) {
    // NO_LINT_DEBUG
    sql_print_information(
        "RocksDB: rocksdb_max_open_files should not be "
        "greater than the open_files_limit, effective value "
        "of rocksdb_max_open_files is being set to "
        "open_files_limit / 2.");
    rocksdb_db_options->max_open_files = open_files_limit / 2;
  } else if (rocksdb_db_options->max_open_files == -2) {
    rocksdb_db_options->max_open_files = open_files_limit / 2;
  }

#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
  rdb_read_free_regex_handler.set_patterns(DEFAULT_READ_FREE_RPL_TABLES);
#endif

  // Statistics object shared between the sysvar and the DB options. Note
  // the effective stats level is read back since RocksDB may adjust it.
  rocksdb_stats = rocksdb::CreateDBStatistics();
  rocksdb_stats->set_stats_level(
      static_cast<rocksdb::StatsLevel>(rocksdb_stats_level));
  rocksdb_stats_level = rocksdb_stats->get_stats_level();
  rocksdb_db_options->statistics = rocksdb_stats;

  if (rocksdb_rate_limiter_bytes_per_sec != 0) {
    rocksdb_rate_limiter.reset(
        rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
    rocksdb_db_options->rate_limiter = rocksdb_rate_limiter;
  }

  rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate;

  // Route RocksDB's info log through our Rdb_logger wrapper; if RocksDB can
  // build its own logger for this datadir, chain it behind ours.
  std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
  rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
      rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log);
  if (s.ok()) {
    myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log);
  }

  rocksdb_db_options->info_log = myrocks_logger;
  myrocks_logger->SetInfoLogLevel(
      static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  rocksdb_db_options->wal_dir = rocksdb_wal_dir;

  rocksdb_db_options->wal_recovery_mode =
      static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);

  rocksdb_db_options->access_hint_on_compaction_start =
      static_cast<rocksdb::Options::AccessHint>(
          rocksdb_access_hint_on_compaction_start);

  // Reject option combinations RocksDB itself would refuse to open with.
  if (rocksdb_db_options->allow_mmap_reads &&
      rocksdb_db_options->use_direct_reads) {
    // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
    // mmap_reads and direct_reads are both on.   (NO_LINT_DEBUG)
    sql_print_error(
        "RocksDB: Can't enable both use_direct_reads "
        "and allow_mmap_reads\n");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT
  // by opening a probe file named DIRECT_CHECK with direct I/O enabled.
  if (rocksdb_db_options->use_direct_reads ||
      rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
    rocksdb::EnvOptions soptions;
    rocksdb::Status check_status;
    rocksdb::Env *const env = rocksdb_db_options->env;

    std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir);
    if (env->FileExists(fname).ok()) {
      std::unique_ptr<rocksdb::SequentialFile> file;
      soptions.use_direct_reads = true;
      check_status = env->NewSequentialFile(fname, &file, soptions);
    } else {
      std::unique_ptr<rocksdb::WritableFile> file;
      soptions.use_direct_writes = true;
      check_status = env->ReopenWritableFile(fname, &file, soptions);
      if (file != nullptr) {
        file->Close();
      }
      env->DeleteFile(fname);
    }

    if (!check_status.ok()) {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: Unable to use direct io in rocksdb-datadir:"
          "(%s)",
          check_status.getState());
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  if (rocksdb_db_options->allow_mmap_writes &&
      rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
    // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
    sql_print_error(
        "RocksDB: Can't enable both "
        "use_direct_io_for_flush_and_compaction and "
        "allow_mmap_writes\n");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (rocksdb_db_options->allow_mmap_writes &&
      rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 "
        "to use allow_mmap_writes");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // sst_file_manager will move deleted rocksdb sst files to trash_dir
  // to be deleted in a background thread.
  std::string trash_dir = std::string(rocksdb_datadir) + "/trash";
  rocksdb_db_options->sst_file_manager.reset(NewSstFileManager(
      rocksdb_db_options->env, myrocks_logger, trash_dir,
      rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */));

  // Enumerate the column families present in an existing database (empty
  // datadir is detected below and treated as "create new database").
  std::vector<std::string> cf_names;
  rocksdb::Status status;
  status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir,
                                           &cf_names);
  if (!status.ok()) {
    /*
      When we start on an empty datadir, ListColumnFamilies returns IOError,
      and RocksDB doesn't provide any way to check what kind of error it was.
      Checking system errno happens to work right now.
    */
    if (status.IsIOError()
#ifndef _WIN32
      && errno == ENOENT
#endif
      ) {
      sql_print_information("RocksDB: Got ENOENT when listing column families");

      // NO_LINT_DEBUG
      sql_print_information(
          "RocksDB:   assuming that we're creating a new database");
    } else {
      rdb_log_status_error(status, "Error listing column families");
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  } else {
    // NO_LINT_DEBUG
    sql_print_information("RocksDB: %ld column families found",
                          cf_names.size());
  }

  std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
  std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;

  rocksdb_tbl_options->index_type =
      (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;

  // Set up the block cache: optionally backed by a jemalloc no-dump
  // allocator (so cache pages are excluded from core dumps), optionally
  // wrapped in a simulated cache for sizing experiments.
  if (!rocksdb_tbl_options->no_block_cache) {
    std::shared_ptr<rocksdb::MemoryAllocator> memory_allocator;
    if (!rocksdb_cache_dump) {
      size_t block_size = rocksdb_tbl_options->block_size;
      rocksdb::JemallocAllocatorOptions alloc_opt;
      // Limit jemalloc tcache memory usage. The range
      // [block_size/4, block_size] should be enough to cover most of
      // block cache allocation sizes.
      alloc_opt.limit_tcache_size = true;
      alloc_opt.tcache_size_lower_bound = block_size / 4;
      alloc_opt.tcache_size_upper_bound = block_size;
      rocksdb::Status new_alloc_status =
          rocksdb::NewJemallocNodumpAllocator(alloc_opt, &memory_allocator);
      if (!new_alloc_status.ok()) {
        // Fallback to use default malloc/free.
        rdb_log_status_error(new_alloc_status,
                             "Error excluding block cache from core dump");
        memory_allocator = nullptr;
        DBUG_RETURN(HA_EXIT_FAILURE);
      }
    }
    std::shared_ptr<rocksdb::Cache> block_cache =
        rocksdb_use_clock_cache
            ? rocksdb::NewClockCache(rocksdb_block_cache_size)
            : rocksdb::NewLRUCache(
                  rocksdb_block_cache_size, -1 /*num_shard_bits*/,
                  false /*strict_capcity_limit*/,
                  rocksdb_cache_high_pri_pool_ratio, memory_allocator);
    if (rocksdb_sim_cache_size > 0) {
      // Simulated cache enabled
      // Wrap block cache inside a simulated cache and pass it to RocksDB
      rocksdb_tbl_options->block_cache =
          rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6);
    } else {
      // Pass block cache to RocksDB
      rocksdb_tbl_options->block_cache = block_cache;
    }
  }
  // Using newer BlockBasedTable format version for better compression
  // and better memory allocation.
  // See:
  // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
  rocksdb_tbl_options->format_version = 2;

  if (rocksdb_collect_sst_properties) {
    properties_collector_factory =
        std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);

    rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);

    RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

    DBUG_ASSERT(rocksdb_table_stats_sampling_pct <=
                RDB_TBL_STATS_SAMPLE_PCT_MAX);
    properties_collector_factory->SetTableStatsSamplingPct(
        rocksdb_table_stats_sampling_pct);

    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
  }

  // Persistent (secondary, on-disk) cache. Specifying a path without a
  // size is treated as a configuration error.
  if (rocksdb_persistent_cache_size_mb > 0) {
    std::shared_ptr<rocksdb::PersistentCache> pcache;
    uint64_t cache_size_bytes = rocksdb_persistent_cache_size_mb * 1024 * 1024;
    status = rocksdb::NewPersistentCache(
        rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
        cache_size_bytes, myrocks_logger, true, &pcache);
    if (!status.ok()) {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Persistent cache returned error: (%s)",
                      status.getState());
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
    rocksdb_tbl_options->persistent_cache = pcache;
  } else if (strlen(rocksdb_persistent_cache_path)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Build per-column-family options from the default and override sysvars.
  std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options());
  if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory,
                            rocksdb_default_cf_options,
                            rocksdb_override_cf_options)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize CF options map.");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  /*
    If there are no column families, we're creating the new database.
    Create one column family named "default".
  */
  if (cf_names.size() == 0) cf_names.push_back(DEFAULT_CF_NAME);

  std::vector<int> compaction_enabled_cf_indices;

  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Column Families at start:");
  for (size_t i = 0; i < cf_names.size(); ++i) {
    rocksdb::ColumnFamilyOptions opts;
    cf_options_map->get_cf_options(cf_names[i], &opts);

    // NO_LINT_DEBUG
    sql_print_information("  cf=%s", cf_names[i].c_str());

    // NO_LINT_DEBUG
    sql_print_information("    write_buffer_size=%ld", opts.write_buffer_size);

    // NO_LINT_DEBUG
    sql_print_information("    target_file_size_base=%" PRIu64,
                          opts.target_file_size_base);

    /*
      Temporarily disable compactions to prevent a race condition where
      compaction starts before compaction filter is ready.
    */
    if (!opts.disable_auto_compactions) {
      compaction_enabled_cf_indices.push_back(i);
      opts.disable_auto_compactions = true;
    }
    cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
  }

  rocksdb::Options main_opts(*rocksdb_db_options,
                             cf_options_map->get_defaults());

  rocksdb::TransactionDBOptions tx_db_options;
  tx_db_options.transaction_lock_timeout = 2000;  // 2 seconds
  tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
  tx_db_options.write_policy =
      static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);

  status =
      check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);

  // We won't start if we'll determine that there's a chance of data corruption
  // because of incompatible options.
  if (!status.ok()) {
    rdb_log_status_error(
        status, "Compatibility check against existing database options failed");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Open the database itself (as a TransactionDB).
  status = rocksdb::TransactionDB::Open(
      main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);

  if (!status.ok()) {
    rdb_log_status_error(status, "Error opening instance");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }
  cf_manager.init(std::move(cf_options_map), &cf_handles);

  // Bring up the metadata managers, in dependency order: data dictionary,
  // then binlog position manager, then the DDL (table/index) manager.
  if (dict_manager.init(rdb, &cf_manager)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize data dictionary.");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (binlog_manager.init(&dict_manager)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize binlog manager.");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize DDL manager.");

    if (rocksdb_ignore_datadic_errors)
    {
      sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
                      "trying to continue");
    }
    else
      DBUG_RETURN(HA_EXIT_FAILURE);
  }

  Rdb_sst_info::init(rdb);

  /*
    Enable auto compaction, things needed for compaction filter are finished
    initializing
  */
  std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
  for (const auto &index : compaction_enabled_cf_indices) {
    compaction_enabled_cf_handles.push_back(cf_handles[index]);
  }

  status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);

  if (!status.ok()) {
    rdb_log_status_error(status, "Error enabling compaction");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Start the three MyRocks service threads: background stats/flush thread,
  // drop-index thread, and manual compaction thread.
#ifndef HAVE_PSI_INTERFACE
  auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME);
#else
  auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME,
                                         rdb_background_psi_thread_key);
#endif
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
                    err);
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

#ifndef HAVE_PSI_INTERFACE
  err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME);
#else
  err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME,
                                          rdb_drop_idx_psi_thread_key);
#endif
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
                    err);
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME
#ifdef HAVE_PSI_INTERFACE
                                    ,
                                    rdb_mc_psi_thread_key
#endif
  );
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't start the manual compaction thread: (errno=%d)",
        err);
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);

  if (rocksdb_pause_background_work) {
    rdb->PauseBackgroundWork();
  }

  // NO_LINT_DEBUG
  sql_print_information("RocksDB: global statistics using %s indexer",
                        STRINGIFY_ARG(RDB_INDEXER));
#if defined(HAVE_SCHED_GETCPU)
  if (sched_getcpu() == -1) {
    // NO_LINT_DEBUG
    sql_print_information(
        "RocksDB: sched_getcpu() failed - "
        "global statistics will use thread_id_indexer_t instead");
  }
#endif

  // Register MyRocks' error message range with the server.
  err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST,
                          HA_ERR_ROCKSDB_LAST);
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't initialize error messages");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }



  // Creating an instance of HistogramImpl should only happen after RocksDB
  // has been successfully initialized.
  commit_latency_stats = new rocksdb::HistogramImpl();

  // Construct a list of directories which will be monitored by I/O watchdog
  // to make sure that we won't lose write access to them.
  std::vector<std::string> directories;

  // 1. Data directory.
  directories.push_back(mysql_real_data_home);

  // 2. Transaction logs.
  if (myrocks::rocksdb_wal_dir && *myrocks::rocksdb_wal_dir) {
    directories.push_back(myrocks::rocksdb_wal_dir);
  }

#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog = new Rdb_io_watchdog(std::move(directories));
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif

  // NO_LINT_DEBUG
  sql_print_information(
      "MyRocks storage engine plugin has been successfully "
      "initialized.");

  // Skip cleaning up rdb_open_tables as we've succeeded
  rdb_open_tables_cleanup.skip();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
5796 
5797 /*
5798   Storage Engine deinitialization function, invoked when plugin is unloaded.
5799 */
5800 
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  // Returned to the plugin framework: non-zero if open tables remain.
  int error = 0;

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
  }

  // signal the manual compaction thread to stop
  rdb_mc_thread.signal(true);
  // Wait for the manual compaction thread to finish.
  err = rdb_mc_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err);
  }

  if (rdb_open_tables.count()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind.
    error = 1;
  }

  rdb_open_tables.free();
  /*
    destructors for static objects can be called at _exit(),
    but we want to free the memory at dlclose()
  */
  // MARIADB_MERGE_2019:  rdb_open_tables.m_hash.~Rdb_table_set();
  mysql_mutex_destroy(&rdb_sysvars_mutex);
  mysql_mutex_destroy(&rdb_block_cache_resize_mutex);


  delete rdb_collation_exceptions;

  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  // Release the cached per-collation data built during normal operation.
  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  // Tear down managers in reverse order of their initialization in
  // rocksdb_init_func().
  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

#if !defined(_WIN32) && !defined(__APPLE__)
  delete io_watchdog;
  io_watchdog = nullptr;
#endif

// Disown the cache data since we're shutting down.
// This results in memory leaks but it improved the shutdown time.
// Don't disown when running under valgrind
#ifndef HAVE_valgrind
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif /* HAVE_valgrind */

  /*
    MariaDB: don't clear rocksdb_db_options and rocksdb_tbl_options.
    MyRocks' plugin variables refer to them.

    The plugin cannot be loaded again (see prevent_myrocks_loading) but plugin
    variables are processed before myrocks::rocksdb_init_func is invoked, so
    they must point to valid memory.
  */
  //rocksdb_db_options = nullptr;
  rocksdb_db_options->statistics = nullptr;
  //rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_free(rocksdb_update_cf_options);
  rocksdb_update_cf_options = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  /*
    Prevent loading the plugin after it has been loaded and then unloaded. This
    doesn't work currently.
  */
  prevent_myrocks_loading= true;

  DBUG_RETURN(error);
}
5928 
5929 static inline void rocksdb_smart_seek(bool seek_backward,
5930                                       rocksdb::Iterator *const iter,
5931                                       const rocksdb::Slice &key_slice) {
5932   if (seek_backward) {
5933     iter->SeekForPrev(key_slice);
5934   } else {
5935     iter->Seek(key_slice);
5936   }
5937 }
5938 
5939 static inline void rocksdb_smart_next(bool seek_backward,
5940                                       rocksdb::Iterator *const iter) {
5941   if (seek_backward) {
5942     iter->Prev();
5943   } else {
5944     iter->Next();
5945   }
5946 }
5947 
#ifndef DBUG_OFF
// Debug-only hook: overwrite *status with a Corruption status so tests can
// simulate RocksDB reporting corrupted data (used by the
// "rocksdb_return_status_corrupted" injection point in is_valid()).
static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
  *status = rocksdb::Status::Corruption();
}
#endif
5954 
5955 // If the iterator is not valid it might be because of EOF but might be due
5956 // to IOError or corruption. The good practice is always check it.
5957 // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
5958 static inline bool is_valid(rocksdb::Iterator *scan_it) {
5959   if (scan_it->Valid()) {
5960     return true;
5961   } else {
5962     rocksdb::Status s = scan_it->status();
5963     DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
5964                     dbug_change_status_to_corrupted(&s););
5965     if (s.IsIOError() || s.IsCorruption()) {
5966       if (s.IsCorruption()) {
5967         rdb_persist_corruption_marker();
5968       }
5969       rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
5970     }
5971     return false;
5972   }
5973 }
5974 
5975 /**
5976   @brief
5977   Example of simple lock controls. The "table_handler" it creates is a
5978   structure we will pass to each ha_rocksdb handler. Do you have to have
5979   one of these? Well, you have pieces that are used for locking, and
5980   they are needed to function.
5981 */
5982 
5983 Rdb_table_handler *Rdb_open_tables_map::get_table_handler(
5984     const char *const table_name) {
5985   DBUG_ASSERT(table_name != nullptr);
5986 
5987   Rdb_table_handler *table_handler;
5988 
5989   std::string table_name_str(table_name);
5990 
5991   // First, look up the table in the hash map.
5992   RDB_MUTEX_LOCK_CHECK(m_mutex);
5993   const auto it = m_table_map.find(table_name_str);
5994   if (it != m_table_map.end()) {
5995     // Found it
5996     table_handler = it->second;
5997   } else {
5998     char *tmp_name;
5999 
6000     // Since we did not find it in the hash map, attempt to create and add it
6001     // to the hash map.
6002     if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
6003               PSI_INSTRUMENT_ME,
6004               MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
6005               &tmp_name, table_name_str.length() + 1, NullS)))) {
6006       // Allocating a new Rdb_table_handler and a new table name failed.
6007       RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6008       return nullptr;
6009     }
6010 
6011     table_handler->m_ref_count = 0;
6012     table_handler->m_table_name_length = table_name_str.length();
6013     table_handler->m_table_name = tmp_name;
6014     strmov(table_handler->m_table_name, table_name);
6015 
6016     m_table_map.emplace(table_name_str, table_handler);
6017 
6018     thr_lock_init(&table_handler->m_thr_lock);
6019 #ifdef MARIAROCKS_NOT_YET
6020     table_handler->m_io_perf_read.init();
6021     table_handler->m_io_perf_write.init();
6022 #endif
6023   }
6024   DBUG_ASSERT(table_handler->m_ref_count >= 0);
6025   table_handler->m_ref_count++;
6026 
6027   RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6028 
6029   return table_handler;
6030 }
6031 
// Free function wrapper: return the names of all tables currently present in
// the global open-tables map.
std::vector<std::string> rdb_get_open_table_names(void) {
  return rdb_open_tables.get_table_names();
}
6035 
6036 std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
6037   const Rdb_table_handler *table_handler;
6038   std::vector<std::string> names;
6039 
6040   RDB_MUTEX_LOCK_CHECK(m_mutex);
6041   for (const auto &kv : m_table_map) {
6042     table_handler = kv.second;
6043     DBUG_ASSERT(table_handler != nullptr);
6044     names.push_back(table_handler->m_table_name);
6045   }
6046   RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6047 
6048   return names;
6049 }
6050 
6051 /*
6052   Inspired by innobase_get_int_col_max_value from InnoDB. This returns the
6053   maximum value a type can take on.
6054 */
6055 static ulonglong rdb_get_int_col_max_value(const Field *field) {
6056   ulonglong max_value = 0;
6057   switch (field->key_type()) {
6058     case HA_KEYTYPE_BINARY:
6059       max_value = 0xFFULL;
6060       break;
6061     case HA_KEYTYPE_INT8:
6062       max_value = 0x7FULL;
6063       break;
6064     case HA_KEYTYPE_USHORT_INT:
6065       max_value = 0xFFFFULL;
6066       break;
6067     case HA_KEYTYPE_SHORT_INT:
6068       max_value = 0x7FFFULL;
6069       break;
6070     case HA_KEYTYPE_UINT24:
6071       max_value = 0xFFFFFFULL;
6072       break;
6073     case HA_KEYTYPE_INT24:
6074       max_value = 0x7FFFFFULL;
6075       break;
6076     case HA_KEYTYPE_ULONG_INT:
6077       max_value = 0xFFFFFFFFULL;
6078       break;
6079     case HA_KEYTYPE_LONG_INT:
6080       max_value = 0x7FFFFFFFULL;
6081       break;
6082     case HA_KEYTYPE_ULONGLONG:
6083       max_value = 0xFFFFFFFFFFFFFFFFULL;
6084       break;
6085     case HA_KEYTYPE_LONGLONG:
6086       max_value = 0x7FFFFFFFFFFFFFFFULL;
6087       break;
6088     case HA_KEYTYPE_FLOAT:
6089       max_value = 0x1000000ULL;
6090       break;
6091     case HA_KEYTYPE_DOUBLE:
6092       max_value = 0x20000000000000ULL;
6093       break;
6094     default:
6095       abort();
6096   }
6097 
6098   return max_value;
6099 }
6100 
/*
  Initialize m_tbl_def->m_auto_incr_val for this table: first from the data
  dictionary, then (if that yields nothing, or always in debug builds) from
  the last value actually stored in the index, and finally fall back to 1.
*/
void ha_rocksdb::load_auto_incr_value() {
  ulonglong auto_incr = 0;
  bool validate_last = false, use_datadic = true;
#ifndef DBUG_OFF
  // Debug builds always cross-check the dictionary value against the index.
  DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;);
  validate_last = true;
#endif

  if (use_datadic && dict_manager.get_auto_incr_val(
                         m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
    update_auto_incr_val(auto_incr);
  }

  // If we find nothing in the data dictionary, or if we are in debug mode,
  // then call index_last to get the last value.
  //
  // This is needed when upgrading from a server that did not support
  // persistent auto_increment, of if the table is empty.
  //
  // For debug mode, we are just verifying that the data dictionary value is
  // greater than or equal to the maximum value in the table.
  if (auto_incr == 0 || validate_last) {
    auto_incr = load_auto_incr_value_from_index();
    update_auto_incr_val(auto_incr);
  }

  // If we failed to find anything from the data dictionary and index, then
  // initialize auto_increment to 1.
  if (m_tbl_def->m_auto_incr_val == 0) {
    update_auto_incr_val(1);
  }
}
6133 
/*
  Read the next auto-increment value directly from the index: do an
  index-only lookup of the last row in the auto-increment index and return
  its value + 1 (saturating at the column type's maximum). Returns 0 if the
  index is empty. Saves and restores active_index/table->status and the
  handler's keyread state around the lookup.
*/
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    // Acquire a snapshot just for this lookup; released again below.
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_converter->set_is_key_requested(true);

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    // Temporarily mark all columns readable so val_int() is allowed.
    MY_BITMAP *const old_map = dbug_tmp_use_all_columns(table, &table->read_set);
    last_val = field->val_int();
    if (last_val != max_val) {
      last_val++;
    }
#ifndef DBUG_OFF
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      // Sanity check: the persisted dictionary value must never be behind
      // the value found in the index (unless this tx already advanced it).
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        DBUG_ASSERT(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(&table->read_set, old_map);
  }

  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
    */
  release_scan_iterator();

  return last_val;
}
6192 
/*
  Raise the in-memory auto-increment counter to at least `val`. Uses a
  compare-exchange loop so concurrent writers can only move the counter
  forward, never backwards.
*/
void ha_rocksdb::update_auto_incr_val(ulonglong val) {
  ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
  while (
      auto_incr_val < val &&
      !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
    // Do nothing - just loop until auto_incr_val is >= val or we successfully
    // set it
  }
}
6202 
/*
  After a row write, advance the auto-increment counter past the value that
  was actually stored in the auto-increment column: persist value+1 in the
  transaction's dictionary state and in m_tbl_def. Does nothing if value+1
  would overflow the column type.
*/
void ha_rocksdb::update_auto_incr_val_from_field() {
  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Temporarily mark all columns readable so val_int() is allowed.
  MY_BITMAP *const old_map =
      dbug_tmp_use_all_columns(table, &table->read_set);
  new_val = field->val_int();
  // don't increment if we would wrap around
  if (new_val != max_val) {
    new_val++;
  }

  dbug_tmp_restore_column_map(&table->read_set, old_map);

  // Only update if positive value was set for auto_incr column.
  if (new_val <= max_val) {
    Rdb_transaction *const tx = get_or_create_tx(table->in_use);
    tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val);

    // Update the in memory auto_incr value in m_tbl_def.
    update_auto_incr_val(new_val);
  }
}
6228 
/*
  Initialize m_tbl_def->m_hidden_pk_val from the table contents: read the
  last row of the hidden-PK index, decode its id and set the counter to
  id + 1 (or 1 for an empty table). Returns HA_EXIT_SUCCESS or an error
  from decoding the row key.
*/
int ha_rocksdb::load_hidden_pk_value() {
  const int save_active_index = active_index;
  // The hidden PK is always the last key in m_key_descr_arr.
  active_index = m_tbl_def->m_key_count - 1;
  const uint8 save_table_status = table->status;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();

  longlong hidden_pk_id = 1;
  // Do a lookup.
  if (!index_last(table->record[0])) {
    /*
      Decode PK field from the key
    */
    auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      // Release the snapshot we may have implicitly created before bailing.
      if (is_new_snapshot) {
        tx->release_snapshot();
      }
      return err;
    }

    hidden_pk_id++;
  }

  // CAS loop: only ever move the counter forward, even under concurrency.
  longlong old = m_tbl_def->m_hidden_pk_val;
  while (old < hidden_pk_id &&
         !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
  }

  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  // Same cleanup as index_end() (see load_auto_incr_value_from_index()).
  release_scan_iterator();

  return HA_EXIT_SUCCESS;
}
6270 
6271 /* Get PK value from m_tbl_def->m_hidden_pk_info. */
6272 longlong ha_rocksdb::update_hidden_pk_val() {
6273   DBUG_ASSERT(has_hidden_pk(table));
6274   const longlong new_val = m_tbl_def->m_hidden_pk_val++;
6275   return new_val;
6276 }
6277 
6278 /* Get the id of the hidden pk id from m_last_rowkey */
/*
  Decode the hidden-PK id out of m_last_rowkey: skip the 4-byte index
  number, then read the 8-byte big-endian id that follows. Returns
  HA_EXIT_SUCCESS, or HA_ERR_ROCKSDB_CORRUPT_DATA if the key is too short.
*/
int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
  DBUG_ASSERT(table != nullptr);
  DBUG_ASSERT(has_hidden_pk(table));

  rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());

  // Get hidden primary key from old key slice
  Rdb_string_reader reader(&rowkey_slice);
  if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) {
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  const int length= 8; /* was Field_longlong::PACK_LENGTH in FB MySQL tree */
  const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
  if (from == nullptr) {
    /* Mem-comparable image doesn't have enough bytes */
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  // Hidden-PK ids are stored in network (big-endian) byte order.
  *hidden_pk_id = rdb_netbuf_read_uint64(&from);
  return HA_EXIT_SUCCESS;
}
6301 
6302 /**
6303   @brief
6304   Free lock controls. We call this whenever we close a table. If the table had
6305   the last reference to the table_handler, then we free the memory associated
6306   with it.
6307 */
6308 
/*
  Drop one reference on a shared table handler (counterpart of
  get_table_handler()). When the last reference goes away, the map entry,
  the THR_LOCK and the single allocation holding the handler + name are all
  torn down.
*/
void Rdb_open_tables_map::release_table_handler(
    Rdb_table_handler *const table_handler) {
  RDB_MUTEX_LOCK_CHECK(m_mutex);

  DBUG_ASSERT(table_handler != nullptr);
  DBUG_ASSERT(table_handler->m_ref_count > 0);
  if (!--table_handler->m_ref_count) {
    // Last reference was released. Tear down the hash entry.
    const auto ret MY_ATTRIBUTE((__unused__)) =
        m_table_map.erase(std::string(table_handler->m_table_name));
    DBUG_ASSERT(ret == 1);  // the hash entry must actually be found and deleted
    my_core::thr_lock_delete(&table_handler->m_thr_lock);
    // Frees both the struct and the name buffer (one my_multi_malloc block).
    my_free(table_handler);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);
}
6326 
// Handlerton factory: construct a new ha_rocksdb on the caller-supplied
// MEM_ROOT (placement new; freed with the MEM_ROOT, not via delete).
static handler *rocksdb_create_handler(my_core::handlerton *const hton,
                                       my_core::TABLE_SHARE *const table_arg,
                                       my_core::MEM_ROOT *const mem_root) {
  return new (mem_root) ha_rocksdb(hton, table_arg);
}
6332 
/*
  Construct a handler instance for one table. All buffer and iterator
  pointers start out null; they are allocated lazily in open() /
  alloc_key_buffers(). (Initializer order should mirror the member
  declaration order in ha_rocksdb.h.)
*/
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg),
      m_table_handler(nullptr),
      m_scan_it(nullptr),
      m_scan_it_skips_bloom(false),
      m_scan_it_snapshot(nullptr),
      m_scan_it_lower_bound(nullptr),
      m_scan_it_upper_bound(nullptr),
      m_tbl_def(nullptr),
      m_pk_descr(nullptr),
      m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false),
      m_pk_tuple(nullptr),
      m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr),
      m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr),
      m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr),
      m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr),
      m_pack_buffer(nullptr),
      m_record_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE),
      m_keyread_only(false),
      m_insert_with_update(false),
      m_dup_pk_found(false),
      m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false),
      m_force_skip_unique_check(false) {}
6364 
6365 
// Return the table's base name (without the database part) as stored in the
// MyRocks table definition. Only valid after m_tbl_def is set in open().
const std::string &ha_rocksdb::get_table_basename() const {
  return m_tbl_def->base_tablename();
}
6369 
6370 /**
6371   @return
6372     false  OK
    other  Error unpacking the data
6374 */
bool ha_rocksdb::init_with_fields() {
  DBUG_ENTER_FUNC();

  const uint pk = table_share->primary_key;
  if (pk != MAX_KEY) {
    const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
    // NOTE(review): called for its side effect — presumably it updates
    // m_pk_can_be_decoded (the else-branch sets it directly); confirm in
    // check_keyread_allowed().
    check_keyread_allowed(pk /*PK*/, key_parts - 1, true);
  } else {
    // No user-defined PK: a hidden PK will be used and cannot be decoded.
    m_pk_can_be_decoded = false;
  }
  // Cache the flags now that PK decodability is known.
  cached_table_flags = table_flags();

  DBUG_RETURN(false); /* Ok */
}
6389 
6390 /*
6391   If the key is a TTL key, we may need to filter it out.
6392 
6393   The purpose of read filtering for tables with TTL is to ensure that
6394   during a transaction a key which has expired already but not removed by
6395   compaction yet is not returned to the user.
6396 
6397   Without this the user might be hit with problems such as disappearing
6398   rows within a transaction, etc, because the compaction filter ignores
6399   snapshots when filtering keys.
6400 */
/*
  Decide whether a TTL record must be hidden from the reader: returns true
  when the record's timestamp plus the index's TTL duration has passed
  curr_ts (the snapshot / current time). Returns false — never filters —
  when curr_ts is 0, when TTL read filtering is disabled, or when the TTL
  bytes cannot be decoded (the latter is logged as an error).
*/
bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
                                     const rocksdb::Slice &ttl_rec_val,
                                     const int64_t curr_ts) {
  DBUG_ASSERT(kd.has_ttl());
  DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);

  /*
    Curr_ts can only be 0 if there are no snapshots open.
    should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
    we are filtering on the write path (single INSERT/UPDATE) in which case
    we are passed in the current time as curr_ts.

    In the event curr_ts is 0, we always decide not to filter the record. We
    also log a warning and increment a diagnostic counter.
  */
  if (curr_ts == 0) {
    update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
    return false;
  }

  if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
    return false;
  }

  Rdb_string_reader reader(&ttl_rec_val);

  /*
    Find where the 8-byte ttl is for each record in this index.
  */
  uint64 ts;
  if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
    /*
      This condition should never be reached since all TTL records have an
      8 byte ttl field in front. Don't filter the record out, and log an error.
    */
    std::string buf;
    buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
                      RDB_MAX_HEXDUMP_LEN);
    const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
    // NO_LINT_DEBUG
    sql_print_error(
        "Decoding ttl from PK value failed, "
        "for index (%u,%u), val: %s",
        gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
    DBUG_ASSERT(0);
    return false;
  }

  /* Hide record if it has expired before the current snapshot time. */
  uint64 read_filter_ts = 0;
#ifndef DBUG_OFF
  // Debug builds can shift the filtering time via a sysvar for testing.
  read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
#endif
  bool is_hide_ttl =
      ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
  if (is_hide_ttl) {
    update_row_stats(ROWS_FILTERED);

    /* increment examined row count when rows are skipped */
    THD *thd = ha_thd();
    thd->inc_examined_row_count(1);
    DEBUG_SYNC(thd, "rocksdb.ttl_rows_examined");
  }
  return is_hide_ttl;
}
6466 
/*
  Advance the iterator past any TTL-expired records (in the direction given
  by seek_backward). No-op for indexes without TTL. Returns
  HA_ERR_QUERY_INTERRUPTED if the query is killed while skipping, otherwise
  HA_EXIT_SUCCESS; the iterator may be invalid on return if all remaining
  records were expired.
*/
int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
                                             rocksdb::Iterator *const iter,
                                             bool seek_backward) {
  if (kd.has_ttl()) {
    THD *thd = ha_thd();
    while (iter->Valid() &&
           should_hide_ttl_rec(
               kd, iter->value(),
               get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
      DEBUG_SYNC(thd, "rocksdb.check_flags_ser");
      // Honor KILL between skips so a fully-expired range cannot hang a query.
      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }
      rocksdb_smart_next(seek_backward, iter);
    }
  }
  return HA_EXIT_SUCCESS;
}
6485 
6486 #ifndef DBUG_OFF
6487 void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
6488   std::string str(on_disk_rec->data(), on_disk_rec->size());
6489   on_disk_rec->Reset();
6490   str.append("abc");
6491   on_disk_rec->PinSelf(rocksdb::Slice(str));
6492 }
6493 
// Debug fault injection: truncate the record to zero length (removing a
// suffix of size() bytes leaves nothing).
void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) {
  on_disk_rec->remove_suffix(on_disk_rec->size());
}
6497 
// Debug fault injection: replace the record with a NULL byte followed by a
// VARCHAR whose length byte claims 12 bytes, to exercise decode-mismatch
// handling.
void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
  std::string res;
  // The record is NULL-byte followed by VARCHAR(10).
  // Put the NULL-byte
  res.append("\0", 1);
  // Then, add a valid VARCHAR(12) value.
  res.append("\xC", 1);
  // NOTE(review): the literal has 11 characters, so the 12th appended byte is
  // the terminating '\0' — presumably intentional padding for this test
  // payload; confirm against the decoder's expectations.
  res.append("123456789ab", 12);

  on_disk_rec->Reset();
  on_disk_rec->PinSelf(rocksdb::Slice(res));
}
6510 
// Debug fault injection: raise a deliberate error to simulate an inplace
// ALTER TABLE failure.
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred.", MYF(0));
}
6515 #endif
6516 
/*
  Convenience overload that decodes this->m_retrieved_record. The
  DBUG_EXECUTE_IF hooks let debug builds corrupt the record first, so tests
  can exercise the corruption/checksum error paths of the decoder.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
6528 
6529 /*
6530   @brief
6531   Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
6532   storage format into buf (which can be table->record[0] or table->record[1]).
6533 
6534   @param  key   Table record's key in mem-comparable form.
6535   @param  buf   Store record in table->record[0] format here
6536 
6537   @detail
6538     If the table has blobs, the unpacked data in buf may keep pointers to the
6539     data in this->m_retrieved_record.
6540 
6541     The key is only needed to check its checksum value (the checksum is in
6542     m_retrieved_record).
6543 
6544   @seealso
6545     rdb_converter::setup_read_decoders()  Sets up data structures which tell
6546   which columns to decode.
6547 
6548   @return
6549     0      OK
    other  Error unpacking the data
6551 */
6552 
// Delegate record decoding (storage format -> table->record format) to the
// converter; see the comment block above for the full contract.
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, const rocksdb::Slice *const value,
    uchar *const buf) {
  return m_converter->decode(m_pk_descr, buf, key, value);
}
6558 
6559 int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
6560                                   const Rdb_tbl_def *const tbl_def_arg,
6561                                   bool alloc_alter_buffers) {
6562   DBUG_ENTER_FUNC();
6563 
6564   DBUG_ASSERT(m_pk_tuple == nullptr);
6565 
6566   std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;
6567 
6568   uint key_len = 0;
6569   uint max_packed_sk_len = 0;
6570   uint pack_key_len = 0;
6571   uint record_len = table->s->reclength + table->s->null_bytes;
6572 
6573   m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
6574   if (has_hidden_pk(table_arg)) {
6575     m_pk_key_parts = 1;
6576   } else {
6577     m_pk_key_parts =
6578         table->key_info[table->s->primary_key].user_defined_key_parts;
6579     key_len = table->key_info[table->s->primary_key].key_length;
6580   }
6581 
6582   // move this into get_table_handler() ??
6583   m_pk_descr->setup(table_arg, tbl_def_arg);
6584 
6585   m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, key_len, MYF(0)));
6586 
6587   pack_key_len = m_pk_descr->max_storage_fmt_length();
6588   m_pk_packed_tuple =
6589       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, pack_key_len, MYF(0)));
6590 
6591   /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
6592   max_packed_sk_len = pack_key_len;
6593   for (uint i = 0; i < table_arg->s->keys; i++) {
6594     /* Primary key was processed above */
6595     if (i == table_arg->s->primary_key) continue;
6596 
6597     // TODO: move this into get_table_handler() ??
6598     kd_arr[i]->setup(table_arg, tbl_def_arg);
6599 
6600     const uint packed_len = kd_arr[i]->max_storage_fmt_length();
6601     if (packed_len > max_packed_sk_len) {
6602       max_packed_sk_len = packed_len;
6603     }
6604   }
6605 
6606   m_sk_packed_tuple =
6607       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6608   m_sk_match_prefix_buf =
6609       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6610   m_sk_packed_tuple_old =
6611       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6612   m_end_key_packed_tuple =
6613       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6614   m_pack_buffer =
6615       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6616   m_record_buffer =
6617       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, record_len, MYF(0)));
6618 
6619   m_scan_it_lower_bound =
6620       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6621   m_scan_it_upper_bound =
6622       reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6623 
6624   /*
6625     If inplace alter is happening, allocate special buffers for unique
6626     secondary index duplicate checking.
6627   */
6628   if (alloc_alter_buffers) {
6629     m_dup_sk_packed_tuple =
6630         reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6631     m_dup_sk_packed_tuple_old =
6632         reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
6633   }
6634 
6635   if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr ||
6636       m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr ||
6637       m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr ||
6638       m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr ||
6639       m_record_buffer == nullptr ||
6640       (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr ||
6641                                m_dup_sk_packed_tuple_old == nullptr))) {
6642     // One or more of the above allocations failed.  Clean up and exit
6643     free_key_buffers();
6644 
6645     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6646   }
6647 
6648   DBUG_RETURN(HA_EXIT_SUCCESS);
6649 }
6650 
6651 void ha_rocksdb::free_key_buffers() {
6652   my_free(m_pk_tuple);
6653   m_pk_tuple = nullptr;
6654 
6655   my_free(m_pk_packed_tuple);
6656   m_pk_packed_tuple = nullptr;
6657 
6658   my_free(m_sk_packed_tuple);
6659   m_sk_packed_tuple = nullptr;
6660 
6661   my_free(m_sk_match_prefix_buf);
6662   m_sk_match_prefix_buf = nullptr;
6663 
6664   my_free(m_sk_packed_tuple_old);
6665   m_sk_packed_tuple_old = nullptr;
6666 
6667   my_free(m_end_key_packed_tuple);
6668   m_end_key_packed_tuple = nullptr;
6669 
6670   my_free(m_pack_buffer);
6671   m_pack_buffer = nullptr;
6672 
6673   my_free(m_record_buffer);
6674   m_record_buffer = nullptr;
6675 
6676   my_free(m_dup_sk_packed_tuple);
6677   m_dup_sk_packed_tuple = nullptr;
6678 
6679   my_free(m_dup_sk_packed_tuple_old);
6680   m_dup_sk_packed_tuple_old = nullptr;
6681 
6682   my_free(m_scan_it_lower_bound);
6683   m_scan_it_lower_bound = nullptr;
6684 
6685   my_free(m_scan_it_upper_bound);
6686   m_scan_it_upper_bound = nullptr;
6687 }
6688 
6689 #ifdef MARIAROCKS_NOT_YET
6690 #endif
/*
  Recompute m_skip_unique_check for this table: unique-key checking is
  skipped when the table's base name matches the whitelist regex (or the
  compiled-in default when the argument is null). Bad patterns produce a
  warning but do not fail.
*/
void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) {
  const char *const wl =
      whitelist ? whitelist : DEFAULT_SKIP_UNIQUE_CHECK_TABLES;

#if defined(HAVE_PSI_INTERFACE)
  Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables);
#else
  Regex_list_handler regex_handler;
#endif

  if (!regex_handler.set_patterns(wl)) {
    warn_about_bad_patterns(&regex_handler, "skip_unique_check_tables");
  }

  m_skip_unique_check = regex_handler.matches(m_tbl_def->base_tablename());
}
6707 
6708 /**
6709   @return
6710     HA_EXIT_SUCCESS  OK
6711     other            HA_ERR error code (can be SE-specific)
6712 */
6713 int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
6714   DBUG_ENTER_FUNC();
6715 
6716   int err = close();
6717   if (err) {
6718     DBUG_RETURN(err);
6719   }
6720 
6721   m_table_handler = rdb_open_tables.get_table_handler(name);
6722 
6723   if (m_table_handler == nullptr) {
6724     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6725   }
6726 
6727   my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
6728                               nullptr);
6729   m_io_perf.init(&m_table_handler->m_table_perf_context,
6730                  &m_table_handler->m_io_perf_read,
6731                  &m_table_handler->m_io_perf_write, &stats);
6732   Rdb_perf_context_guard guard(&m_io_perf,
6733                                rocksdb_perf_context_level(ha_thd()));
6734 
6735   std::string fullname;
6736   err = rdb_normalize_tablename(name, &fullname);
6737   if (err != HA_EXIT_SUCCESS) {
6738     DBUG_RETURN(err);
6739   }
6740 
6741   m_tbl_def = ddl_manager.find(fullname);
6742   if (m_tbl_def == nullptr) {
6743     my_error(ER_INTERNAL_ERROR, MYF(0),
6744              "Attempt to open a table that is not present in RocksDB-SE data "
6745              "dictionary");
6746     DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6747   }
6748   if (m_tbl_def->m_key_count != table->s->keys + has_hidden_pk(table)? 1:0)
6749   {
6750     sql_print_error("MyRocks: DDL mismatch: .frm file has %u indexes, "
6751                     "MyRocks has %u (%s hidden pk)",
6752                     table->s->keys, m_tbl_def->m_key_count,
6753                     has_hidden_pk(table)? "1" : "no");
6754 
6755     if (rocksdb_ignore_datadic_errors)
6756     {
6757       sql_print_error("MyRocks: rocksdb_ignore_datadic_errors=1, "
6758                       "trying to continue");
6759     }
6760     else
6761     {
6762       my_error(ER_INTERNAL_ERROR, MYF(0),
6763                "MyRocks: DDL mismatch. Check the error log for details");
6764       DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6765     }
6766   }
6767 
6768 
6769   m_lock_rows = RDB_LOCK_NONE;
6770   m_key_descr_arr = m_tbl_def->m_key_descr_arr;
6771 
6772   /*
6773     Full table scan actually uses primary key
6774     (UPDATE needs to know this, otherwise it will go into infinite loop on
6775     queries like "UPDATE tbl SET pk=pk+100")
6776   */
6777   key_used_on_scan = table->s->primary_key;
6778 
6779   // close() above has already called free_key_buffers(). No need to do it here.
6780   err = alloc_key_buffers(table, m_tbl_def);
6781 
6782   if (err) {
6783     DBUG_RETURN(err);
6784   }
6785 
6786   /*
6787     init_with_fields() is used to initialize table flags based on the field
6788     definitions in table->field[].
6789     It is called by open_binary_frm(), but that function calls the method for
6790     a temporary ha_rocksdb object which is later destroyed.
6791 
6792     If we are here in ::open(), then init_with_fields() has not been called
6793     for this object. Call it ourselves, we want all member variables to be
6794     properly initialized.
6795   */
6796   init_with_fields();
6797 
6798   /* Initialize decoder */
6799   m_converter = std::make_shared<Rdb_converter>(ha_thd(), m_tbl_def, table);
6800 
6801   /*
6802      Update m_ttl_bytes address to same as Rdb_converter's m_ttl_bytes.
6803      Remove this code after moving convert_record_to_storage_format() into
6804      Rdb_converter class.
6805   */
6806   m_ttl_bytes = m_converter->get_ttl_bytes_buffer();
6807 
6808   /*
6809     MariaDB: adjust field->part_of_key for PK columns. We can only do it here
6810     because SE API is just relying on the HA_PRIMARY_KEY_IN_READ_INDEX which
6811     does not allow to distinguish between unpack'able and non-unpack'able
6812     columns.
6813     Upstream uses handler->init_with_fields() but we don't have that call.
6814   */
6815   {
6816     if (!has_hidden_pk(table)) {
6817       KEY *const pk_info = &table->key_info[table->s->primary_key];
6818       for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
6819         if (!m_pk_descr->can_unpack(kp)) {
6820           //
6821           uint field_index= pk_info->key_part[kp].field->field_index;
6822           table->field[field_index]->part_of_key.clear_all();
6823           table->field[field_index]->part_of_key.set_bit(table->s->primary_key);
6824         }
6825       }
6826     }
6827 
6828     for (uint key= 0; key < table->s->keys; key++) {
6829       KEY *const key_info = &table->key_info[key];
6830       if (key ==  table->s->primary_key)
6831         continue;
6832       for (uint kp = 0; kp < key_info->usable_key_parts; kp++) {
6833         uint field_index= key_info->key_part[kp].field->field_index;
6834         if (m_key_descr_arr[key]->can_unpack(kp)) {
6835           table->field[field_index]->part_of_key.set_bit(key);
6836         } else {
6837           table->field[field_index]->part_of_key.clear_bit(key);
6838         }
6839       }
6840     }
6841   }
6842 
6843   info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
6844 
6845   /*
6846     The following load_XXX code calls row decode functions, and they do
6847     that without having done ::external_lock() or index_init()/rnd_init().
6848     (Note: this also means we're doing a read when there was no
6849     rdb_converter::setup_field_encoders() call)
6850 
6851     Initialize the necessary variables for them:
6852   */
6853 
6854   /* Load auto_increment value only once on first use. */
6855   if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
6856     load_auto_incr_value();
6857   }
6858 
6859   /* Load hidden pk only once on first use. */
6860   if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
6861       (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
6862     free_key_buffers();
6863     DBUG_RETURN(err);
6864   }
6865 
6866   /* Index block size in MyRocks: used by MySQL in query optimization */
6867   stats.block_size = rocksdb_tbl_options->block_size;
6868 
6869 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
6870 #endif
6871   /* Determine at open whether we should skip unique checks for this table */
6872   set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));
6873 
6874   DBUG_RETURN(HA_EXIT_SUCCESS);
6875 }
6876 
/*
  Release everything open() acquired: key descriptors, converter, key
  buffers, the shared table-handler reference, and the reusable string
  buffers. Safe to call on a handler that was never fully opened (all
  operations tolerate null/empty state).
*/
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;
  m_converter = nullptr;
  free_key_buffers();

  if (m_table_handler != nullptr) {
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_last_rowkey.free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6898 
/*
  Error strings for the engine-specific HA_ERR_ROCKSDB_* codes. The order
  must match the numeric order of the codes (index = code -
  HA_ERR_ROCKSDB_FIRST); the static_assert below checks only the count.
*/
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY.",
    "Specifying DATA DIRECTORY for an individual table is not supported.",
    "Specifying INDEX DIRECTORY for an individual table is not supported.",
    "RocksDB commit failed.",
    "Failure during bulk load operation.",
    "Found data corruption.",
    "CRC checksum mismatch.",
    "Invalid table.",
    "Could not access RocksDB properties.",
    "File I/O error during merge/sort operation.",
    "RocksDB status: not found.",
    "RocksDB status: corruption.",
    "RocksDB status: invalid argument.",
    "RocksDB status: io error.",
    "RocksDB status: no space.",
    "RocksDB status: merge in progress.",
    "RocksDB status: incomplete.",
    "RocksDB status: shutdown in progress.",
    "RocksDB status: timed out.",
    "RocksDB status: aborted.",
    "RocksDB status: lock limit reached.",
    "RocksDB status: busy.",
    "RocksDB status: deadlock.",
    "RocksDB status: expired.",
    "RocksDB status: try again.",
};

static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes");
6930 
6931 //psergey-merge: do we need this in MariaDB: we have get_error_messages
6932 //below...
6933 #if 0
6934 static const char *rdb_get_error_message(int nr) {
6935   return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST];
6936 }
6937 #endif
6938 
6939 static const char **rdb_get_error_messages(int nr) { return rdb_error_messages; }
6940 
6941 bool ha_rocksdb::get_error_message(const int error, String *const buf) {
6942   DBUG_ENTER_FUNC();
6943 
6944   static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
6945                 "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
6946   static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
6947                 "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");
6948 
6949   if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK ||
6950       error == HA_ERR_ROCKSDB_STATUS_BUSY) {
6951     Rdb_transaction *const tx = get_tx_from_thd(ha_thd());
6952     DBUG_ASSERT(tx != nullptr);
6953     buf->append(tx->m_detailed_error);
6954     DBUG_RETURN(true);
6955   }
6956 
6957   if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
6958     buf->append(rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST]);
6959   }
6960 
6961   // We can be called with the values which are < HA_ERR_FIRST because most
6962   // MySQL internal functions will just return HA_EXIT_FAILURE in case of
6963   // an error.
6964 
6965   DBUG_RETURN(false);
6966 }
6967 
6968 /*
6969   Generalized way to convert RocksDB status errors into MySQL error code, and
6970   print error message.
6971 
6972   Each error code below maps to a RocksDB status code found in:
6973   rocksdb/include/rocksdb/status.h
6974 */
6975 int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s,
6976                                    const char *opt_msg) {
6977   DBUG_ASSERT(!s.ok());
6978 
6979   int err;
6980   switch (s.code()) {
6981     case rocksdb::Status::Code::kOk:
6982       err = HA_EXIT_SUCCESS;
6983       break;
6984     case rocksdb::Status::Code::kNotFound:
6985       err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND;
6986       break;
6987     case rocksdb::Status::Code::kCorruption:
6988       err = HA_ERR_ROCKSDB_STATUS_CORRUPTION;
6989       break;
6990     case rocksdb::Status::Code::kNotSupported:
6991       err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED;
6992       break;
6993     case rocksdb::Status::Code::kInvalidArgument:
6994       err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT;
6995       break;
6996     case rocksdb::Status::Code::kIOError:
6997       err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE
6998                             : HA_ERR_ROCKSDB_STATUS_IO_ERROR;
6999       break;
7000     case rocksdb::Status::Code::kMergeInProgress:
7001       err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS;
7002       break;
7003     case rocksdb::Status::Code::kIncomplete:
7004       err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE;
7005       break;
7006     case rocksdb::Status::Code::kShutdownInProgress:
7007       err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS;
7008       break;
7009     case rocksdb::Status::Code::kTimedOut:
7010       err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT;
7011       break;
7012     case rocksdb::Status::Code::kAborted:
7013       err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT
7014                               : HA_ERR_ROCKSDB_STATUS_ABORTED;
7015       break;
7016     case rocksdb::Status::Code::kBusy:
7017       err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK
7018                              : HA_ERR_ROCKSDB_STATUS_BUSY;
7019       break;
7020     case rocksdb::Status::Code::kExpired:
7021       err = HA_ERR_ROCKSDB_STATUS_EXPIRED;
7022       break;
7023     case rocksdb::Status::Code::kTryAgain:
7024       err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN;
7025       break;
7026     default:
7027       DBUG_ASSERT(0);
7028       return -1;
7029   }
7030 
7031   std::string errMsg;
7032   if (s.IsLockLimit()) {
7033     errMsg =
7034         "Operation aborted: Failed to acquire lock due to "
7035         "rocksdb_max_row_locks limit";
7036   } else {
7037     errMsg = s.ToString();
7038   }
7039 
7040   if (opt_msg) {
7041     std::string concatenated_error = errMsg + " (" + std::string(opt_msg) + ")";
7042     my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(),
7043              rocksdb_hton_name);
7044   } else {
7045     my_error(ER_GET_ERRMSG, MYF(0), s.code(), errMsg.c_str(),
7046              rocksdb_hton_name);
7047   }
7048 
7049   return err;
7050 }
7051 
/*
  MyRocks supports only the following collations for indexed columns.
  Used by rdb_is_index_collation_supported() and
  rdb_field_uses_nopad_collation() below.
*/
static const std::set<uint> RDB_INDEX_COLLATIONS = {
    COLLATION_BINARY, COLLATION_UTF8_BIN, COLLATION_LATIN1_BIN};
7055 
7056 static bool rdb_is_index_collation_supported(
7057     const my_core::Field *const field) {
7058   const my_core::enum_field_types type = field->real_type();
7059   /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7060   if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7061       type == MYSQL_TYPE_BLOB)  {
7062 
7063     return (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7064             RDB_INDEX_COLLATIONS.end()) ||
7065             rdb_is_collation_supported(field->charset());
7066   }
7067   return true;
7068 }
7069 
7070 
7071 static bool
7072 rdb_field_uses_nopad_collation(const my_core::Field *const field) {
7073   const my_core::enum_field_types type = field->real_type();
7074   /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7075   if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7076       type == MYSQL_TYPE_BLOB) {
7077 
7078     /*
7079       This is technically a NOPAD collation but it's a binary collation
7080       that we can handle.
7081     */
7082     if (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7083            RDB_INDEX_COLLATIONS.end())
7084       return false;
7085 
7086     return (field->charset()->state & MY_CS_NOPAD);
7087   }
7088   return false;
7089 }
7090 
7091 
7092 /*
7093   Create structures needed for storing data in rocksdb. This is called when the
7094   table is created. The structures will be shared by all TABLE* objects.
7095 
7096   @param
7097     table_arg        Table with definition
7098     db_table         "dbname.tablename"
7099     len              strlen of the above
7100     tbl_def_arg      tbl_def whose key_descr is being created/populated
7101     old_tbl_def_arg  tbl_def from which keys are being copied over from
7102                      (for use during inplace alter)
7103 
7104   @return
7105     0      - Ok
7106     other  - error, either given table ddl is not supported by rocksdb or OOM.
7107 */
7108 int ha_rocksdb::create_key_defs(
7109     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
7110     const TABLE *const old_table_arg /* = nullptr */,
7111     const Rdb_tbl_def *const old_tbl_def_arg
7112     /* = nullptr */) const {
7113   DBUG_ENTER_FUNC();
7114 
7115   DBUG_ASSERT(table_arg->s != nullptr);
7116 
7117   /*
7118     These need to be one greater than MAX_INDEXES since the user can create
7119     MAX_INDEXES secondary keys and no primary key which would cause us
7120     to generate a hidden one.
7121   */
7122   std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;
7123 
7124   /*
7125     NOTE: All new column families must be created before new index numbers are
7126     allocated to each key definition. See below for more details.
7127     http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
7128   */
7129   if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
7130     DBUG_RETURN(HA_EXIT_FAILURE);
7131   }
7132 
7133   uint64 ttl_duration = 0;
7134   std::string ttl_column;
7135   uint ttl_field_offset;
7136 
7137   uint err;
7138   if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg,
7139                                                &ttl_duration))) {
7140     DBUG_RETURN(err);
7141   }
7142 
7143   if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column,
7144                                           &ttl_field_offset))) {
7145     DBUG_RETURN(err);
7146   }
7147 
7148   /* We don't currently support TTL on tables with hidden primary keys. */
7149   if (ttl_duration > 0 && has_hidden_pk(table_arg)) {
7150     my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0));
7151     DBUG_RETURN(HA_EXIT_FAILURE);
7152   }
7153 
7154   /*
7155     If TTL duration is not specified but TTL column was specified, throw an
7156     error because TTL column requires duration.
7157   */
7158   if (ttl_duration == 0 && !ttl_column.empty()) {
7159     my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str());
7160     DBUG_RETURN(HA_EXIT_FAILURE);
7161   }
7162 
7163   if (!old_tbl_def_arg) {
7164     /*
7165       old_tbl_def doesn't exist. this means we are in the process of creating
7166       a new table.
7167 
7168       Get the index numbers (this will update the next_index_number)
7169       and create Rdb_key_def structures.
7170     */
7171     for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
7172       if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], cfs[i],
7173                          ttl_duration, ttl_column)) {
7174         DBUG_RETURN(HA_EXIT_FAILURE);
7175       }
7176     }
7177   } else {
7178     /*
7179       old_tbl_def exists.  This means we are creating a new tbl_def as part of
7180       in-place alter table.  Copy over existing keys from the old_tbl_def and
7181       generate the necessary new key definitions if any.
7182     */
7183     if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
7184                                 old_tbl_def_arg, cfs, ttl_duration,
7185                                 ttl_column)) {
7186       DBUG_RETURN(HA_EXIT_FAILURE);
7187     }
7188   }
7189 
7190   DBUG_RETURN(HA_EXIT_SUCCESS);
7191 }
7192 
7193 /*
7194   Checks index parameters and creates column families needed for storing data
7195   in rocksdb if necessary.
7196 
7197   @param in
7198     table_arg     Table with definition
7199     db_table      Table name
7200     tbl_def_arg   Table def structure being populated
7201 
7202   @param out
7203     cfs           CF info for each key definition in 'key_info' order
7204 
7205   @return
7206     0      - Ok
7207     other  - error
7208 */
7209 int ha_rocksdb::create_cfs(
7210     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
7211     std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
7212   DBUG_ENTER_FUNC();
7213 
7214   DBUG_ASSERT(table_arg->s != nullptr);
7215 
7216   char tablename_sys[NAME_LEN + 1];
7217   bool tsys_set= false;
7218 
7219   /*
7220     The first loop checks the index parameters and creates
7221     column families if necessary.
7222   */
7223   for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
7224     rocksdb::ColumnFamilyHandle *cf_handle;
7225 
7226     if (!is_hidden_pk(i, table_arg, tbl_def_arg) &&
7227         tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
7228       if (!tsys_set)
7229       {
7230         tsys_set= true;
7231         my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
7232                                    tablename_sys, sizeof(tablename_sys));
7233       }
7234 
7235       for (uint part = 0; part < table_arg->key_info[i].ext_key_parts;
7236            part++)
7237       {
7238         /* MariaDB: disallow NOPAD collations */
7239         if (rdb_field_uses_nopad_collation(
7240               table_arg->key_info[i].key_part[part].field))
7241         {
7242           my_error(ER_MYROCKS_CANT_NOPAD_COLLATION, MYF(0));
7243           DBUG_RETURN(HA_EXIT_FAILURE);
7244         }
7245 
7246         if (rocksdb_strict_collation_check &&
7247             !rdb_is_index_collation_supported(
7248                 table_arg->key_info[i].key_part[part].field) &&
7249             !rdb_collation_exceptions->matches(tablename_sys)) {
7250 
7251           char buf[1024];
7252           my_snprintf(buf, sizeof(buf),
7253                       "Indexed column %s.%s uses a collation that does not "
7254                       "allow index-only access in secondary key and has "
7255                       "reduced disk space efficiency in primary key.",
7256                        tbl_def_arg->full_tablename().c_str(),
7257                        table_arg->key_info[i].key_part[part].field->field_name.str);
7258 
7259           my_error(ER_INTERNAL_ERROR, MYF(ME_WARNING), buf);
7260         }
7261       }
7262     }
7263 
7264     // Internal consistency check to make sure that data in TABLE and
7265     // Rdb_tbl_def structures matches. Either both are missing or both are
7266     // specified. Yes, this is critical enough to make it into SHIP_ASSERT.
7267     SHIP_ASSERT(IF_PARTITIONING(!table_arg->part_info,true) == tbl_def_arg->base_partition().empty());
7268 
7269     // Generate the name for the column family to use.
7270     bool per_part_match_found = false;
7271     std::string cf_name =
7272         generate_cf_name(i, table_arg, tbl_def_arg, &per_part_match_found);
7273 
7274     // Prevent create from using the system column family.
7275     if (cf_name == DEFAULT_SYSTEM_CF_NAME) {
7276       my_error(ER_WRONG_ARGUMENTS, MYF(0),
7277                "column family not valid for storing index data.");
7278       DBUG_RETURN(HA_EXIT_FAILURE);
7279     }
7280 
7281     // Here's how `get_or_create_cf` will use the input parameters:
7282     //
7283     // `cf_name` - will be used as a CF name.
7284     cf_handle = cf_manager.get_or_create_cf(rdb, cf_name);
7285 
7286     if (!cf_handle) {
7287       DBUG_RETURN(HA_EXIT_FAILURE);
7288     }
7289 
7290     auto &cf = (*cfs)[i];
7291 
7292     cf.cf_handle = cf_handle;
7293     cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str());
7294     cf.is_per_partition_cf = per_part_match_found;
7295   }
7296 
7297   DBUG_RETURN(HA_EXIT_SUCCESS);
7298 }
7299 
7300 /*
7301   Create key definition needed for storing data in rocksdb during ADD index
7302   inplace operations.
7303 
7304   @param in
7305     table_arg         Table with definition
7306     tbl_def_arg       New table def structure being populated
7307     old_tbl_def_arg   Old(current) table def structure
7308     cfs               Struct array which contains column family information
7309 
7310   @return
7311     0      - Ok
7312     other  - error, either given table ddl is not supported by rocksdb or OOM.
7313 */
7314 int ha_rocksdb::create_inplace_key_defs(
7315     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
7316     const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
7317     const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs,
7318     uint64 ttl_duration, const std::string &ttl_column) const {
7319   DBUG_ENTER_FUNC();
7320 
7321   std::shared_ptr<Rdb_key_def> *const old_key_descr =
7322       old_tbl_def_arg->m_key_descr_arr;
7323   std::shared_ptr<Rdb_key_def> *const new_key_descr =
7324       tbl_def_arg->m_key_descr_arr;
7325   const std::unordered_map<std::string, uint> old_key_pos =
7326       get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
7327                             old_tbl_def_arg);
7328 
7329   uint i;
7330   for (i = 0; i < tbl_def_arg->m_key_count; i++) {
7331     const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));
7332 
7333     if (it != old_key_pos.end()) {
7334       /*
7335         Found matching index in old table definition, so copy it over to the
7336         new one created.
7337       */
7338       const Rdb_key_def &okd = *old_key_descr[it->second];
7339 
7340       const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
7341       struct Rdb_index_info index_info;
7342       if (!dict_manager.get_index_info(gl_index_id, &index_info)) {
7343         // NO_LINT_DEBUG
7344         sql_print_error(
7345             "RocksDB: Could not get index information "
7346             "for Index Number (%u,%u), table %s",
7347             gl_index_id.cf_id, gl_index_id.index_id,
7348             old_tbl_def_arg->full_tablename().c_str());
7349         DBUG_RETURN(HA_EXIT_FAILURE);
7350       }
7351 
7352       uint32 ttl_rec_offset =
7353           Rdb_key_def::has_index_flag(index_info.m_index_flags,
7354                                       Rdb_key_def::TTL_FLAG)
7355               ? Rdb_key_def::calculate_index_flag_offset(
7356                     index_info.m_index_flags, Rdb_key_def::TTL_FLAG)
7357               : UINT_MAX;
7358 
7359       /*
7360         We can't use the copy constructor because we need to update the
7361         keynr within the pack_info for each field and the keyno of the keydef
7362         itself.
7363       */
7364       new_key_descr[i] = std::make_shared<Rdb_key_def>(
7365           okd.get_index_number(), i, okd.get_cf(),
7366           index_info.m_index_dict_version, index_info.m_index_type,
7367           index_info.m_kv_version, okd.m_is_reverse_cf,
7368           okd.m_is_per_partition_cf, okd.m_name.c_str(),
7369           dict_manager.get_stats(gl_index_id), index_info.m_index_flags,
7370           ttl_rec_offset, index_info.m_ttl_duration);
7371     } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
7372                               cfs[i], ttl_duration, ttl_column)) {
7373       DBUG_RETURN(HA_EXIT_FAILURE);
7374     }
7375 
7376     DBUG_ASSERT(new_key_descr[i] != nullptr);
7377     new_key_descr[i]->setup(table_arg, tbl_def_arg);
7378   }
7379 
7380   DBUG_RETURN(HA_EXIT_SUCCESS);
7381 }
7382 
/*
  Determine which keys of the old table definition can be carried over
  unchanged into the new definition during inplace ALTER.

  @param table_arg        New table with definition
  @param tbl_def_arg      New table def structure
  @param old_table_arg    Old(current) table with definition
  @param old_tbl_def_arg  Old(current) table def structure

  @return map from key name to its position inside old_tbl_def_arg, holding
          only keys that are identical (or differ solely by a
          unique -> non-unique change) between the two definitions
*/
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  /* Index all keys of the new definition by name for quick lookup. */
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    /* A hidden pk has no KEY entry to compare; it is always carried over. */
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name.str);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    old_key_pos[old_key->name.str] = i;
  }

  DBUG_RETURN(old_key_pos);
}
7448 
7449 /* Check to see if two keys are identical. */
7450 int ha_rocksdb::compare_keys(const KEY *const old_key,
7451                              const KEY *const new_key) const {
7452   DBUG_ENTER_FUNC();
7453 
7454   /* Check index name. */
7455   if (strcmp(old_key->name.str, new_key->name.str) != 0) {
7456     DBUG_RETURN(HA_EXIT_FAILURE);
7457   }
7458 
7459   /* If index algorithms are different then keys are different. */
7460   if (old_key->algorithm != new_key->algorithm) {
7461     DBUG_RETURN(HA_EXIT_FAILURE);
7462   }
7463 
7464   /* Check that the key is identical between old and new tables.  */
7465   if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) {
7466     DBUG_RETURN(HA_EXIT_FAILURE);
7467   }
7468 
7469   /* Check index comment. (for column family changes) */
7470   std::string old_comment(old_key->comment.str, old_key->comment.length);
7471   std::string new_comment(new_key->comment.str, new_key->comment.length);
7472   if (old_comment.compare(new_comment) != 0) {
7473     DBUG_RETURN(HA_EXIT_FAILURE);
7474   }
7475 
7476   DBUG_RETURN(HA_EXIT_SUCCESS);
7477 }
7478 
7479 /* Check two keys to ensure that key parts within keys match */
7480 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
7481                                   const KEY *const new_key) const {
7482   DBUG_ENTER_FUNC();
7483 
7484   /* Skip if key parts do not match, as it is a different key */
7485   if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
7486     DBUG_RETURN(HA_EXIT_FAILURE);
7487   }
7488 
7489   /* Check to see that key parts themselves match */
7490   for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
7491     if (strcmp(old_key->key_part[i].field->field_name.str,
7492                new_key->key_part[i].field->field_name.str) != 0) {
7493       DBUG_RETURN(HA_EXIT_FAILURE);
7494     }
7495 
7496     /* Check if prefix index key part length has changed */
7497     if (old_key->key_part[i].length != new_key->key_part[i].length) {
7498       DBUG_RETURN(HA_EXIT_FAILURE);
7499     }
7500   }
7501 
7502   DBUG_RETURN(HA_EXIT_SUCCESS);
7503 }
7504 
7505 /*
7506   Create key definition needed for storing data in rocksdb.
7507   This can be called either during CREATE table or doing ADD index operations.
7508 
7509   @param in
7510     table_arg     Table with definition
7511     i             Position of index being created inside table_arg->key_info
7512     tbl_def_arg   Table def structure being populated
7513     cf_info       Struct which contains column family information
7514 
7515   @param out
7516     new_key_def  Newly created index definition.
7517 
7518   @return
7519     0      - Ok
7520     other  - error, either given table ddl is not supported by rocksdb or OOM.
7521 */
7522 int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint i,
7523                                const Rdb_tbl_def *const tbl_def_arg,
7524                                std::shared_ptr<Rdb_key_def> *const new_key_def,
7525                                const struct key_def_cf_info &cf_info,
7526                                uint64 ttl_duration,
7527                                const std::string &ttl_column) const {
7528   DBUG_ENTER_FUNC();
7529 
7530   DBUG_ASSERT(*new_key_def == nullptr);
7531 
7532   const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
7533   const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
7534   uchar index_type;
7535   uint16_t kv_version;
7536 
7537   if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
7538     index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
7539     kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7540   } else if (i == table_arg->s->primary_key) {
7541     index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
7542     uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7543     kv_version = pk_latest_version;
7544   } else {
7545     index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
7546     uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
7547     kv_version = sk_latest_version;
7548   }
7549 
7550   // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as
7551   // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a
7552   // primary key or secondary key.
7553   DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT", {
7554     kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1;
7555   });
7556 
7557   DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", {
7558     if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) {
7559       kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2;
7560     }
7561   });
7562 
7563   uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0);
7564 
7565   uint32 ttl_rec_offset =
7566       Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG)
7567           ? Rdb_key_def::calculate_index_flag_offset(index_flags,
7568                                                      Rdb_key_def::TTL_FLAG)
7569           : UINT_MAX;
7570 
7571   const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
7572   *new_key_def = std::make_shared<Rdb_key_def>(
7573       index_id, i, cf_info.cf_handle, index_dict_version, index_type,
7574       kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name,
7575       Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration);
7576 
7577   if (!ttl_column.empty()) {
7578     (*new_key_def)->m_ttl_column = ttl_column;
7579   }
7580   // initialize key_def
7581   (*new_key_def)->setup(table_arg, tbl_def_arg);
7582   DBUG_RETURN(HA_EXIT_SUCCESS);
7583 }
7584 
7585 int rdb_normalize_tablename(const std::string &tablename,
7586                             std::string *const strbuf) {
7587   if (tablename.size() < 2 || tablename[0] != '.' ||
7588       (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) {
7589     DBUG_ASSERT(0);  // We were not passed table name?
7590     return HA_ERR_ROCKSDB_INVALID_TABLE;
7591   }
7592 
7593   size_t pos = tablename.find_first_of(FN_LIBCHAR, 2);
7594   if (pos == std::string::npos) {
7595     pos = tablename.find_first_of(FN_LIBCHAR2, 2);
7596   }
7597 
7598   if (pos == std::string::npos) {
7599     DBUG_ASSERT(0);  // We were not passed table name?
7600     return HA_ERR_ROCKSDB_INVALID_TABLE;
7601   }
7602 
7603   *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
7604 
7605   return HA_EXIT_SUCCESS;
7606 }
7607 
7608 /*
7609   Check to see if the user's original statement includes foreign key
7610   references
7611 */
7612 bool ha_rocksdb::contains_foreign_key(THD *const thd) {
7613   bool success;
7614   const char *str = thd_query_string(thd)->str;
7615 
7616   DBUG_ASSERT(str != nullptr);
7617 
7618   while (*str != '\0') {
7619     // Scan from our current pos looking for 'FOREIGN'
7620     str = rdb_find_in_string(str, "FOREIGN", &success);
7621     if (!success) {
7622       return false;
7623     }
7624 
7625     // Skip past the found "FOREIGN'
7626     str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
7627     DBUG_ASSERT(success);
7628 
7629     if (!my_isspace(&my_charset_bin, *str)) {
7630       return false;
7631     }
7632 
7633     // See if the next token is 'KEY'
7634     str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
7635     if (!success) {
7636       continue;
7637     }
7638 
7639     // See if the next token is '('
7640     str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7641     if (!success) {
7642       // There is an optional index id after 'FOREIGN KEY', skip it
7643       str = rdb_skip_id(&my_charset_bin, str);
7644 
7645       // Now check for '(' again
7646       str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7647     }
7648 
7649     // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
7650     // a foreign key clause.
7651     return success;
7652   }
7653 
7654   // We never found a valid foreign key clause
7655   return false;
7656 }
7657 
7658 /**
7659   @brief
7660   splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
7661   the <dbname>, <tablename> and <part_no> components.
7662 
7663   @param dbbuf returns database name/table_schema
7664   @param tablebuf returns tablename
7665   @param partitionbuf returns partition suffix if there is one
7666   @return HA_EXIT_SUCCESS on success, non-zero on failure to split
7667 */
7668 int rdb_split_normalized_tablename(const std::string &fullname,
7669                                    std::string *const db,
7670                                    std::string *const table,
7671                                    std::string *const partition) {
7672   DBUG_ASSERT(!fullname.empty());
7673 
7674 #define RDB_PARTITION_STR "#P#"
7675 
7676   /* Normalize returns dbname.tablename. */
7677   size_t dotpos = fullname.find('.');
7678 
7679   /* Invalid table name? */
7680   if (dotpos == std::string::npos) {
7681     return HA_ERR_ROCKSDB_INVALID_TABLE;
7682   }
7683 
7684   // Table must have a database name associated with it.
7685   DBUG_ASSERT(dotpos > 0);
7686 
7687   if (db != nullptr) {
7688     *db = fullname.substr(0, dotpos);
7689   }
7690 
7691   dotpos++;
7692 
7693   const size_t partpos =
7694       fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
7695 
7696   if (partpos != std::string::npos) {
7697     DBUG_ASSERT(partpos >= dotpos);
7698 
7699     if (table != nullptr) {
7700       *table = fullname.substr(dotpos, partpos - dotpos);
7701     }
7702 
7703     if (partition != nullptr) {
7704       *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
7705     }
7706   } else if (table != nullptr) {
7707     *table = fullname.substr(dotpos);
7708   }
7709 
7710   return HA_EXIT_SUCCESS;
7711 }
7712 
7713 /*
7714  Create a table's Rdb_tbl_def and its Rdb_key_defs and store table information
7715  into MyRocks Data Dictionary
7716  The method is called during create table/partition, truncate table/partition
7717 
7718  @param table_name            IN      table's name formated as
7719  'dbname.tablename'
7720  @param table_arg             IN      sql table
7721  @param auto_increment_value  IN      specified table's auto increment value
7722 
7723   @return
7724     HA_EXIT_SUCCESS  OK
7725     other            HA_ERR error code (can be SE-specific)
7726 */
7727 int ha_rocksdb::create_table(const std::string &table_name,
7728                              const TABLE *table_arg,
7729                              ulonglong auto_increment_value) {
7730   DBUG_ENTER_FUNC();
7731 
7732 
7733 
7734   int err;
7735 
7736   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
7737   rocksdb::WriteBatch *const batch = wb.get();
7738 
7739   /* Create table/key descriptions and put them into the data dictionary */
7740   m_tbl_def = new Rdb_tbl_def(table_name);
7741 
7742   uint n_keys = table_arg->s->keys;
7743 
7744   /*
7745     If no primary key found, create a hidden PK and place it inside table
7746     definition
7747   */
7748   if (has_hidden_pk(table_arg)) {
7749     n_keys += 1;
7750     // reset hidden pk id
7751     // the starting valid value for hidden pk is 1
7752     m_tbl_def->m_hidden_pk_val = 1;
7753   }
7754 
7755   m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
7756   m_tbl_def->m_key_count = n_keys;
7757   m_tbl_def->m_key_descr_arr = m_key_descr_arr;
7758 
7759   err = create_key_defs(table_arg, m_tbl_def);
7760   if (err != HA_EXIT_SUCCESS) {
7761     goto error;
7762   }
7763 
7764   m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
7765 
7766   if (auto_increment_value) {
7767     bool autoinc_upgrade_test = false;
7768     m_tbl_def->m_auto_incr_val = auto_increment_value;
7769     DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;);
7770     if (!autoinc_upgrade_test) {
7771       auto s = dict_manager.put_auto_incr_val(
7772           batch, m_tbl_def->get_autoincr_gl_index_id(),
7773           m_tbl_def->m_auto_incr_val);
7774       if (!s.ok()) {
7775         goto error;
7776       }
7777     }
7778   }
7779 
7780   dict_manager.lock();
7781   err = ddl_manager.put_and_write(m_tbl_def, batch);
7782   if (err != HA_EXIT_SUCCESS) {
7783     dict_manager.unlock();
7784     goto error;
7785   }
7786 
7787   err = dict_manager.commit(batch);
7788   if (err != HA_EXIT_SUCCESS) {
7789     dict_manager.unlock();
7790     goto error;
7791   }
7792 
7793   dict_manager.unlock();
7794 
7795   DBUG_RETURN(HA_EXIT_SUCCESS);
7796 
7797 error:
7798   /* Delete what we have allocated so far */
7799   delete m_tbl_def;
7800   m_tbl_def = nullptr;
7801   m_key_descr_arr = nullptr;
7802 
7803   DBUG_RETURN(err);
7804 }
7805 
7806 /**
7807   @brief
7808   create() is called to create a table. The variable name will have the name
7809   of the table.
7810 
7811   @details
7812   When create() is called you do not need to worry about
7813   opening the table. Also, the .frm file will have already been
7814   created so adjusting create_info is not necessary. You can overwrite
7815   the .frm file at this point if you wish to change the table
7816   definition, but there are no methods currently provided for doing
7817   so.
7818 
7819   Called from handle.cc by ha_create_table().
7820 
7821   @return
7822     HA_EXIT_SUCCESS  OK
7823     other            HA_ERR error code (can be SE-specific)
7824 
7825   @see
7826   ha_create_table() in handle.cc
7827 */
7828 
7829 int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
7830                        HA_CREATE_INFO *const create_info) {
7831   DBUG_ENTER_FUNC();
7832 
7833   DBUG_ASSERT(table_arg != nullptr);
7834   DBUG_ASSERT(create_info != nullptr);
7835 
7836   if (create_info->data_file_name) {
7837     // DATA DIRECTORY is used to create tables under a specific location
7838     // outside the MySQL data directory. We don't support this for MyRocks.
7839     // The `rocksdb_datadir` setting should be used to configure RocksDB data
7840     // directory.
7841     DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED);
7842   }
7843 
7844   if (create_info->index_file_name) {
7845     // Similar check for INDEX DIRECTORY as well.
7846     DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED);
7847   }
7848 
7849   int err;
7850   /*
7851     Construct dbname.tablename ourselves, because parititioning
7852     passes strings like "./test/t14#P#p0" for individual partitions,
7853     while table_arg->s->table_name has none of that.
7854   */
7855   std::string str;
7856   err = rdb_normalize_tablename(name, &str);
7857   if (err != HA_EXIT_SUCCESS) {
7858     DBUG_RETURN(err);
7859   }
7860 
7861   // FOREIGN KEY isn't supported yet
7862   THD *const thd = my_core::thd_get_current_thd();
7863   if (contains_foreign_key(thd)) {
7864     my_error(ER_NOT_SUPPORTED_YET, MYF(0),
7865              "FOREIGN KEY for the RocksDB storage engine");
7866     DBUG_RETURN(HA_ERR_UNSUPPORTED);
7867   }
7868 
7869   // Check whether Data Dictionary contain information
7870   Rdb_tbl_def *tbl = ddl_manager.find(str);
7871   if (tbl != nullptr) {
7872     if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
7873       err = delete_table(tbl);
7874       if (err != HA_EXIT_SUCCESS) {
7875         DBUG_RETURN(err);
7876       }
7877     } else {
7878       my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name);
7879       DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
7880     }
7881   }
7882 
7883   // The below adds/clears hooks in RocksDB sync points. There's no reason for
7884   // this code to be in ::create() but it needs to be somewhere where it is
7885   // away from any tight loops and where one can invoke it from mtr:
7886   DBUG_EXECUTE_IF("rocksdb_enable_delay_commits",
7887     {
7888       auto syncpoint= rocksdb::SyncPoint::GetInstance();
7889       syncpoint->SetCallBack("DBImpl::WriteImpl:BeforeLeaderEnters",
7890                              [&](void* /*arg*/) {my_sleep(500);} );
7891       syncpoint->EnableProcessing();
7892       push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
7893                         "enable_delay_commits_mode ON");
7894 
7895     });
7896   DBUG_EXECUTE_IF("rocksdb_disable_delay_commits",
7897     {
7898       auto syncpoint= rocksdb::SyncPoint::GetInstance();
7899       syncpoint->ClearCallBack("DBImpl::WriteImpl:BeforeLeaderEnters");
7900       syncpoint->DisableProcessing();
7901       push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
7902                         "enable_delay_commits_mode OFF");
7903     });
7904 
7905   DBUG_RETURN(create_table(str, table_arg, create_info->auto_increment_value));
7906 }
7907 
7908 /**
7909   @note
7910   This function is used only when the table has not yet been opened, and
7911   keyread_allowed bitmap doesn't have the correct values yet.
7912 
7913   See comment in ha_rocksdb::index_flags() for details.
7914 */
7915 
7916 bool ha_rocksdb::check_keyread_allowed(uint inx, uint part,
7917                                        bool all_parts) const {
7918   bool res = true;
7919   KEY *const key_info = &table_share->key_info[inx];
7920 
7921   Rdb_field_packing dummy1;
7922   res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
7923                      key_info->key_part[part].length);
7924 
7925   if (res && all_parts) {
7926     for (uint i = 0; i < part; i++) {
7927       Field *field;
7928       if ((field = key_info->key_part[i].field)) {
7929         Rdb_field_packing dummy;
7930         if (!dummy.setup(nullptr, field, inx, i,
7931                          key_info->key_part[i].length)) {
7932           /* Cannot do index-only reads for this column */
7933           res = false;
7934           break;
7935         }
7936       }
7937     }
7938   }
7939 
7940   const uint pk = table_share->primary_key;
7941   if (inx == pk && all_parts &&
7942       part + 1 == table_share->key_info[pk].user_defined_key_parts) {
7943     m_pk_can_be_decoded = res;
7944   }
7945 
7946   return res;
7947 }
7948 
7949 int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
7950                                rocksdb::Iterator *const iter,
7951                                const bool /* unused */,
7952                                const rocksdb::Slice &key_slice,
7953                                const int64_t ttl_filter_ts) {
7954   THD *thd = ha_thd();
7955   /*
7956     We are looking for the first record such that
7957       index_tuple= lookup_tuple.
7958     lookup_tuple may be a prefix of the index.
7959   */
7960   rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);
7961 
7962   while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
7963     if (thd && thd->killed) {
7964       return HA_ERR_QUERY_INTERRUPTED;
7965     }
7966     /*
7967       If TTL is enabled we need to check if the given key has already expired
7968       from the POV of the current transaction.  If it has, try going to the next
7969       key.
7970     */
7971     if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
7972       rocksdb_smart_next(kd.m_is_reverse_cf, iter);
7973       continue;
7974     }
7975 
7976     return HA_EXIT_SUCCESS;
7977   }
7978 
7979   /*
7980     Got a record that is not equal to the lookup value, or even a record
7981     from another table.index.
7982   */
7983   return HA_ERR_KEY_NOT_FOUND;
7984 }
7985 
7986 int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
7987                                 const bool full_key_match,
7988                                 const rocksdb::Slice &key_slice,
7989                                 const int64_t ttl_filter_ts) {
7990   THD *thd = ha_thd();
7991   /*
7992     We are looking for record with the biggest t.key such that
7993     t.key < lookup_tuple.
7994   */
7995   rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);
7996 
7997   while (is_valid(m_scan_it)) {
7998     if (thd && thd->killed) {
7999       return HA_ERR_QUERY_INTERRUPTED;
8000     }
8001     /*
8002       We are using full key and we've hit an exact match, or...
8003 
8004       If TTL is enabled we need to check if the given key has already expired
8005       from the POV of the current transaction.  If it has, try going to the next
8006       key.
8007     */
8008     if ((full_key_match &&
8009          kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
8010         (kd.has_ttl() &&
8011          should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
8012       rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
8013       continue;
8014     }
8015 
8016     return HA_EXIT_SUCCESS;
8017   }
8018 
8019   return HA_ERR_KEY_NOT_FOUND;
8020 }
8021 
8022 int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
8023                                const rocksdb::Slice &key_slice,
8024                                const int64_t ttl_filter_ts) {
8025   THD *thd = ha_thd();
8026   /*
8027     We are looking for the first record such that
8028 
8029       index_tuple $GT lookup_tuple
8030 
8031     with HA_READ_AFTER_KEY, $GT = '>',
8032     with HA_READ_KEY_OR_NEXT, $GT = '>='
8033   */
8034   rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
8035 
8036   /*
8037     If TTL is enabled we need to check if the given key has already expired
8038     from the POV of the current transaction.  If it has, try going to the next
8039     key.
8040   */
8041   while (is_valid(m_scan_it) && kd.has_ttl() &&
8042          should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
8043     if (thd && thd->killed) {
8044       return HA_ERR_QUERY_INTERRUPTED;
8045     }
8046     rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
8047   }
8048 
8049   return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
8050 }
8051 
8052 int ha_rocksdb::position_to_correct_key(
8053     const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
8054     const bool full_key_match, const uchar *const key,
8055     const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
8056     bool *const move_forward, const int64_t ttl_filter_ts) {
8057   int rc = 0;
8058 
8059   *move_forward = true;
8060 
8061   switch (find_flag) {
8062     case HA_READ_KEY_EXACT:
8063       rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice,
8064                           ttl_filter_ts);
8065       break;
8066     case HA_READ_BEFORE_KEY:
8067       *move_forward = false;
8068       rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
8069       if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
8070         /* The record we've got is not from this index */
8071         rc = HA_ERR_KEY_NOT_FOUND;
8072       }
8073       break;
8074     case HA_READ_AFTER_KEY:
8075     case HA_READ_KEY_OR_NEXT:
8076       rc = read_after_key(kd, key_slice, ttl_filter_ts);
8077       if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
8078         /* The record we've got is not from this index */
8079         rc = HA_ERR_KEY_NOT_FOUND;
8080       }
8081       break;
8082     case HA_READ_KEY_OR_PREV:
8083     case HA_READ_PREFIX:
8084       /* This flag is not used by the SQL layer, so we don't support it yet. */
8085       rc = HA_ERR_UNSUPPORTED;
8086       break;
8087     case HA_READ_PREFIX_LAST:
8088     case HA_READ_PREFIX_LAST_OR_PREV:
8089       *move_forward = false;
8090       /*
8091         Find the last record with the specified index prefix lookup.
8092         - HA_READ_PREFIX_LAST requires that the record has the
8093           prefix=lookup (if there are no such records,
8094           HA_ERR_KEY_NOT_FOUND should be returned).
8095         - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
8096           records with prefix=lookup, we should return the last record
8097           before that.
8098       */
8099       rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
8100       if (rc == 0) {
8101         const rocksdb::Slice &rkey = m_scan_it->key();
8102         if (!kd.covers_key(rkey)) {
8103           /* The record we've got is not from this index */
8104           rc = HA_ERR_KEY_NOT_FOUND;
8105         } else if (find_flag == HA_READ_PREFIX_LAST) {
8106           uint size = kd.pack_index_tuple(table, m_pack_buffer,
8107                                           m_sk_packed_tuple, m_record_buffer,
8108                                           key, keypart_map);
8109           rocksdb::Slice lookup_tuple(
8110               reinterpret_cast<char *>(m_sk_packed_tuple), size);
8111 
8112           // We need to compare the key we've got with the original search
8113           // prefix.
8114           if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
8115             rc = HA_ERR_KEY_NOT_FOUND;
8116           }
8117         }
8118       }
8119       break;
8120     default:
8121       DBUG_ASSERT(0);
8122       break;
8123   }
8124 
8125   return rc;
8126 }
8127 
8128 int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
8129                                  const enum ha_rkey_function &find_flag,
8130                                  const rocksdb::Slice &slice,
8131                                  const int bytes_changed_by_succ,
8132                                  const key_range *const end_key,
8133                                  uint *const end_key_packed_size) {
8134   if (find_flag == HA_READ_KEY_EXACT) return slice.size();
8135 
8136   if (find_flag == HA_READ_PREFIX_LAST) {
8137     /*
8138       We have made the kd.successor(m_sk_packed_tuple) call above.
8139 
8140       The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
8141     */
8142     return slice.size() - bytes_changed_by_succ;
8143   }
8144 
8145   if (end_key) {
8146     *end_key_packed_size =
8147         kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
8148                             m_record_buffer, end_key->key, end_key->keypart_map);
8149 
8150     /*
8151       Calculating length of the equal conditions here. 4 byte index id is
8152       included.
8153       Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8154        WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
8155        WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
8156       Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
8157        WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
8158     */
8159     rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
8160                              *end_key_packed_size);
8161     return slice.difference_offset(end_slice);
8162   }
8163 
8164   /*
8165     On range scan without any end key condition, there is no
8166     eq cond, and eq cond length is the same as index_id size (4 bytes).
8167     Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8168      WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
8169   */
8170   return Rdb_key_def::INDEX_NUMBER_SIZE;
8171 }
8172 
8173 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
8174   int rc;
8175   const rocksdb::Slice &rkey = m_scan_it->key();
8176   const uint pk_size = rkey.size();
8177   const char *pk_data = rkey.data();
8178 
8179   memcpy(m_pk_packed_tuple, pk_data, pk_size);
8180   m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
8181 
8182   if (m_lock_rows != RDB_LOCK_NONE) {
8183     /* We need to put a lock and re-read */
8184     rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
8185   } else {
8186     /* Unpack from the row we've read */
8187     const rocksdb::Slice &value = m_scan_it->value();
8188     rc = convert_record_from_storage_format(&rkey, &value, buf);
8189   }
8190 
8191   return rc;
8192 }
8193 
/*
  Read the table row for the secondary-index entry that m_scan_it is
  currently positioned on.

  @param buf          OUT  record buffer to fill (table->record[] format)
  @param kd           IN   key definition of the secondary index being read
  @param move_forward IN   scan direction as seen by the SQL layer; flipped
                           below for reverse column families

  @return
    HA_EXIT_SUCCESS               OK (m_last_rowkey also updated)
    HA_ERR_ROCKSDB_CORRUPT_DATA   could not extract the PK from the SK entry
    other                         HA_ERR error code propagated from helpers
*/
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  int rc = 0;
  uint pk_size;

  /* Get the key columns and primary key value */
  const rocksdb::Slice &rkey = m_scan_it->key();
  const rocksdb::Slice &value = m_scan_it->value();

  /*
    Under debug builds, the dbug.rocksdb.HA_EXTRA_KEYREAD sync point may
    force m_keyread_only on for the covering-lookup decision below; save
    the original value so it can be restored afterwards.
  */
#ifndef DBUG_OFF
  bool save_keyread_only = m_keyread_only;
#endif
  DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });

  // The lookup is "covered" if the SK entry alone contains all the columns
  // the query needs; then no PK lookup is required.
  bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) ||
                        kd.covers_lookup(&value, &m_lookup_bitmap);

#ifndef DBUG_OFF
  m_keyread_only = save_keyread_only;
#endif

  if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
    // Covered, non-locking read: unpack the record from the SK entry itself.
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    if (pk_size == RDB_INVALID_KEY_LEN) {
      rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
    } else {
      rc = kd.unpack_record(table, buf, &rkey, &value,
                            m_converter->get_verify_row_debug_checksums());
      global_stats.covered_secondary_key_lookups.inc();
    }
  } else {
    // Reverse column families store keys in descending order; flip the
    // direction so ICP matching walks the iterator the right way.
    if (kd.m_is_reverse_cf) move_forward = !move_forward;

    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      // Re-read the key: find_icp_matching_index_rec may have moved m_scan_it.
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
      } else {
        // Fetch the full row through the primary key.
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    // On success pk_size was set on every path above; remember the rowid.
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
8249 
8250 /**
8251   @note
8252     The problem with this function is that SQL layer calls it, when
8253      - the table has not been yet opened (no ::open() call done)
8254      - this->table_share already exists, but it is in the process of being
8255        filled, so some of fields are still NULL.
8256      - In particular, table_share->key_info[inx].key_part[] is filled only up
8257        to part #part. Subsequent key parts are not yet filled.
8258 
8259     To complicate things further, SQL layer will call index_flags() with
8260     all_parts=TRUE. Essentially, we're asked to provide flags for reading
8261     keyparts whose datatype is not yet known.
8262 
8263     We walk around this problem by using check_keyread_allowed(), which uses
8264     table_share object and is careful not to step on unitialized data.
8265 
8266     When we get a call with all_parts=TRUE, we try to analyze all parts but
8267     ignore those that have key_part->field==nullptr (these are not initialized
8268     yet).
8269 */
8270 
8271 ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
8272   DBUG_ENTER_FUNC();
8273 
8274   ulong base_flags = HA_READ_NEXT |  // doesn't seem to be used
8275                      HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
8276 
8277   if (check_keyread_allowed(inx, part, all_parts)) {
8278     base_flags |= HA_KEYREAD_ONLY;
8279   }
8280 
8281   if (inx == table_share->primary_key) {
8282     /*
8283       Index-only reads on primary key are the same as table scan for us. Still,
8284       we need to explicitly "allow" them, otherwise SQL layer will miss some
8285       plans.
8286     */
8287     base_flags |= HA_KEYREAD_ONLY | HA_CLUSTERED_INDEX;
8288   } else {
8289     /*
8290       We can Index Condition Pushdown any key except the primary. With primary
8291       key, we get (pk, record) pair immediately, there is no place to put the
8292       ICP check.
8293     */
8294     base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
8295   }
8296 
8297   DBUG_RETURN(base_flags);
8298 }
8299 
8300 /**
8301   @brief
8302   Read next index tuple through the secondary index.
8303 
8304   @details
8305   m_scan_it points at the index key-value pair that we should read the (pk,row)
8306   pair for.
8307 */
8308 int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
8309   DBUG_ASSERT(table != nullptr);
8310 #ifdef MARIAROCKS_NOT_YET
8311   stats.rows_requested++;
8312 #endif
8313   /* Use STATUS_NOT_FOUND when record not found or some error occurred */
8314   table->status = STATUS_NOT_FOUND;
8315 
8316   if (is_valid(m_scan_it)) {
8317     rocksdb::Slice key = m_scan_it->key();
8318 
8319     /* Check if we've ran out of records of this index */
8320     if (m_key_descr_arr[keyno]->covers_key(key)) {
8321       int rc = 0;
8322 
8323       // TODO: We could here check if we have ran out of range we're scanning
8324       const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
8325           table, *m_pk_descr, &key, m_pk_packed_tuple);
8326       if (size == RDB_INVALID_KEY_LEN) {
8327         return HA_ERR_ROCKSDB_CORRUPT_DATA;
8328       }
8329 
8330       m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
8331                          &my_charset_bin);
8332 
8333       rocksdb::Slice value = m_scan_it->value();
8334       bool covered_lookup =
8335           (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
8336           m_key_descr_arr[keyno]->covers_lookup(&value, &m_lookup_bitmap);
8337       if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
8338         rc = m_key_descr_arr[keyno]->unpack_record(
8339             table, buf, &key, &value,
8340             m_converter->get_verify_row_debug_checksums());
8341         global_stats.covered_secondary_key_lookups.inc();
8342       } else {
8343         DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
8344         rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
8345       }
8346 
8347       if (!rc) {
8348         table->status = 0;
8349 #ifdef MARIAROCKS_NOT_YET
8350         stats.rows_read++;
8351         stats.rows_index_next++;
8352 #endif
8353         update_row_stats(ROWS_READ);
8354       }
8355       return rc;
8356     }
8357   }
8358   return HA_ERR_END_OF_FILE;
8359 }
8360 
8361 bool ha_rocksdb::is_using_full_key(key_part_map keypart_map,
8362                                    uint actual_key_parts)
8363 {
8364   return (keypart_map == HA_WHOLE_KEY) ||
8365          (keypart_map == ((key_part_map(1) << actual_key_parts)
8366                         - 1));
8367 }
8368 
8369 /*
8370   ha_rocksdb::read_range_first overrides handler::read_range_first.
8371   The only difference from handler::read_range_first is that
8372   ha_rocksdb::read_range_first passes end_key to
8373   ha_rocksdb::index_read_map_impl function.
8374 
8375   @return
8376     HA_EXIT_SUCCESS  OK
8377     other            HA_ERR error code (can be SE-specific)
8378 */
8379 int ha_rocksdb::read_range_first(const key_range *const start_key,
8380                                  const key_range *const end_key,
8381                                  bool eq_range_arg, bool sorted) {
8382   DBUG_ENTER_FUNC();
8383 
8384   int result;
8385 
8386   eq_range = eq_range_arg;
8387 
8388   /* MariaDB: Pass RANGE_SCAN_ASC when we support ICP on backwards scans */
8389   set_end_range(end_key);
8390 
8391   range_key_part = table->key_info[active_index].key_part;
8392 
8393   if (!start_key) {
8394     // Read first record
8395     result = ha_index_first(table->record[0]);
8396   } else {
8397 #ifdef MARIAROCKS_NOT_YET
8398     if (is_using_prohibited_gap_locks(
8399             is_using_full_unique_key(active_index, start_key->keypart_map,
8400                                      start_key->flag))) {
8401       DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
8402     }
8403 #endif
8404     increment_statistics(&SSV::ha_read_key_count);
8405 
8406     result =
8407         index_read_map_impl(table->record[0], start_key->key,
8408                             start_key->keypart_map, start_key->flag, end_key);
8409   }
8410   if (result) {
8411     DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
8412   }
8413 
8414   if (compare_key(end_range) <= 0) {
8415     DBUG_RETURN(HA_EXIT_SUCCESS);
8416   } else {
8417     /*
8418       The last read row does not fall in the range. So request
8419       storage engine to release row lock if possible.
8420     */
8421     unlock_row();
8422     DBUG_RETURN(HA_ERR_END_OF_FILE);
8423   }
8424 }
8425 
8426 int ha_rocksdb::prepare_index_scan()
8427 {
8428   range_key_part= table->key_info[active_index].key_part;
8429   set_end_range(NULL);
8430   return 0;
8431 }
8432 
8433 
8434 int ha_rocksdb::prepare_range_scan(const key_range *start_key,
8435                                     const key_range *end_key)
8436 {
8437   range_key_part= table->key_info[active_index].key_part;
8438 
8439   if (start_key)
8440   {
8441     m_save_start_range= *start_key;
8442     m_start_range= &m_save_start_range;
8443   }
8444   else
8445     m_start_range= NULL;
8446 
8447   set_end_range(end_key);
8448   return 0;
8449 }
8450 
8451 
8452 /**
8453    @return
8454     HA_EXIT_SUCCESS  OK
8455     other            HA_ERR error code (can be SE-specific)
8456 */
8457 
8458 int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
8459                                key_part_map keypart_map,
8460                                enum ha_rkey_function find_flag) {
8461   DBUG_ENTER_FUNC();
8462 
8463   DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
8464 }
8465 
8466 /*
8467    See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
8468    index navigation commands are converted into RocksDB lookup commands.
8469 
8470    This function takes end_key as an argument, and it is set on range scan.
8471    MyRocks needs to decide whether prefix bloom filter can be used or not.
8472    To decide to use prefix bloom filter or not, calculating equal condition
8473    length
8474    is needed. On equal lookups (find_flag == HA_READ_KEY_EXACT), equal
8475    condition length is the same as rocksdb::Slice.size() of the start key.
8476    On range scan, equal condition length is MIN(start_key, end_key) of the
8477    rocksdb::Slice expression.
8478 
8479    @return
8480     HA_EXIT_SUCCESS  OK
8481     other            HA_ERR error code (can be SE-specific)
8482 */
8483 int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
8484                                     key_part_map keypart_map,
8485                                     enum ha_rkey_function find_flag,
8486                                     const key_range *end_key) {
8487   DBUG_ENTER_FUNC();
8488 
8489   DBUG_EXECUTE_IF("myrocks_busy_loop_on_row_read", int debug_i = 0;
8490                   while (1) { debug_i++; });
8491 
8492   int rc = 0;
8493 
8494   THD *thd = ha_thd();
8495   DEBUG_SYNC(thd, "rocksdb.check_flags_rmi");
8496   if (thd && thd->killed) {
8497     rc = HA_ERR_QUERY_INTERRUPTED;
8498     DBUG_RETURN(rc);
8499   }
8500 
8501   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
8502   const uint actual_key_parts = kd.get_key_parts();
8503   bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);
8504 
8505   if (!end_key) end_key = end_range;
8506 
8507   /* By default, we don't need the retrieved records to match the prefix */
8508   m_sk_match_prefix = nullptr;
8509 #ifdef MARIAROCKS_NOT_YET
8510   stats.rows_requested++;
8511 #endif
8512   if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
8513       using_full_key) {
8514     /*
8515       Equality lookup over primary key, using full tuple.
8516       This is a special case, use DB::Get.
8517     */
8518     const uint size = kd.pack_index_tuple(table, m_pack_buffer,
8519                                           m_pk_packed_tuple, m_record_buffer,
8520                                           key, keypart_map);
8521     bool skip_lookup = is_blind_delete_enabled();
8522 
8523     rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, skip_lookup, false);
8524 
8525     if (!rc && !skip_lookup) {
8526 #ifdef MARIAROCKS_NOT_YET
8527       stats.rows_read++;
8528       stats.rows_index_first++;
8529 #endif
8530       update_row_stats(ROWS_READ);
8531     }
8532     DBUG_RETURN(rc);
8533   }
8534 
8535   /*
8536     Unique secondary index performs lookups without the extended key fields
8537   */
8538   uint packed_size;
8539   if (active_index != table->s->primary_key &&
8540       table->key_info[active_index].flags & HA_NOSAME &&
8541       find_flag == HA_READ_KEY_EXACT && using_full_key) {
8542     key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
8543                                                    .user_defined_key_parts) -
8544                            1;
8545     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
8546                                       m_record_buffer, key, tmp_map);
8547     if (table->key_info[active_index].user_defined_key_parts !=
8548         kd.get_key_parts()) {
8549       using_full_key = false;
8550     }
8551   } else {
8552     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
8553                                       m_record_buffer, key, keypart_map);
8554   }
8555 
8556   if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
8557       (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
8558     /*
8559       We are doing a point index lookup, and ICP is enabled. It is possible
8560       that this call will be followed by ha_rocksdb->index_next_same() call.
8561 
8562       Do what InnoDB does: save the lookup tuple now. We will need it in
8563       index_next_same/find_icp_matching_index_rec in order to stop scanning
8564       as soon as index record doesn't match the lookup tuple.
8565 
8566       When not using ICP, handler::index_next_same() will make sure that rows
8567       that don't match the lookup prefix are not returned.
8568       row matches the lookup prefix.
8569     */
8570     m_sk_match_prefix = m_sk_match_prefix_buf;
8571     m_sk_match_length = packed_size;
8572     memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
8573   }
8574 
8575   int bytes_changed_by_succ = 0;
8576   if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
8577       find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
8578     /* See below */
8579     bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
8580   }
8581 
8582   rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
8583                        packed_size);
8584 
8585   uint end_key_packed_size = 0;
8586   /*
8587     In MariaDB, the end_key is always the bigger end of the range.
8588     If we are doing a reverse-ordered scan (that is, walking from the bigger
8589     key values to smaller), we should use the smaller end of range as end_key.
8590   */
8591   const key_range *cur_end_key= end_key;
8592   if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
8593       find_flag == HA_READ_BEFORE_KEY)
8594   {
8595     cur_end_key= m_start_range;
8596   }
8597 
8598   const uint eq_cond_len =
8599       calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, cur_end_key,
8600                        &end_key_packed_size);
8601 
8602   bool use_all_keys = false;
8603   if (find_flag == HA_READ_KEY_EXACT &&
8604       my_count_bits(keypart_map) == kd.get_key_parts()) {
8605     use_all_keys = true;
8606   }
8607 
8608   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
8609   const bool is_new_snapshot = !tx->has_snapshot();
8610   // Loop as long as we get a deadlock error AND we end up creating the
8611   // snapshot here (i.e. it did not exist prior to this)
8612   for (;;) {
8613     DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan");
8614     if (thd && thd->killed) {
8615       rc = HA_ERR_QUERY_INTERRUPTED;
8616       break;
8617     }
8618     /*
8619       This will open the iterator and position it at a record that's equal or
8620       greater than the lookup tuple.
8621     */
8622     setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);
8623 
8624     /*
8625       Once we are positioned on from above, move to the position we really
8626       want: See storage/rocksdb/rocksdb-range-access.txt
8627     */
8628     bool move_forward;
8629     rc =
8630         position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
8631                                 slice, &move_forward, tx->m_snapshot_timestamp);
8632 
8633     if (rc) {
8634       break;
8635     }
8636 
8637     m_skip_scan_it_next_call = false;
8638 
8639     /*
8640       Now get the data for the row into 'buf'.  If we were using a primary key
8641       then we have all the rows we need.  For a secondary key we now need to
8642       lookup the primary key.
8643     */
8644     if (active_index == table->s->primary_key) {
8645       rc = read_row_from_primary_key(buf);
8646     } else {
8647       rc = read_row_from_secondary_key(buf, kd, move_forward);
8648     }
8649 
8650     if (!should_recreate_snapshot(rc, is_new_snapshot)) {
8651       break; /* Exit the loop */
8652     }
8653 
8654     // release the snapshot and iterator so they will be regenerated
8655     tx->release_snapshot();
8656     release_scan_iterator();
8657   }
8658 
8659   if (rc) {
8660     /*
8661       This status is returned on any error
8662       the only possible error condition is record-not-found
8663     */
8664     table->status = STATUS_NOT_FOUND;
8665   } else {
8666     table->status = 0;
8667 #ifdef MARIAROCKS_NOT_YET
8668     stats.rows_read++;
8669     stats.rows_index_first++;
8670 #endif
8671     update_row_stats(ROWS_READ);
8672   }
8673 
8674   DBUG_RETURN(rc);
8675 }
8676 
8677 /*
8678   @brief
8679   Scan the secondary index until we find an index record that satisfies ICP
8680 
8681   @param move_forward   TRUE  <=> move m_scan_it forward
8682                         FALSE <=> move m_scan_it backward
8683   @param buf            Record buffer (must be the same buffer that
8684                         pushed index condition points to, in practice
8685                         it is table->record[0])
8686 
8687   @detail
8688   Move the current iterator m_scan_it until we get an index tuple that
8689   satisfies the pushed Index Condition.
8690   (if there is no pushed index condition, return right away)
8691 
8692   @return
8693     0     - Index tuple satisfies ICP, can do index read.
8694     other - error code
8695 */
8696 
int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward,
                                            uchar *const buf) {
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];
    THD *thd = ha_thd();

    while (1) {
      /* Skip TTL-expired records before evaluating anything else. */
      int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        return rc;
      }

      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }

      /* The iterator has run off the end of the data. */
      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      /* We have walked out of this index's key space. */
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      /*
        If a lookup prefix was saved (point lookup with ICP — see the caller
        that fills m_sk_match_prefix), stop the scan as soon as the index
        record no longer matches that prefix.
      */
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      /* Unpack the index tuple into buf so the pushed condition can read it. */
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_converter->get_verify_row_debug_checksums());
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const check_result_t icp_status= handler_index_cond_check(this);
      if (icp_status == CHECK_NEG) {
        /* Condition not satisfied; move to the next (or prev) index tuple. */
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      }
      else if (icp_status == CHECK_OUT_OF_RANGE ||
               icp_status == CHECK_ABORTED_BY_USER) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      else /* icp_status == CHECK_POS */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
8761 
8762 /**
8763    @return
8764     HA_EXIT_SUCCESS  OK
8765     other            HA_ERR error code (can be SE-specific)
8766 */
8767 
8768 /**
8769    @return
8770     HA_ADMIN_OK      OK
8771     other            HA_ADMIN error code
8772 */
int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(check_opt != nullptr);

  const uint pk = pk_index(table, m_tbl_def);
  String rowkey_copy;
  String sec_key_copy;
  const char *const table_name = table->s->table_name.str;

  // Only when debugging: don't use snapshot when reading
  // Rdb_transaction *tx= get_or_create_tx(table->in_use);
  // tx->snapshot= nullptr;

  /*
    Force row-checksum verification for the duration of the check; the
    previous setting is restored before returning (on both paths).
  */
  bool save_verify_row_debug_checksums =
      m_converter->get_verify_row_debug_checksums();
  m_converter->set_verify_row_debug_checksums(true);
  /* For each secondary index, check that we can get a PK value from it */
  // NO_LINT_DEBUG
  sql_print_verbose_info("CHECKTABLE %s: Checking table %s", table_name,
                         table_name);
  ha_rows UNINIT_VAR(row_checksums_at_start);  // set/used iff first_index==true
  ha_rows row_checksums = ha_rows(-1);
  bool first_index = true;

  /* Walk every secondary index; the PK itself is skipped here. */
  for (uint keyno = 0; keyno < table->s->keys; keyno++) {
    if (keyno != pk) {
      extra(HA_EXTRA_KEYREAD);
      ha_index_init(keyno, true);
      ha_rows rows = 0;
      ha_rows checksums = 0;
      if (first_index) {
        row_checksums_at_start = m_converter->get_row_checksums_checked();
      }
      int res;
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s:   Checking index %s", table_name,
                             table->key_info[keyno].name.str);
      /* Full scan of this secondary index. */
      while (1) {
        if (!rows) {
          res = index_first(table->record[0]);
        } else {
          res = index_next(table->record[0]);
        }

        if (res == HA_ERR_END_OF_FILE) break;
        if (res) {
          // error
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s:   .. row %lld: index scan error %d",
                          table_name, rows, res);
          goto error;
        }
        /*
          Save copies of the current index key and row key: the buffers behind
          m_scan_it / m_last_rowkey are overwritten by the PK lookup below.
        */
        rocksdb::Slice key = m_scan_it->key();
        sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
        rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
                         &my_charset_bin);

        if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
                m_scan_it->value())) {
          checksums++;
        }

        /* The secondary index entry must point at an existing PK record. */
        if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
                                    rowkey_copy.length()))) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s:   .. row %lld: "
              "failed to fetch row by rowid",
              table_name, rows);
          goto error;
        }

        longlong hidden_pk_id = 0;
        if (has_hidden_pk(table) &&
            read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
          goto error;
        }

        /* Check if we get the same PK value */
        uint packed_size = m_pk_descr->pack_record(
            table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
            false, hidden_pk_id);
        if (packed_size != rowkey_copy.length() ||
            memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s:   .. row %lld: PK value mismatch",
                          table_name, rows);
          goto print_and_error;
        }

        /* Check if we get the same secondary key value */
        packed_size = m_key_descr_arr[keyno]->pack_record(
            table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
            &m_sk_tails, false, hidden_pk_id);
        if (packed_size != sec_key_copy.length() ||
            memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s:   .. row %lld: "
              "secondary index value mismatch",
              table_name, rows);
          goto print_and_error;
        }
        rows++;
        continue;

      /* Dump the offending rowkey / record / index entry, then bail out. */
      print_and_error : {
        std::string buf;
        buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s:   rowkey: %s", table_name, buf.c_str());

        buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s:   record: %s", table_name, buf.c_str());

        buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s:   index: %s", table_name, buf.c_str());

        goto error;
      }
      }
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s:   ... %lld index entries checked "
                             "(%lld had checksums)",
                             table_name, rows, checksums);

      if (first_index) {
        row_checksums =
            m_converter->get_row_checksums_checked() - row_checksums_at_start;
        first_index = false;
      }
      ha_index_end();
    }
  }
  if (row_checksums != ha_rows(-1)) {
    // NO_LINT_DEBUG
    sql_print_verbose_info("CHECKTABLE %s:   %lld table records had checksums",
                           table_name, row_checksums);
  }
  extra(HA_EXTRA_NO_KEYREAD);

  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  /*
    TODO(sergiy): we should check also for PK records that are missing in
    the secondary indexes.
    For that, need to walk through the PK and check that every PK record has a
    proper counterpart in each secondary index.
  */
  DBUG_RETURN(HA_ADMIN_OK);
error:
  /* Restore the checksum-verification setting on the error path as well. */
  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  ha_index_or_rnd_end();
  extra(HA_EXTRA_NO_KEYREAD);

  DBUG_RETURN(HA_ADMIN_CORRUPT);
}
8936 
/*
  Write len bytes of str to out as a quoted, human-readable string:
  printable ASCII bytes are emitted verbatim, everything else as \NNN
  (decimal byte value).
*/
static void dbug_dump_str(FILE *const out, const char *const str, int len) {
  fprintf(out, "\"");
  for (int i = 0; i < len; i++) {
    /*
      Classify via unsigned char: on platforms where plain char is signed,
      bytes >= 0x80 would otherwise compare as negative and be printed as
      confusing negative escapes like "\-128". Also escape DEL (0x7f) and
      high bytes, which are not printable ASCII.
    */
    const unsigned char c = static_cast<unsigned char>(str[i]);
    if (c > 32 && c < 127) {
      fprintf(out, "%c", c);
    } else {
      fprintf(out, "\\%d", c);
    }
  }
  fprintf(out, "\"");
}
8948 
8949 /*
8950   Debugging help: dump the whole database into a human-readable file.
8951   Usage:
8952     dbug_dump_database(rdb);
8953 */
8954 
8955 void dbug_dump_database(rocksdb::DB *const db) {
8956   FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
8957   if (!out) return;
8958 
8959   rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
8960   for (it->SeekToFirst(); it->Valid(); it->Next()) {
8961     rocksdb::Slice key = it->key();
8962     rocksdb::Slice val = it->value();
8963     dbug_dump_str(out, key.data(), key.size());
8964     fprintf(out, " -> ");
8965     dbug_dump_str(out, val.data(), val.size());
8966     fprintf(out, "\n");
8967   }
8968 
8969   delete it;
8970   fclose(out);
8971 }
8972 
8973 rocksdb::Status ha_rocksdb::get_for_update(
8974     Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family,
8975     const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
8976   DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
8977   const bool exclusive = m_lock_rows != RDB_LOCK_READ;
8978 
8979   const bool do_validate =
8980       my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED;
8981   rocksdb::Status s =
8982       tx->get_for_update(column_family, key, value, exclusive, do_validate);
8983 
8984 #ifndef DBUG_OFF
8985   ++rocksdb_num_get_for_update_calls;
8986 #endif
8987   return s;
8988 }
8989 
8990 bool ha_rocksdb::is_blind_delete_enabled() {
8991   THD *thd = ha_thd();
8992   /*
8993     Note: in MariaDB, thd->lex->table_count is only set for multi-table DELETE,
8994     not for single-table DELETE.  So we check thd->lex->query_tables instead.
8995   */
8996   return (THDVAR(thd, blind_delete_primary_key) &&
8997           thd->lex->sql_command == SQLCOM_DELETE &&
8998           thd->lex->query_tables && !thd->lex->query_tables->next_global &&
8999           table->s->keys == 1 &&
9000           !has_hidden_pk(table) && !thd->rgi_slave);
9001 }
9002 
9003 /*
9004   Given a rowid (i.e. packed PK) as a parameter, get the record.
9005 
9006   @return
9007     HA_EXIT_SUCCESS  OK
9008     other            HA_ERR error code (can be SE-specific)
9009 */
9010 
int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size, const bool skip_lookup,
                                 const bool skip_ttl_check) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table != nullptr);

  int rc;

  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

#ifdef ENABLED_DEBUG_SYNC
  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] =
        "now signal Reached "
        "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);
#endif /* ENABLED_DEBUG_SYNC */

  bool found;
  rocksdb::Status s;

  /* Pretend row found without looking up */
  if (skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_deleted_blind++;
#endif
    update_row_stats(ROWS_DELETED_BLIND);
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    table->status = 0;
    DBUG_RETURN(0);
  }

  /*
    Choose between a snapshot read, a cached duplicate-PK result, and a
    locking read, depending on the current row-locking mode.
  */
  if (m_lock_rows == RDB_LOCK_NONE) {
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else if (m_insert_with_update && m_dup_pk_found) {
    DBUG_ASSERT(m_pk_descr->get_keyno() == m_dupp_errkey);
    DBUG_ASSERT(m_dup_pk_retrieved_record.length() ==
                m_retrieved_record.size());
    DBUG_ASSERT(memcmp(m_dup_pk_retrieved_record.ptr(),
                       m_retrieved_record.data(),
                       m_retrieved_record.size()) == 0);

    // do nothing - we already have the result in m_retrieved_record and
    // already taken the lock
    s = rocksdb::Status::OK();
  } else {
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
                  dbug_change_status_to_corrupted(&s););

  /* Any status other than OK / NotFound is a hard error. */
  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    /* If we found the record, but it's expired, pretend we didn't find it.  */
    if (!skip_ttl_check && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            tx->m_snapshot_timestamp)) {
      DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
    }

    /* Remember the rowkey and unpack the stored record into buf. */
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);

    if (!rc) {
      table->status = 0;
    }
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
9104 
9105 /**
9106   @return
9107     HA_EXIT_SUCCESS  OK
9108     other            HA_ERR error code (can be SE-specific)
9109 */
9110 int ha_rocksdb::index_next(uchar *const buf) {
9111   DBUG_ENTER_FUNC();
9112 
9113   bool moves_forward = true;
9114   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9115     moves_forward = false;
9116   }
9117 
9118   int rc = index_next_with_direction(buf, moves_forward);
9119   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9120 
9121   DBUG_RETURN(rc);
9122 }
9123 
9124 /**
9125   @return
9126     HA_EXIT_SUCCESS  OK
9127     other            HA_ERR error code (can be SE-specific)
9128 */
9129 int ha_rocksdb::index_prev(uchar *const buf) {
9130   DBUG_ENTER_FUNC();
9131 
9132   bool moves_forward = false;
9133   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9134     moves_forward = true;
9135   }
9136 
9137   int rc = index_next_with_direction(buf, moves_forward);
9138   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9139 
9140   DBUG_RETURN(rc);
9141 }
9142 
9143 int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
9144   DBUG_ENTER_FUNC();
9145 
9146   int rc;
9147 
9148   if (active_index == pk_index(table, m_tbl_def)) {
9149     rc = rnd_next_with_direction(buf, move_forward);
9150   } else {
9151     THD *thd = ha_thd();
9152     for (;;) {
9153       DEBUG_SYNC(thd, "rocksdb.check_flags_inwd");
9154       if (thd && thd->killed) {
9155         rc = HA_ERR_QUERY_INTERRUPTED;
9156         break;
9157       }
9158       if (m_skip_scan_it_next_call) {
9159         m_skip_scan_it_next_call = false;
9160       } else {
9161         if (move_forward) {
9162           m_scan_it->Next(); /* this call cannot fail */
9163         } else {
9164           m_scan_it->Prev();
9165         }
9166       }
9167       rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index],
9168                                         m_scan_it, !move_forward);
9169       if (rc != HA_EXIT_SUCCESS) {
9170         break;
9171       }
9172       rc = find_icp_matching_index_rec(move_forward, buf);
9173       if (!rc) rc = secondary_index_read(active_index, buf);
9174       if (!should_skip_invalidated_record(rc)) {
9175         break;
9176       }
9177     }
9178   }
9179 
9180   DBUG_RETURN(rc);
9181 }
9182 
9183 /**
9184   @return
9185     HA_EXIT_SUCCESS  OK
9186     other            HA_ERR error code (can be SE-specific)
9187 */
9188 int ha_rocksdb::index_first(uchar *const buf) {
9189   DBUG_ENTER_FUNC();
9190 
9191   m_sk_match_prefix = nullptr;
9192   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9193                ? index_last_intern(buf)
9194                : index_first_intern(buf);
9195   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9196 
9197   DBUG_RETURN(rc);
9198 }
9199 
9200 /**
9201   @return
9202     HA_EXIT_SUCCESS  OK
9203     other            HA_ERR error code (can be SE-specific)
9204 */
9205 int ha_rocksdb::index_last(uchar *const buf) {
9206   DBUG_ENTER_FUNC();
9207 
9208   m_sk_match_prefix = nullptr;
9209   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9210                ? index_first_intern(buf)
9211                : index_last_intern(buf);
9212   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9213 
9214   DBUG_RETURN(rc);
9215 }
9216 
9217 /*
9218   Start scanning from the "first" value.
9219 
9220   The 'first' here means "the first from start of the key space".
9221   For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
9222 
  A picture of a forward-ordered keyspace (remember, the keys have form
  'indexnr-keyval'. Suppose the index we are at has number n)
9225 
9226       (n-1) - ...
9227       ( n )          <--- 1. (n) doesn't exist in the db but it would be here.
9228       ( n ) - aaa       <--- 2. Seek("n") will put us here on the first index
9229       ( n ) - bbb               record.
9230       ( n ) - cc
9231 
9232   So, need to do: Seek(n);
9233 
9234   A backward-ordered keyspace:
9235 
9236       (n+1) - bbb
9237       (n+1) - aaa
9238       (n+1)        <--- (n+1) doesn't exist in the db but would be here.
9239       ( n ) - ccc       <--- 1. We need to be here.
9240       ( n ) - bbb
9241       ( n ) - aaa
9242       ( n )
9243 
9244   So, need to: Seek(n+1);
9245 
9246 */
9247 
int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  /* Use the packed-tuple buffer that matches the index type. */
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  /* Build the smallest possible key for this index (see comment above). */
  int key_start_matching_bytes = kd.get_first_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
    m_scan_it->Seek(index_key);
    /* The Seek above already positioned us; don't advance on the next call. */
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
     */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9302 
9303 /**
9304   @details
9305   Start scanning from the "last" value
9306 
9307   The 'last' here means "the last from start of the key space".
9308   For reverse-ordered key spaces, we will actually read the smallest value.
9309 
  A picture of a forward-ordered keyspace (remember, the keys have form
  'indexnr-keyval'. Suppose we are at a key that has number n)
9312 
9313      (n-1)-something
9314      ( n )-aaa
9315      ( n )-bbb
9316      ( n )-ccc            <----------- Need to seek to here.
9317      (n+1)      <---- Doesn't exist, but would be here.
9318      (n+1)-smth, or no value at all
9319 
9320    RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's
9321    smaller". We can't seek to "(n)-ccc" directly, because we don't know what
9322    is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
9323    to "(n+1)", which is the least possible value that's greater than any value
9324    in index #n.
9325 
9326    So, need to:  it->SeekForPrev(n+1)
9327 
9328    A backward-ordered keyspace:
9329 
9330       (n+1)-something
9331       ( n ) - ccc
9332       ( n ) - bbb
9333       ( n ) - aaa       <---------------- (*) Need to seek here.
9334       ( n ) <--- Doesn't exist, but would be here.
9335       (n-1)-smth, or no value at all
9336 
9337    So, need to:  it->SeekForPrev(n)
9338 */
9339 
int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  /* Use the packed-tuple buffer that matches the index type. */
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  /* Build the key just past the end of this index's key space. */
  int key_end_matching_bytes = kd.get_last_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    /* See the function comment above for why SeekForPrev is used here. */
    setup_scan_iterator(kd, &index_key, false, key_end_matching_bytes);
    m_scan_it->SeekForPrev(index_key);
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      /* Secondary key: honor pushed index conditions before reading a row. */
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
    }

    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
     */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9401 
9402 void ha_rocksdb::unlock_row() {
9403   DBUG_ENTER_FUNC();
9404 
9405   if (m_lock_rows != RDB_LOCK_NONE) {
9406     Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9407     tx->release_lock(m_pk_descr->get_cf(),
9408                      std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
9409   }
9410 
9411   DBUG_VOID_RETURN;
9412 }
9413 
9414 /*
9415   Returning true if SingleDelete can be used.
9416   - Secondary Indexes can always use SingleDelete.
9417   - If the index is PRIMARY KEY, and if all of the columns of the table
9418     are covered by the PRIMARY KEY, SingleDelete can be used.
9419 */
9420 bool ha_rocksdb::can_use_single_delete(const uint index) const {
9421   return (index != pk_index(table, m_tbl_def) ||
9422           (!has_hidden_pk(table) &&
9423            table->key_info[index].ext_key_parts == table->s->fields));
9424 }
9425 
bool ha_rocksdb::skip_unique_check() const {
  /*
    We want to skip unique checks if:
      1) bulk_load is on
      2) this table is in the whitelist of tables to skip and the replication
         lag has reached a large enough value (see unique_check_lag_threshold
         and unique_check_lag_reset_threshold)
      3) the user set unique_checks option to 0, and the table does not have
         any indexes. If the table has secondary keys, then those might become
         inconsistent/corrupted
      4) We're using read-free replication
  */
  return THDVAR(table->in_use, bulk_load) ||
         (m_force_skip_unique_check && m_skip_unique_check) ||
         (my_core::thd_test_options(table->in_use,
                                    OPTION_RELAXED_UNIQUE_CHECKS) &&
          m_tbl_def->m_key_count == 1) ||
#ifdef MARIAROCKS_NOT_YET
         use_read_free_rpl();
#else
         FALSE;
#endif
}
9449 
#ifdef MARIAROCKS_NOT_YET // MDEV-10975
/* Enable/disable forced skipping of unique checks; see skip_unique_check(). */
void ha_rocksdb::set_force_skip_unique_check(bool skip) {
  DBUG_ENTER_FUNC();

  m_force_skip_unique_check = skip;

  DBUG_VOID_RETURN;
}
#endif
9459 
9460 bool ha_rocksdb::commit_in_the_middle() {
9461   return THDVAR(table->in_use, bulk_load) ||
9462          THDVAR(table->in_use, commit_in_the_middle);
9463 }
9464 
9465 /*
9466   Executing bulk commit if it should.
9467   @retval true if bulk commit failed
9468   @retval false if bulk commit was skipped or succeeded
9469 */
9470 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
9471   return commit_in_the_middle() &&
9472          tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
9473          tx->flush_batch();
9474 }
9475 
9476 /*
9477   If table was created without primary key, SQL layer represents the primary
9478   key number as MAX_INDEXES.  Hence, this function returns true if the table
9479   does not contain a primary key. (In which case we generate a hidden
9480   'auto-incremented' pk.)
9481 */
bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
  /* True when the table was created without an explicit PRIMARY KEY. */
  return Rdb_key_def::table_has_hidden_pk(table);
}
9485 
9486 /*
9487   Returns true if given index number is a hidden_pk.
9488   - This is used when a table is created with no primary key.
9489 */
9490 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
9491                               const Rdb_tbl_def *const tbl_def_arg) {
9492   DBUG_ASSERT(table_arg->s != nullptr);
9493 
9494   return (table_arg->s->primary_key == MAX_INDEXES &&
9495           index == tbl_def_arg->m_key_count - 1);
9496 }
9497 
9498 /* Returns index of primary key */
9499 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
9500                           const Rdb_tbl_def *const tbl_def_arg) {
9501   DBUG_ASSERT(table_arg->s != nullptr);
9502 
9503   return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
9504                                                   : table_arg->s->primary_key;
9505 }
9506 
/* Returns true if given index number is a primary key (either the
   user-defined one or the hidden auto-generated one). */
bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
                       const Rdb_tbl_def *const tbl_def_arg) {
  DBUG_ASSERT(table_arg->s != nullptr);

  return index == table_arg->s->primary_key ||
         is_hidden_pk(index, table_arg, tbl_def_arg);
}
9515 
/*
  Maximum length of a single key part supported by this engine; the larger
  limit is allowed when the rocksdb_large_prefix global is enabled.
*/
uint ha_rocksdb::max_supported_key_part_length() const {
  DBUG_ENTER_FUNC();
  DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE
                                   : MAX_INDEX_COL_LEN_SMALL);
}
9521 
9522 const char *ha_rocksdb::get_key_name(const uint index,
9523                                      const TABLE *const table_arg,
9524                                      const Rdb_tbl_def *const tbl_def_arg) {
9525   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9526     return HIDDEN_PK_NAME;
9527   }
9528 
9529   DBUG_ASSERT(table_arg->key_info != nullptr);
9530   DBUG_ASSERT(table_arg->key_info[index].name.str != nullptr);
9531 
9532   return table_arg->key_info[index].name.str;
9533 }
9534 
9535 const char *ha_rocksdb::get_key_comment(const uint index,
9536                                         const TABLE *const table_arg,
9537                                         const Rdb_tbl_def *const tbl_def_arg) {
9538   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9539     return nullptr;
9540   }
9541 
9542   DBUG_ASSERT(table_arg->key_info != nullptr);
9543 
9544   return table_arg->key_info[index].comment.str;
9545 }
9546 
9547 const std::string ha_rocksdb::generate_cf_name(
9548     const uint index, const TABLE *const table_arg,
9549     const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found) {
9550   DBUG_ASSERT(table_arg != nullptr);
9551   DBUG_ASSERT(tbl_def_arg != nullptr);
9552   DBUG_ASSERT(per_part_match_found != nullptr);
9553 
9554   // When creating CF-s the caller needs to know if there was a custom CF name
9555   // specified for a given paritition.
9556   *per_part_match_found = false;
9557 
9558   // Index comment is used to define the column family name specification(s).
9559   // If there was no comment, we get an emptry string, and it means "use the
9560   // default column family".
9561   const char *const comment = get_key_comment(index, table_arg, tbl_def_arg);
9562 
9563   // `get_key_comment` can return `nullptr`, that's why this.
9564   std::string key_comment = comment ? comment : "";
9565 
9566   std::string cf_name = Rdb_key_def::parse_comment_for_qualifier(
9567       key_comment, table_arg, tbl_def_arg, per_part_match_found,
9568       RDB_CF_NAME_QUALIFIER);
9569 
9570   if (IF_PARTITIONING(table_arg->part_info,nullptr) != nullptr && !*per_part_match_found) {
9571     // At this point we tried to search for a custom CF name for a partition,
9572     // but none was specified. Therefore default one will be used.
9573     return "";
9574   }
9575 
9576   // If we didn't find any partitioned/non-partitioned qualifiers, return the
9577   // comment itself.  NOTE: this currently handles returning the cf name
9578   // specified in the index comment in the case of no partitions, which doesn't
9579   // use any qualifiers at the moment. (aka its a special case)
9580   if (cf_name.empty() && !key_comment.empty()) {
9581     return key_comment;
9582   }
9583 
9584   return cf_name;
9585 }
9586 
/*
  Return the table-level COMMENT string.
  NOTE(review): assumes table_arg->s->comment.str is never nullptr (the
  std::string conversion would be UB otherwise) — the SQL layer appears to
  always point it at least at an empty string.
*/
const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) {
  DBUG_ASSERT(table_arg->s != nullptr);

  return table_arg->s->comment.str;
}
9592 
9593 /**
9594   Write a new row
9595 
9596   @param[in] buf                new row data to write
9597   @return
9598     HA_EXIT_SUCCESS  OK
9599     other            HA_ERR error code (can be SE-specific)
9600 */
9601 int ha_rocksdb::write_row(const uchar *const buf) {
9602   DBUG_ENTER_FUNC();
9603 
9604   DBUG_ASSERT(buf != nullptr);
9605   DBUG_ASSERT(buf == table->record[0]);
9606   DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
9607 
9608   /*
9609     Note: "buf == table->record[0]" is copied from innodb. I am not aware of
9610     any use cases where this condition is not true.
9611   */
9612   if (table->next_number_field && buf == table->record[0]) {
9613     int err;
9614     if ((err = update_auto_increment())) {
9615       DBUG_RETURN(err);
9616     }
9617   }
9618 
9619   // clear cache at beginning of write for INSERT ON DUPLICATE
9620   // we may get multiple write->fail->read->update if there are multiple
9621   // values from INSERT
9622   m_dup_pk_found = false;
9623 
9624   const int rv = update_write_row(nullptr, buf, skip_unique_check());
9625 
9626   if (rv == 0) {
9627 #ifdef MARIAROCKS_NOT_YET
9628     stats.rows_inserted++;
9629 #endif
9630     update_row_stats(ROWS_INSERTED);
9631   }
9632 
9633   DBUG_RETURN(rv);
9634 }
9635 
9636 /**
9637   Constructing m_last_rowkey (MyRocks key expression) from
9638   before_update|delete image (MySQL row expression).
9639   m_last_rowkey is normally set during lookup phase, such as
9640   rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
9641   these read functions are skipped and update_rows(), delete_rows() are
9642   called without setting m_last_rowkey. This function sets m_last_rowkey
9643   for Read Free Replication.
9644 */
9645 void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
9646 #ifdef MARIAROCKS_NOT_YET
9647   if (old_data && use_read_free_rpl()) {
9648     const int old_pk_size = m_pk_descr->pack_record(
9649         table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
9650     m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
9651                        &my_charset_bin);
9652   }
9653 #endif
9654 }
9655 
9656 /**
9657   Collect update data for primary key
9658 
9659   @param[in, out] row_info            hold all data for update row, such as
9660                                       new row data/old row data
9661   @return
9662     HA_EXIT_SUCCESS  OK
9663     other            HA_ERR error code (can be SE-specific)
9664 */
9665 int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
9666   int size;
9667 
9668   /*
9669     Get new row key for any insert, and any update where the pk is not hidden.
9670     Row key for updates with hidden pk is handled below.
9671   */
9672   if (!has_hidden_pk(table)) {
9673     row_info->hidden_pk_id = 0;
9674 
9675     row_info->new_pk_unpack_info = &m_pk_unpack_info;
9676 
9677     size = m_pk_descr->pack_record(
9678         table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple,
9679         row_info->new_pk_unpack_info, false, 0, 0, nullptr);
9680   } else if (row_info->old_data == nullptr) {
9681     row_info->hidden_pk_id = update_hidden_pk_val();
9682     size =
9683         m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
9684   } else {
9685     /*
9686       If hidden primary key, rowkey for new record will always be the same as
9687       before
9688     */
9689     size = row_info->old_pk_slice.size();
9690     memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
9691     int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
9692     if (err) {
9693       return err;
9694     }
9695   }
9696 
9697   row_info->new_pk_slice =
9698       rocksdb::Slice((const char *)m_pk_packed_tuple, size);
9699 
9700   return HA_EXIT_SUCCESS;
9701 }
9702 
9703 /**
9704    Check the specified primary key value is unique and also lock the row
9705 
9706   @param[in] key_id           key index
9707   @param[in] row_info         hold all data for update row, such as old row
9708                               data and new row data
9709   @param[out] found           whether the primary key exists before.
9710   @param[out] pk_changed      whether primary key is changed
9711   @return
9712     HA_EXIT_SUCCESS  OK
9713     other            HA_ERR error code (can be SE-specific)
9714 */
9715 int ha_rocksdb::check_and_lock_unique_pk(const uint key_id,
9716                                          const struct update_row_info &row_info,
9717                                          bool *const found) {
9718   DBUG_ASSERT(found != nullptr);
9719 
9720   DBUG_ASSERT(row_info.old_pk_slice.size() == 0 ||
9721               row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0);
9722 
9723   /* Ignore PK violations if this is a optimized 'replace into' */
9724 #ifdef MARIAROCKS_NOT_YET
9725   const bool ignore_pk_unique_check = ha_thd()->lex->blind_replace_into;
9726 #else
9727   const bool ignore_pk_unique_check= false;
9728 #endif
9729 
9730   /*
9731     Perform a read to determine if a duplicate entry exists. For primary
9732     keys, a point lookup will be sufficient.
9733 
9734     note: we intentionally don't set options.snapshot here. We want to read
9735     the latest committed data.
9736   */
9737 
9738   /*
9739     To prevent race conditions like below, it is necessary to
9740     take a lock for a target row. get_for_update() holds a gap lock if
9741     target key does not exist, so below conditions should never
9742     happen.
9743 
9744     1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
9745        -> T2 Put(overwrite) -> T2 commit
9746     2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
9747        -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
9748   */
9749   const rocksdb::Status s =
9750       get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
9751                      ignore_pk_unique_check ? nullptr : &m_retrieved_record);
9752   if (!s.ok() && !s.IsNotFound()) {
9753     return row_info.tx->set_status_error(
9754         table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
9755   }
9756 
9757   bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound();
9758 
9759   /*
9760     If the pk key has ttl, we may need to pretend the row wasn't
9761     found if it is already expired.
9762   */
9763   if (key_found && m_pk_descr->has_ttl() &&
9764       should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
9765                           (row_info.tx->m_snapshot_timestamp
9766                                ? row_info.tx->m_snapshot_timestamp
9767                                : static_cast<int64_t>(std::time(nullptr))))) {
9768     key_found = false;
9769   }
9770 
9771   if (key_found && row_info.old_data == nullptr && m_insert_with_update) {
9772     // In INSERT ON DUPLICATE KEY UPDATE ... case, if the insert failed
9773     // due to a duplicate key, remember the last key and skip the check
9774     // next time
9775     m_dup_pk_found = true;
9776 
9777 #ifndef DBUG_OFF
9778     // save it for sanity checking later
9779     m_dup_pk_retrieved_record.copy(m_retrieved_record.data(),
9780                                    m_retrieved_record.size(), &my_charset_bin);
9781 #endif
9782   }
9783 
9784   *found = key_found;
9785 
9786   return HA_EXIT_SUCCESS;
9787 }
9788 
9789 /**
9790    Check the specified secondary key value is unique and also lock the row
9791 
9792   @param[in] key_id           key index
9793   @param[in] row_info         hold all data for update row, such as old row
9794                               data and new row data
9795   @param[out] found           whether specified key value exists before.
9796   @return
9797     HA_EXIT_SUCCESS  OK
9798     other            HA_ERR error code (can be SE-specific)
9799 */
9800 int ha_rocksdb::check_and_lock_sk(const uint key_id,
9801                                   const struct update_row_info &row_info,
9802                                   bool *const found) {
9803   DBUG_ASSERT(found != nullptr);
9804   *found = false;
9805 
9806   /*
9807     Can skip checking this key if none of the key fields have changed.
9808   */
9809   if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
9810     return HA_EXIT_SUCCESS;
9811   }
9812 
9813   KEY *key_info = nullptr;
9814   uint n_null_fields = 0;
9815   uint user_defined_key_parts = 1;
9816 
9817   key_info = &table->key_info[key_id];
9818   user_defined_key_parts = key_info->user_defined_key_parts;
9819   /*
9820     If there are no uniqueness requirements, there's no need to obtain a
9821     lock for this key.
9822   */
9823   if (!(key_info->flags & HA_NOSAME)) {
9824     return HA_EXIT_SUCCESS;
9825   }
9826 
9827   const Rdb_key_def &kd = *m_key_descr_arr[key_id];
9828 
9829   /*
9830     Calculate the new key for obtaining the lock
9831 
9832     For unique secondary indexes, the key used for locking does not
9833     include the extended fields.
9834   */
9835   int size =
9836       kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
9837                      nullptr, false, 0, user_defined_key_parts, &n_null_fields);
9838   if (n_null_fields > 0) {
9839     /*
9840       If any fields are marked as NULL this will never match another row as
9841       to NULL never matches anything else including another NULL.
9842      */
9843     return HA_EXIT_SUCCESS;
9844   }
9845 
9846   const rocksdb::Slice new_slice =
9847       rocksdb::Slice((const char *)m_sk_packed_tuple, size);
9848 
9849   /*
9850      Acquire lock on the old key in case of UPDATE
9851   */
9852   if (row_info.old_data != nullptr) {
9853     size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
9854                           m_sk_packed_tuple_old, nullptr, false, 0,
9855                           user_defined_key_parts);
9856     const rocksdb::Slice old_slice =
9857         rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);
9858 
9859     const rocksdb::Status s =
9860         get_for_update(row_info.tx, kd.get_cf(), old_slice, nullptr);
9861     if (!s.ok()) {
9862       return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
9863                                            m_table_handler);
9864     }
9865 
9866     /*
9867       If the old and new keys are the same we're done since we've already taken
9868       the lock on the old key
9869     */
9870     if (!new_slice.compare(old_slice)) {
9871       return HA_EXIT_SUCCESS;
9872     }
9873   }
9874 
9875   /*
9876     Perform a read to determine if a duplicate entry exists - since this is
9877     a secondary indexes a range scan is needed.
9878 
9879     note: we intentionally don't set options.snapshot here. We want to read
9880     the latest committed data.
9881   */
9882 
9883   const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());
9884 
9885   /*
9886     This iterator seems expensive since we need to allocate and free
9887     memory for each unique index.
9888 
9889     If this needs to be optimized, for keys without NULL fields, the
9890     extended primary key fields can be migrated to the value portion of the
9891     key. This enables using Get() instead of Seek() as in the primary key
9892     case.
9893 
9894     The bloom filter may need to be disabled for this lookup.
9895   */
9896   uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9897   uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9898   rocksdb::Slice lower_bound_slice;
9899   rocksdb::Slice upper_bound_slice;
9900 
9901   const bool total_order_seek = !check_bloom_and_set_bounds(
9902       ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE,
9903       lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
9904   const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
9905 
9906   const rocksdb::Status s =
9907       get_for_update(row_info.tx, kd.get_cf(), new_slice, nullptr);
9908   if (!s.ok() && !s.IsNotFound()) {
9909     return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
9910                                          m_table_handler);
9911   }
9912 
9913   rocksdb::Iterator *const iter = row_info.tx->get_iterator(
9914       kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice,
9915       upper_bound_slice, true /* read current data */,
9916       false /* acquire snapshot */);
9917   /*
9918     Need to scan the transaction to see if there is a duplicate key.
9919     Also need to scan RocksDB and verify the key has not been deleted
9920     in the transaction.
9921   */
9922   iter->Seek(new_slice);
9923   *found = !read_key_exact(kd, iter, all_parts_used, new_slice,
9924                            row_info.tx->m_snapshot_timestamp);
9925   delete iter;
9926 
9927   return HA_EXIT_SUCCESS;
9928 }
9929 
9930 /**
9931    Enumerate all keys to check their uniquess and also lock it
9932 
9933   @param[in] row_info         hold all data for update row, such as old row
9934                               data and new row data
9935   @param[out] pk_changed      whether primary key is changed
9936   @return
9937     HA_EXIT_SUCCESS  OK
9938     other            HA_ERR error code (can be SE-specific)
9939 */
9940 int ha_rocksdb::check_uniqueness_and_lock(
9941     const struct update_row_info &row_info, bool pk_changed) {
9942   /*
9943     Go through each index and determine if the index has uniqueness
9944     requirements. If it does, then try to obtain a row lock on the new values.
9945     Once all locks have been obtained, then perform the changes needed to
9946     update/insert the row.
9947   */
9948   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
9949     bool found;
9950     int rc;
9951 
9952     if (is_pk(key_id, table, m_tbl_def)) {
9953       if (row_info.old_pk_slice.size() > 0 && !pk_changed) {
9954         found = false;
9955         rc = HA_EXIT_SUCCESS;
9956       } else {
9957         rc = check_and_lock_unique_pk(key_id, row_info, &found);
9958       }
9959     } else {
9960       rc = check_and_lock_sk(key_id, row_info, &found);
9961     }
9962 
9963     if (rc != HA_EXIT_SUCCESS) {
9964       return rc;
9965     }
9966 
9967     if (found) {
9968       /* There is a row with this key already, so error out. */
9969       errkey = key_id;
9970       m_dupp_errkey = errkey;
9971 
9972       return HA_ERR_FOUND_DUPP_KEY;
9973     }
9974   }
9975 
9976   return HA_EXIT_SUCCESS;
9977 }
9978 
9979 /**
9980   Check whether secondary key value is duplicate or not
9981 
9982   @param[in] table_arg         the table currently working on
9983   @param[in  key_def           the key_def is being checked
9984   @param[in] key               secondary key storage data
9985   @param[out] sk_info          hold secondary key memcmp datas(new/old)
9986   @return
9987     HA_EXIT_SUCCESS  OK
9988     other            HA_ERR error code (can be SE-specific)
9989 */
9990 
9991 int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
9992                                    const Rdb_key_def &key_def,
9993                                    const rocksdb::Slice *key,
9994                                    struct unique_sk_buf_info *sk_info) {
9995   uint n_null_fields = 0;
9996   const rocksdb::Comparator *index_comp = key_def.get_cf()->GetComparator();
9997 
9998   /* Get proper SK buffer. */
9999   uchar *sk_buf = sk_info->swap_and_get_sk_buf();
10000 
10001   /* Get memcmp form of sk without extended pk tail */
10002   uint sk_memcmp_size =
10003       key_def.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
10004 
10005   sk_info->sk_memcmp_key =
10006       rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
10007 
10008   if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
10009       index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
10010           0) {
10011     return 1;
10012   }
10013 
10014   sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
10015   return 0;
10016 }
10017 
/*
  Add one key/value pair to the ongoing bulk load.
  When sort==true the pair goes through the transaction's index merge (keys
  arrive unsorted); otherwise it is written straight to the SstFileWriter.
  @return HA_EXIT_SUCCESS or an HA_ERR error code
*/
int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value, bool sort) {
  DBUG_ENTER_FUNC();
  int res;
  THD *thd = ha_thd();
  if (thd && thd->killed) {
    DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
  }

  rocksdb::ColumnFamilyHandle *cf = kd.get_cf();

  // In the case of unsorted inserts, m_sst_info allocated here is not
  // used to store the keys. It is still used to indicate when tables
  // are switched.
  if (m_sst_info == nullptr || m_sst_info->is_done()) {
    m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
                                      kd.get_name(), cf, *rocksdb_db_options,
                                      THDVAR(ha_thd(), trace_sst_api)));
    // Register with the transaction so commit/rollback can finalize us.
    res = tx->start_bulk_load(this, m_sst_info);
    if (res != HA_EXIT_SUCCESS) {
      DBUG_RETURN(res);
    }
  }
  DBUG_ASSERT(m_sst_info);

  if (sort) {
    Rdb_index_merge *key_merge;
    DBUG_ASSERT(cf != nullptr);

    res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
    if (res == HA_EXIT_SUCCESS) {
      res = key_merge->add(key, value);
    }
  } else {
    res = m_sst_info->put(key, value);
  }

  DBUG_RETURN(res);
}
10058 
/*
  Finish the current bulk load (if any): close the SST writer and ingest the
  produced files into the database.
  @param[in] print_client_error  whether ingestion errors are reported to the
                                 client
  @return HA_EXIT_SUCCESS, or HA_ERR_ROCKSDB_BULK_LOAD on ingest failure
*/
int ha_rocksdb::finalize_bulk_load(bool print_client_error) {
  DBUG_ENTER_FUNC();

  int res = HA_EXIT_SUCCESS;

  /* Skip if there are no possible ongoing bulk loads */
  if (m_sst_info) {
    if (m_sst_info->is_done()) {
      // Another path already finished this load; just drop our reference.
      m_sst_info.reset();
      DBUG_RETURN(res);
    }

    Rdb_sst_info::Rdb_sst_commit_info commit_info;

    // Wrap up the current work in m_sst_info and get ready to commit
    // This transfer the responsibility of commit over to commit_info
    res = m_sst_info->finish(&commit_info, print_client_error);
    if (res == 0) {
      // Make sure we have work to do - under race condition we could lose
      // to another thread and end up with no work
      if (commit_info.has_work()) {
        rocksdb::IngestExternalFileOptions opts;
        opts.move_files = true;
        opts.snapshot_consistency = false;
        opts.allow_global_seqno = false;
        opts.allow_blocking_flush = false;

        const rocksdb::Status s = rdb->IngestExternalFile(
            commit_info.get_cf(), commit_info.get_committed_files(), opts);
        if (!s.ok()) {
          if (print_client_error) {
            Rdb_sst_info::report_error_msg(s, nullptr);
          }
          res = HA_ERR_ROCKSDB_BULK_LOAD;
        } else {
          // Mark the list of SST files as committed, otherwise they'll get
          // cleaned up when commit_info destructs
          commit_info.commit();
        }
      }
    }
    m_sst_info.reset();
  }
  DBUG_RETURN(res);
}
10104 
10105 /**
10106   Update an existing primary key record or write a new primary key record
10107 
10108   @param[in] kd                the primary key is being update/write
10109   @param[in] update_row_info   hold all row data, such as old row data and
10110                                new row data
10111   @param[in] pk_changed        whether primary key is changed
10112   @return
10113     HA_EXIT_SUCCESS OK
10114     Other           HA_ERR error code (can be SE-specific)
10115  */
10116 int ha_rocksdb::update_write_pk(const Rdb_key_def &kd,
10117                                 const struct update_row_info &row_info,
10118                                 bool pk_changed) {
10119   uint key_id = kd.get_keyno();
10120   bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
10121   ulonglong bytes_written = 0;
10122 
10123   /*
10124     If the PK has changed, or if this PK uses single deletes and this is an
10125     update, the old key needs to be deleted. In the single delete case, it
10126     might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
10127     resulting in the first PUT(X) showing up.
10128   */
10129   if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
10130                                     can_use_single_delete(key_id)))) {
10131     const rocksdb::Status s = delete_or_singledelete(
10132         key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
10133     if (!s.ok()) {
10134       return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
10135                                            m_table_handler);
10136     } else {
10137       bytes_written = row_info.old_pk_slice.size();
10138     }
10139   }
10140 
10141   if (table->found_next_number_field) {
10142     update_auto_incr_val_from_field();
10143   }
10144 
10145   int rc = HA_EXIT_SUCCESS;
10146   rocksdb::Slice value_slice;
10147   /* Prepare the new record to be written into RocksDB */
10148   if ((rc = m_converter->encode_value_slice(
10149            m_pk_descr, row_info.new_pk_slice, row_info.new_pk_unpack_info,
10150            !row_info.old_pk_slice.empty(), should_store_row_debug_checksums(),
10151            m_ttl_bytes, &m_ttl_bytes_updated, &value_slice))) {
10152     return rc;
10153   }
10154 
10155   const auto cf = m_pk_descr->get_cf();
10156   if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
10157       !hidden_pk) {
10158     /*
10159       Write the primary key directly to an SST file using an SstFileWriter
10160      */
10161     rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
10162                        THDVAR(table->in_use, bulk_load_allow_unsorted));
10163   } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
10164     /*
10165       It is responsibility of the user to make sure that the data being
10166       inserted doesn't violate any unique keys.
10167     */
10168     row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
10169                                                 value_slice);
10170   } else {
10171     const bool assume_tracked = can_assume_tracked(ha_thd());
10172     const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice,
10173                                     assume_tracked);
10174     if (!s.ok()) {
10175       if (s.IsBusy()) {
10176         errkey = table->s->primary_key;
10177         m_dupp_errkey = errkey;
10178         rc = HA_ERR_FOUND_DUPP_KEY;
10179       } else {
10180         rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
10181                                            m_tbl_def, m_table_handler);
10182       }
10183     }
10184   }
10185 
10186   if (rc == HA_EXIT_SUCCESS) {
10187     row_info.tx->update_bytes_written(
10188         bytes_written + row_info.new_pk_slice.size() + value_slice.size());
10189   }
10190   return rc;
10191 }
10192 
10193 /**
10194   update an existing secondary key record or write a new secondary key record
10195 
10196   @param[in] table_arg    Table we're working on
10197   @param[in] kd           The secondary key being update/write
10198   @param[in] row_info     data structure contains old row data and new row data
10199   @param[in] bulk_load_sk whether support bulk load. Currently it is only
10200                           support for write
10201   @return
10202     HA_EXIT_SUCCESS OK
10203     Other           HA_ERR error code (can be SE-specific)
10204  */
10205 int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
10206                                 const Rdb_key_def &kd,
10207                                 const struct update_row_info &row_info,
10208                                 const bool bulk_load_sk) {
10209   int new_packed_size;
10210   int old_packed_size;
10211   int rc = HA_EXIT_SUCCESS;
10212 
10213   rocksdb::Slice new_key_slice;
10214   rocksdb::Slice new_value_slice;
10215   rocksdb::Slice old_key_slice;
10216 
10217   const uint key_id = kd.get_keyno();
10218 
10219   ulonglong bytes_written = 0;
10220 
10221   /*
10222     Can skip updating this key if none of the key fields have changed and, if
10223     this table has TTL, the TTL timestamp has not changed.
10224   */
10225   if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
10226       (!kd.has_ttl() || !m_ttl_bytes_updated)) {
10227     return HA_EXIT_SUCCESS;
10228   }
10229 
10230   bool store_row_debug_checksums = should_store_row_debug_checksums();
10231   new_packed_size =
10232       kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
10233                      m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
10234                      row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);
10235 
10236   if (row_info.old_data != nullptr) {
10237     // The old value
10238     old_packed_size = kd.pack_record(
10239         table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
10240         &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
10241         nullptr, m_ttl_bytes);
10242 
10243     /*
10244       Check if we are going to write the same value. This can happen when
10245       one does
10246         UPDATE tbl SET col='foo'
10247       and we are looking at the row that already has col='foo'.
10248 
10249       We also need to compare the unpack info. Suppose, the collation is
10250       case-insensitive, and unpack info contains information about whether
10251       the letters were uppercase and lowercase.  Then, both 'foo' and 'FOO'
10252       will have the same key value, but different data in unpack_info.
10253 
10254       (note: anyone changing bytewise_compare should take this code into
10255       account)
10256     */
10257     if (old_packed_size == new_packed_size &&
10258         m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
10259         !(kd.has_ttl() && m_ttl_bytes_updated) &&
10260         memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
10261             0 &&
10262         memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
10263                m_sk_tails.get_current_pos()) == 0) {
10264       return HA_EXIT_SUCCESS;
10265     }
10266 
10267     /*
10268       Deleting entries from secondary index should skip locking, but
10269       be visible to the transaction.
10270       (also note that DDL statements do not delete rows, so this is not a DDL
10271        statement)
10272     */
10273     old_key_slice = rocksdb::Slice(
10274         reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
10275 
10276     row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10277                                                          old_key_slice);
10278 
10279     bytes_written = old_key_slice.size();
10280   }
10281 
10282   new_key_slice = rocksdb::Slice(
10283       reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
10284   new_value_slice =
10285       rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
10286                      m_sk_tails.get_current_pos());
10287 
10288   if (bulk_load_sk && row_info.old_data == nullptr) {
10289     rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
10290   } else {
10291     row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
10292                                                 new_value_slice);
10293   }
10294 
10295   row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() +
10296                                     new_value_slice.size());
10297 
10298   return rc;
10299 }
10300 
10301 /**
10302    Update existing indexes(PK/SKs) or write new indexes(PK/SKs)
10303 
10304    @param[in] row_info    hold all row data, such as old key/new key
10305    @param[in] pk_changed  whether primary key is changed
10306    @return
10307      HA_EXIT_SUCCESS OK
10308      Other           HA_ERR error code (can be SE-specific)
10309  */
10310 int ha_rocksdb::update_write_indexes(const struct update_row_info &row_info,
10311                                      const bool pk_changed) {
10312   int rc;
10313   bool bulk_load_sk;
10314 
10315   // The PK must be updated first to pull out the TTL value.
10316   rc = update_write_pk(*m_pk_descr, row_info, pk_changed);
10317   if (rc != HA_EXIT_SUCCESS) {
10318     return rc;
10319   }
10320 
10321   // Update the remaining indexes. Allow bulk loading only if
10322   // allow_sk is enabled
10323   bulk_load_sk = rocksdb_enable_bulk_load_api &&
10324                  THDVAR(table->in_use, bulk_load) &&
10325                  THDVAR(table->in_use, bulk_load_allow_sk);
10326   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
10327     if (is_pk(key_id, table, m_tbl_def)) {
10328       continue;
10329     }
10330 
10331     rc = update_write_sk(table, *m_key_descr_arr[key_id], row_info,
10332                          bulk_load_sk);
10333     if (rc != HA_EXIT_SUCCESS) {
10334       return rc;
10335     }
10336   }
10337 
10338   return HA_EXIT_SUCCESS;
10339 }
10340 
10341 /**
10342   Update an existing row or write a new row
10343 
10344   @param[in] old_data           nullptr for write, non-null for update
10345   @param[in] new_data           non-null for write/update
10346   @param[in] skip_unique_check  whether to check uniqueness
10347   @return
10348     HA_EXIT_SUCCESS OK
10349     Other           HA_ERR error code (can be SE-specific)
10350  */
10351 int ha_rocksdb::update_write_row(const uchar *const old_data,
10352                                  const uchar *const new_data,
10353                                  const bool skip_unique_check) {
10354   DBUG_ENTER_FUNC();
10355 
10356   THD *thd = ha_thd();
10357   if (thd && thd->killed) {
10358     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10359   }
10360 
10361   bool pk_changed = false;
10362   struct update_row_info row_info;
10363 
10364   row_info.old_data = old_data;
10365   row_info.new_data = new_data;
10366   row_info.skip_unique_check = skip_unique_check;
10367   row_info.new_pk_unpack_info = nullptr;
10368   set_last_rowkey(old_data);
10369 
10370   row_info.tx = get_or_create_tx(table->in_use);
10371 
10372   if (old_data != nullptr) {
10373     row_info.old_pk_slice =
10374         rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10375 
10376     /* Determine which indexes need updating. */
10377     calc_updated_indexes();
10378   }
10379 
10380   /*
10381     Get the new row key into row_info.new_pk_slice
10382    */
10383   int rc = get_pk_for_update(&row_info);
10384   if (rc != HA_EXIT_SUCCESS) {
10385     DBUG_RETURN(rc);
10386   }
10387 
10388   /*
10389     For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
10390     always require locking.
10391   */
10392   if (row_info.old_pk_slice.size() > 0) {
10393     pk_changed = row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0;
10394   }
10395 
10396   if (!skip_unique_check) {
10397     /*
10398       Check to see if we are going to have failures because of unique
10399       keys.  Also lock the appropriate key values.
10400     */
10401     rc = check_uniqueness_and_lock(row_info, pk_changed);
10402     if (rc != HA_EXIT_SUCCESS) {
10403       DBUG_RETURN(rc);
10404     }
10405   }
10406 
10407   DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");
10408 
10409   /*
10410     At this point, all locks have been obtained, and all checks for duplicate
10411     keys have been performed. No further errors can be allowed to occur from
10412     here because updates to the transaction will be made and those updates
10413     cannot be easily removed without rolling back the entire transaction.
10414   */
10415   rc = update_write_indexes(row_info, pk_changed);
10416   if (rc != HA_EXIT_SUCCESS) {
10417     DBUG_RETURN(rc);
10418   }
10419 
10420   if (old_data != nullptr) {
10421     row_info.tx->incr_update_count();
10422   } else {
10423     row_info.tx->incr_insert_count();
10424   }
10425 
10426   row_info.tx->log_table_write_op(m_tbl_def);
10427 
10428   if (do_bulk_commit(row_info.tx)) {
10429     DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10430   }
10431 
10432   DBUG_RETURN(HA_EXIT_SUCCESS);
10433 }
10434 
10435 /*
10436  Setting iterator upper/lower bounds for Seek/SeekForPrev.
10437  This makes RocksDB to avoid scanning tombstones outside of
10438  the given key ranges, when prefix_same_as_start=true was not passed
10439  (when prefix bloom filter can not be used).
10440  Inversing upper/lower bound is necessary on reverse order CF.
10441  This covers HA_READ_PREFIX_LAST* case as well. For example,
10442  if given query eq condition was 12 bytes and condition was
10443  0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST,
10444  eq_cond_len was 11 (see calc_eq_cond_len() for details).
10445  If the index was reverse order, upper bound would be
10446  0x0000b3eb003f65c5e78857, and lower bound would be
10447  0x0000b3eb003f65c5e78859. These cover given eq condition range.
10448 
  @param kd                 IN  Key definition for the index being scanned
  @param eq_cond            IN  Equality-condition prefix of the lookup key
  @param bound_len          IN  Length (in bytes) of the bounds to produce
  @param lower_bound        IN  Buffer to receive the packed lower bound
  @param upper_bound        IN  Buffer to receive the packed upper bound
  @param lower_bound_slice  OUT Slice pointing into the lower bound buffer
  @param upper_bound_slice  OUT Slice pointing into the upper bound buffer
*/
10454 void ha_rocksdb::setup_iterator_bounds(
10455     const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len,
10456     uchar *const lower_bound, uchar *const upper_bound,
10457     rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
10458   // If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be
10459   // able to get better bounds just by using index id directly.
10460   if (eq_cond.size() <= Rdb_key_def::INDEX_NUMBER_SIZE) {
10461     DBUG_ASSERT(bound_len == Rdb_key_def::INDEX_NUMBER_SIZE);
10462     uint size;
10463     kd.get_infimum_key(lower_bound, &size);
10464     DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10465     kd.get_supremum_key(upper_bound, &size);
10466     DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10467   } else {
10468     DBUG_ASSERT(bound_len <= eq_cond.size());
10469     memcpy(upper_bound, eq_cond.data(), bound_len);
10470     kd.successor(upper_bound, bound_len);
10471     memcpy(lower_bound, eq_cond.data(), bound_len);
10472     kd.predecessor(lower_bound, bound_len);
10473   }
10474 
10475   if (kd.m_is_reverse_cf) {
10476     *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10477     *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10478   } else {
10479     *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10480     *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10481   }
10482 }
10483 
10484 /*
10485   Open a cursor
10486 */
10487 
10488 void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
10489                                      rocksdb::Slice *const slice,
10490                                      const bool use_all_keys,
10491                                      const uint eq_cond_len) {
10492   DBUG_ASSERT(slice->size() >= eq_cond_len);
10493 
10494   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10495 
10496   bool skip_bloom = true;
10497 
10498   const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
10499   // The size of m_scan_it_lower_bound (and upper) is technically
10500   // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers.  Rather
10501   // than recalculating that number, we pass in the max of eq_cond_len and
10502   // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
10503   // max_packed_sk_len, hence ensuring no buffer overrun.
10504   //
10505   // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
10506   // used.
10507   if (check_bloom_and_set_bounds(
10508           ha_thd(), kd, eq_cond, use_all_keys,
10509           std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_NUMBER_SIZE),
10510           m_scan_it_lower_bound, m_scan_it_upper_bound,
10511           &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
10512     skip_bloom = false;
10513   }
10514 
10515   /*
10516     In some cases, setup_scan_iterator() is called multiple times from
10517     the same query but bloom filter can not always be used.
10518     Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
10519     (id1, id2).
10520      select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
10521     '100');
10522     In this case, setup_scan_iterator() is called twice, the first time is for
10523     (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
10524     '100').
10525     If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
10526     for the
10527     first condition but not for the second condition.
10528     If bloom filter condition is changed, currently it is necessary to destroy
10529     and
10530     re-create Iterator.
10531   */
10532   if (m_scan_it_skips_bloom != skip_bloom) {
10533     release_scan_iterator();
10534   }
10535 
10536   /*
10537     SQL layer can call rnd_init() multiple times in a row.
10538     In that case, re-use the iterator, but re-position it at the table start.
10539   */
10540   if (!m_scan_it) {
10541     const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
10542     if (commit_in_the_middle()) {
10543       DBUG_ASSERT(m_scan_it_snapshot == nullptr);
10544       m_scan_it_snapshot = rdb->GetSnapshot();
10545 
10546       auto read_opts = rocksdb::ReadOptions();
10547       // TODO(mung): set based on WHERE conditions
10548       read_opts.total_order_seek = true;
10549       read_opts.snapshot = m_scan_it_snapshot;
10550       m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
10551     } else {
10552       m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
10553                                    m_scan_it_lower_bound_slice,
10554                                    m_scan_it_upper_bound_slice);
10555     }
10556     m_scan_it_skips_bloom = skip_bloom;
10557   }
10558 }
10559 
10560 void ha_rocksdb::release_scan_iterator() {
10561   delete m_scan_it;
10562   m_scan_it = nullptr;
10563 
10564   if (m_scan_it_snapshot) {
10565     rdb->ReleaseSnapshot(m_scan_it_snapshot);
10566     m_scan_it_snapshot = nullptr;
10567   }
10568 }
10569 
10570 void ha_rocksdb::setup_iterator_for_rnd_scan() {
10571   uint key_size;
10572 
10573   int key_start_matching_bytes =
10574       m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
10575 
10576   rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
10577 
10578   setup_scan_iterator(*m_pk_descr, &table_key, false, key_start_matching_bytes);
10579   m_scan_it->Seek(table_key);
10580   m_skip_scan_it_next_call = true;
10581 }
10582 
10583 /**
10584   @return
10585     HA_EXIT_SUCCESS  OK
10586     other            HA_ERR error code (can be SE-specific)
10587 */
10588 int ha_rocksdb::rnd_init(bool scan) {
10589   DBUG_ENTER_FUNC();
10590 
10591   THD *thd = ha_thd();
10592   if (thd && thd->killed) {
10593     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10594   }
10595 
10596   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10597 
10598   // when this table is being updated, decode all fields
10599   m_converter->setup_field_decoders(table->read_set,
10600                                     m_lock_rows == RDB_LOCK_WRITE);
10601 
10602   if (scan) {
10603     m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
10604     setup_iterator_for_rnd_scan();
10605   } else {
10606     /* We don't need any preparations for rnd_pos() calls. */
10607   }
10608 
10609   // If m_lock_rows is on then we will be doing a get_for_update when accessing
10610   // the index, so don't acquire the snapshot right away.  Otherwise acquire
10611   // the snapshot immediately.
10612   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10613 
10614   DBUG_RETURN(HA_EXIT_SUCCESS);
10615 }
10616 
10617 /**
10618   @return
10619     HA_EXIT_SUCCESS  OK
10620     other            HA_ERR error code (can be SE-specific)
10621 */
10622 int ha_rocksdb::rnd_next(uchar *const buf) {
10623   DBUG_ENTER_FUNC();
10624 
10625   int rc;
10626   for (;;) {
10627     rc = rnd_next_with_direction(buf, true);
10628     if (!should_recreate_snapshot(rc, m_rnd_scan_is_new_snapshot)) {
10629       break; /* exit the loop */
10630     }
10631     // release the snapshot and iterator and then regenerate them
10632     Rdb_transaction *tx = get_or_create_tx(table->in_use);
10633     tx->release_snapshot();
10634     release_scan_iterator();
10635     setup_iterator_for_rnd_scan();
10636   }
10637 
10638   m_rnd_scan_is_new_snapshot = false;
10639 
10640   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
10641 
10642   DBUG_RETURN(rc);
10643 }
10644 
10645 /*
10646   See also secondary_index_read().
10647 */
int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;
  THD *thd = ha_thd();

  table->status = STATUS_NOT_FOUND;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (!m_scan_it || !is_valid(m_scan_it)) {
    /*
      We can get here when SQL layer has called

        h->index_init(PRIMARY);
        h->index_read_map(full index tuple, HA_READ_KEY_EXACT);

      In this case, we should return EOF.
    */
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  // Loop until a visible row is produced, the iterator leaves this table's
  // key range, or the statement is interrupted.  'continue' skips rows that
  // are hidden by TTL or were invalidated since the snapshot was taken.
  for (;;) {
    DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd");
    if (thd && thd->killed) {
      rc = HA_ERR_QUERY_INTERRUPTED;
      break;
    }

    // Right after a Seek the iterator already stands on the row to return;
    // consume that position instead of advancing past it.
    if (m_skip_scan_it_next_call) {
      m_skip_scan_it_next_call = false;
    } else {
      if (move_forward) {
        m_scan_it->Next(); /* this call cannot fail */
      } else {
        m_scan_it->Prev(); /* this call cannot fail */
      }
    }

    if (!is_valid(m_scan_it)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    /* check if we're out of this table */
    const rocksdb::Slice key = m_scan_it->key();
    if (!m_pk_descr->covers_key(key)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    if (m_lock_rows != RDB_LOCK_NONE) {
      /*
        Lock the row we've just read.

        Now we call get_for_update which will 1) Take a lock and 2) Will fail
        if the row was deleted since the snapshot was taken.
      */
      Rdb_transaction *const tx = get_or_create_tx(table->in_use);
      DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");

      // Skip rows whose TTL has expired as of the snapshot timestamp.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
                              tx->m_snapshot_timestamp)) {
        continue;
      }

      const rocksdb::Status s =
          get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
      // The row disappeared between reading the key and taking the lock
      // (concurrent delete); move on to the next row when that is allowed.
      if (s.IsNotFound() &&
          should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) {
        continue;
      }

      if (!s.ok()) {
        DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
                                         m_tbl_def, m_table_handler));
      }

      // If we called get_for_update() use the value from that call not from
      // the iterator as it may be stale since we don't have a snapshot
      // when m_lock_rows is not RDB_LOCK_NONE.
      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, buf);
    } else {
      // Use the value from the iterator
      rocksdb::Slice value = m_scan_it->value();

      // Skip rows whose TTL has expired as of the snapshot timestamp.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(
              *m_pk_descr, value,
              get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
        continue;
      }

      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, &value, buf);
    }

    table->status = 0;
    break;
  }

  if (!rc) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_next++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
10761 
10762 int ha_rocksdb::rnd_end() {
10763   DBUG_ENTER_FUNC();
10764 
10765   release_scan_iterator();
10766 
10767   DBUG_RETURN(HA_EXIT_SUCCESS);
10768 }
10769 
10770 /**
10771   @return
10772     HA_EXIT_SUCCESS  OK
10773     other            HA_ERR error code (can be SE-specific)
10774 */
10775 int ha_rocksdb::index_init(uint idx, bool sorted) {
10776   DBUG_ENTER_FUNC();
10777 
10778   THD *thd = ha_thd();
10779   if (thd && thd->killed) {
10780     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10781   }
10782 
10783   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10784   DBUG_ASSERT(tx != nullptr);
10785 
10786    m_start_range= NULL;
10787   // when this table is being updated, decode all fields
10788   m_converter->setup_field_decoders(table->read_set,
10789                                     m_lock_rows == RDB_LOCK_WRITE);
10790 
10791   if (!m_keyread_only) {
10792     m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap);
10793   }
10794 
10795   // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
10796   // when accessing the index, so don't acquire the snapshot right away.
10797   // Otherwise acquire the snapshot immediately.
10798   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10799 
10800   active_index = idx;
10801 
10802   DBUG_RETURN(HA_EXIT_SUCCESS);
10803 }
10804 
10805 /**
10806   @return
10807     HA_EXIT_SUCCESS      OK
10808 */
10809 int ha_rocksdb::index_end() {
10810   DBUG_ENTER_FUNC();
10811 
10812   release_scan_iterator();
10813 
10814   bitmap_free(&m_lookup_bitmap);
10815 
10816   active_index = MAX_KEY;
10817   in_range_check_pushed_down = FALSE;
10818   m_start_range= NULL;
10819 
10820   DBUG_RETURN(HA_EXIT_SUCCESS);
10821 }
10822 
10823 /**
10824   @return
10825     HA_EXIT_SUCCESS  OK
10826     other            HA_ERR error code (can be SE-specific)
10827 */
10828 int ha_rocksdb::truncate() {
10829   DBUG_ENTER_FUNC();
10830 
10831   DBUG_ASSERT(m_tbl_def != nullptr);
10832 
10833   // Save table name to use later
10834   std::string table_name = m_tbl_def->full_tablename();
10835 
10836   // Delete current table
10837   int err = delete_table(m_tbl_def);
10838   if (err != HA_EXIT_SUCCESS) {
10839     DBUG_RETURN(err);
10840   }
10841 
10842   // Reset auto_increment_value to 1 if auto-increment feature is enabled
10843   // By default, the starting valid value for auto_increment_value is 1
10844   DBUG_RETURN(create_table(
10845       table_name, table,
10846       table->found_next_number_field ? 1 : 0 /* auto_increment_value */));
10847 }
10848 
10849 /*
10850   Delete the row we've last read. The row is also passed as parameter.
10851 
10852   @detail
10853     The caller guarantees table buf points to the row that was just read.
10854     The row is either table->record[0] or table->record[1].
    (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)".)
10856 
10857   @return
10858     HA_EXIT_SUCCESS  OK
10859     other            HA_ERR error code (can be SE-specific)
10860 */
10861 int ha_rocksdb::delete_row(const uchar *const buf) {
10862   DBUG_ENTER_FUNC();
10863 
10864   DBUG_ASSERT(buf != nullptr);
10865 
10866   set_last_rowkey(buf);
10867 
10868   rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10869   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10870   ulonglong bytes_written = 0;
10871 
10872   const uint index = pk_index(table, m_tbl_def);
10873   rocksdb::Status s =
10874       delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
10875   if (!s.ok()) {
10876     DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
10877                                      m_table_handler));
10878   } else {
10879     bytes_written = key_slice.size();
10880   }
10881 
10882   longlong hidden_pk_id = 0;
10883   if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) {
10884     int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
10885     if (err) {
10886       DBUG_RETURN(err);
10887     }
10888   }
10889 
10890   // Delete the record for every secondary index
10891   for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10892     if (!is_pk(i, table, m_tbl_def)) {
10893       int packed_size;
10894       const Rdb_key_def &kd = *m_key_descr_arr[i];
10895       packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
10896                                    nullptr, false, hidden_pk_id);
10897       rocksdb::Slice secondary_key_slice(
10898           reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
10899       /* Deleting on secondary key doesn't need any locks: */
10900       tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10901                                                   secondary_key_slice);
10902       bytes_written += secondary_key_slice.size();
10903     }
10904   }
10905 
10906   tx->incr_delete_count();
10907   tx->log_table_write_op(m_tbl_def);
10908 
10909   if (do_bulk_commit(tx)) {
10910     DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10911   }
10912 #ifdef MARIAROCKS_NOT_YET
10913   stats.rows_deleted++;
10914 #endif
10915   update_row_stats(ROWS_DELETED);
10916   tx->update_bytes_written(bytes_written);
10917 
10918   DBUG_RETURN(HA_EXIT_SUCCESS);
10919 }
10920 
10921 rocksdb::Status ha_rocksdb::delete_or_singledelete(
10922     uint index, Rdb_transaction *const tx,
10923     rocksdb::ColumnFamilyHandle *const column_family,
10924     const rocksdb::Slice &key) {
10925   const bool assume_tracked = can_assume_tracked(ha_thd());
10926   if (can_use_single_delete(index)) {
10927     return tx->single_delete(column_family, key, assume_tracked);
10928   }
10929   return tx->delete_key(column_family, key, assume_tracked);
10930 }
10931 
void ha_rocksdb::update_stats(void) {
  DBUG_ENTER_FUNC();

  // Recompute the cached table statistics from the per-index stats.
  stats.records = 0;
  stats.index_file_length = 0ul;
  stats.data_file_length = 0ul;
  stats.mean_rec_length = 0;

  for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
    if (is_pk(i, table, m_tbl_def)) {
      // The primary key supplies the row count and the data size.
      stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
      stats.records = m_pk_descr->m_stats.m_rows;
    } else {
      // Secondary indexes accumulate into the total index size.
      stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
    }
  }

  DBUG_VOID_RETURN;
}
10951 
10952 /**
10953   @return
10954     HA_EXIT_SUCCESS  OK
10955     HA_EXIT_FAILURE  Error
10956 */
int ha_rocksdb::info(uint flag) {
  DBUG_ENTER_FUNC();

  if (!table) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (flag & HA_STATUS_VARIABLE) {
    /*
      Test only to simulate corrupted stats
    */
    DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
                    m_pk_descr->m_stats.m_actual_disk_size =
                        -m_pk_descr->m_stats.m_actual_disk_size;);

    update_stats();

    /*
      If any stats are negative due to bad cached stats, re-run analyze table
      and re-retrieve the stats.
    */
    if (static_cast<longlong>(stats.data_file_length) < 0 ||
        static_cast<longlong>(stats.index_file_length) < 0 ||
        static_cast<longlong>(stats.records) < 0) {
      if (calculate_stats_for_table()) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      update_stats();
    }

    // if number of records is hardcoded, we do not want to force computation
    // of memtable cardinalities
    if (stats.records == 0 || (rocksdb_force_compute_memtable_stats &&
                               rocksdb_debug_optimizer_n_rows == 0)) {
      // First, compute SST files stats
      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = get_range(pk_index(table, m_tbl_def), buf);
      uint64_t sz = 0;
      uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
      // recompute SST files stats only if records count is 0
      if (stats.records == 0) {
        rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz,
                                 include_flags);
        // Estimate the row count from the total size using an assumed
        // average key+value footprint per row.
        stats.records += sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
        stats.data_file_length += sz;
      }
      // Second, compute memtable stats. This call is expensive, so cache
      // values computed for some time.
      uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime;
      uint64_t time = (cachetime == 0) ? 0 : my_interval_timer() / 1000;
      if (cachetime == 0 ||
          time > m_table_handler->m_mtcache_last_update + cachetime) {
        uint64_t memtableCount;
        uint64_t memtableSize;

        // the stats below are calculated from the skiplist which is a
        // probabilistic data structure, so the results vary between test
        // runs; it also can return 0 for quite large tables, which means
        // that cardinality for memtable-only indexes will be reported as 0
        rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r,
                                         &memtableCount, &memtableSize);

        // Atomically update all of these fields at the same time
        if (cachetime > 0) {
          // m_mtcache_lock is used as a guard counter: only the thread that
          // raises it from 0 writes the cached values, so two concurrent
          // info() calls cannot interleave partial updates.
          if (m_table_handler->m_mtcache_lock.fetch_add(
                  1, std::memory_order_acquire) == 0) {
            m_table_handler->m_mtcache_count = memtableCount;
            m_table_handler->m_mtcache_size = memtableSize;
            m_table_handler->m_mtcache_last_update = time;
          }
          m_table_handler->m_mtcache_lock.fetch_sub(1,
                                                    std::memory_order_release);
        }

        stats.records += memtableCount;
        stats.data_file_length += memtableSize;
      } else {
        // Cached data is still valid, so use it instead
        stats.records += m_table_handler->m_mtcache_count;
        stats.data_file_length += m_table_handler->m_mtcache_size;
      }

      // Do like InnoDB does. stats.records=0 confuses the optimizer
      if (stats.records == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) {
        stats.records++;
      }
    }

    if (rocksdb_debug_optimizer_n_rows > 0)
      stats.records = rocksdb_debug_optimizer_n_rows;

    if (stats.records != 0) {
      stats.mean_rec_length = stats.data_file_length / stats.records;
    }
  }

  if (flag & HA_STATUS_CONST) {
    ref_length = m_pk_descr->max_storage_fmt_length();

    // Fill in rec_per_key (cardinality estimates) for every keypart of
    // every visible index.
    for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
      if (is_hidden_pk(i, table, m_tbl_def)) {
        continue;
      }
      KEY *const k = &table->key_info[i];
      for (uint j = 0; j < k->ext_key_parts; j++) {
        const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
        uint x;  // average number of rows per distinct (j+1)-column prefix

        if (k_stats.m_distinct_keys_per_prefix.size() > j &&
            k_stats.m_distinct_keys_per_prefix[j] > 0) {
          x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j];
          /*
            If the number of rows is less than the number of prefixes (due to
            sampling), the average number of rows with the same prefix is 1.
           */
          if (x == 0) {
            x = 1;
          }
        } else {
          x = 0;
        }
        if (x > stats.records) x = stats.records;
        if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
            rocksdb_debug_optimizer_n_rows > 0) {
          // Fake cardinality implementation. For example, (idx1, idx2, idx3)
          // index
          /*
            Make MariaRocks behave the same way as MyRocks does:
            1. SQL layer thinks that unique secondary indexes are not extended
               with PK columns (both in MySQL and MariaDB)
            2. MariaDB also thinks that indexes with partially-covered columns
               are not extended with PK columns. Use the same number of
               keyparts that MyRocks would use.
          */
          uint ext_key_parts2;
          if (k->flags & HA_NOSAME)
            ext_key_parts2= k->ext_key_parts;  // This is #1
          else
            ext_key_parts2= m_key_descr_arr[i]->get_key_parts(); // This is #2.

          // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1.
          // rec_per_key for the whole index is 1, and multiplied by 2^n if
          // n suffix columns of the index are not used.
          x = 1 << (ext_key_parts2 - j - 1);
        }
        k->rec_per_key[j] = x;
      }
    }

    stats.create_time = m_tbl_def->get_create_time();
  }

  if (flag & HA_STATUS_TIME) {
    stats.update_time = m_tbl_def->m_update_time;
  }

  if (flag & HA_STATUS_ERRKEY) {
    /*
      Currently we support only primary keys so we know which key had a
      uniqueness violation.
    */
    errkey = m_dupp_errkey;
    dup_ref = m_pk_tuple;  // TODO(?): this should store packed PK.
  }

  if (flag & HA_STATUS_AUTO) {
    stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11129 
11130 void ha_rocksdb::position(const uchar *const record) {
11131   DBUG_ENTER_FUNC();
11132 
11133   longlong hidden_pk_id = 0;
11134   if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
11135     DBUG_ASSERT(false);  // should never reach here
11136   }
11137 
11138   /*
11139     Get packed primary key value from the record.
11140 
11141     (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
11142     handle most cases, but there is an exception to this: when slave applies
11143     RBR events, it fills the record and then calls position(); rnd_pos())
11144 
11145     Also note that we *can* rely on PK column values being available. This is
11146     because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
11147     When that is true, table->prepare_for_position() adds PK columns into the
11148     read set (this may potentially disable index-only access when PK column
11149     cannot be restored from its mem-comparable form in the secondary indexes).
11150   */
11151   const uint packed_size = m_pk_descr->pack_record(
11152       table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);
11153 
11154   /*
11155     It could be that mem-comparable form of PK occupies less than ref_length
11156     bytes. Fill the remainder with zeros.
11157   */
11158   if (ref_length > packed_size) {
11159     memset(ref + packed_size, 0, ref_length - packed_size);
11160   }
11161 
11162   DBUG_VOID_RETURN;
11163 }
11164 
11165 /**
11166   @return
11167     HA_EXIT_SUCCESS  OK
11168     other            HA_ERR error code (can be SE-specific)
11169 */
11170 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
11171   DBUG_ENTER_FUNC();
11172 
11173   int rc;
11174   size_t len;
11175 
11176 #ifdef MARIAROCKS_NOT_YET
11177   stats.rows_requested++;
11178 #endif
11179   len = m_pk_descr->key_length(table,
11180                                rocksdb::Slice((const char *)pos, ref_length));
11181   if (len == size_t(-1)) {
11182     DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */
11183   }
11184 
11185   rc = get_row_by_rowid(buf, pos, len);
11186 
11187   if (!rc) {
11188 #ifdef MARIAROCKS_NOT_YET
11189     stats.rows_read++;
11190 #endif
11191     update_row_stats(ROWS_READ);
11192   }
11193 
11194   DBUG_RETURN(rc);
11195 }
11196 
11197 /*
11198   @brief
11199     Calculate (if needed) the bitmap of indexes that are modified by the
11200     current query
11201 
11202   @detail
11203     The calculation done by checking index definitions against the
11204   table->write_set
11205     column bitmap.
11206 */
11207 
11208 void ha_rocksdb::calc_updated_indexes() {
11209   if (!m_update_scope_is_valid) {
11210     m_update_scope_is_valid = true;
11211     m_update_scope.clear_all();
11212 
11213     for (uint keynr = 0; keynr < table->s->keys; keynr++) {
11214       const Rdb_key_def &kd = *m_key_descr_arr[keynr];
11215       /* Walk over all key parts, including the "extended key" suffix */
11216       const uint key_parts = kd.get_key_parts();
11217       for (uint kp = 0; kp < key_parts; kp++) {
11218         if (has_hidden_pk(table) && kp + 1 == key_parts) break;
11219 
11220         Field *const field = kd.get_table_field_for_part_no(table, kp);
11221         if (bitmap_is_set(table->write_set, field->field_index)) {
11222           m_update_scope.set_bit(keynr);
11223           break;
11224         }
11225       }
11226     }
11227   }
11228 }
11229 
11230 /**
11231   Update an existing row
11232   @param[in] old_data           nullptr for write, non-null for update
11233   @param[in] new_data           non-null for write/update
11234   @return
11235     HA_EXIT_SUCCESS  OK
11236     other            HA_ERR error code (can be SE-specific)
11237 */
11238 int ha_rocksdb::update_row(const uchar *const old_data,
11239                            const uchar *const new_data) {
11240   DBUG_ENTER_FUNC();
11241 
11242   DBUG_ASSERT(old_data != nullptr);
11243   DBUG_ASSERT(new_data != nullptr);
11244   DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
11245   /*
11246     old_data points to record we're updating. It is the same as the record
11247     we've just read (for multi-table UPDATE, too, because SQL layer will make
11248     an rnd_pos() call to re-read the record before calling update_row())
11249   */
11250   DBUG_ASSERT(new_data == table->record[0]);
11251 
11252   const int rv = update_write_row(old_data, new_data, skip_unique_check());
11253 
11254   if (rv == 0) {
11255 #ifdef MARIAROCKS_NOT_YET
11256     stats.rows_updated++;
11257 #endif
11258     update_row_stats(ROWS_UPDATED);
11259   }
11260 
11261   DBUG_RETURN(rv);
11262 }
11263 
11264 /*
11265   MariaDB's temporary: MyRocks has this function in sql/handler.cc:
11266 */
11267 
11268 bool can_hold_read_locks_on_select(THD *thd, thr_lock_type lock_type)
11269 {
11270   return (lock_type == TL_READ_WITH_SHARED_LOCKS
11271           || lock_type == TL_READ_NO_INSERT
11272           || (lock_type != TL_IGNORE
11273             && thd->lex->sql_command != SQLCOM_SELECT));
11274 }
11275 
11276 
11277 /* The following function was copied from ha_blackhole::store_lock: */
/*
  Decide (a) MyRocks's internal row-locking mode (m_lock_rows) and
  (b) the table-level lock type reported back to the SQL layer.
  Returns the advanced THR_LOCK_DATA pointer array, as the handler API
  requires.
*/
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    // Any write-grade lock request: take row write locks.
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    // SELECT ... LOCK IN SHARE MODE: take row read locks.
    m_lock_rows = RDB_LOCK_READ;
  } else if (lock_type != TL_IGNORE) {
    // Plain reads normally take no row locks at all...
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          can_hold_read_locks_on_select(thd, lock_type)) {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        // NOTE(review): condition order mirrors the InnoDB original; do not
        // "simplify" it — each clause encodes a separate statement class.
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
11354 
11355 void ha_rocksdb::read_thd_vars(THD *const thd) {
11356   m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
11357   m_converter->set_verify_row_debug_checksums(
11358       THDVAR(thd, verify_row_debug_checksums));
11359   m_checksums_pct = THDVAR(thd, checksums_pct);
11360 }
11361 
11362 ulonglong ha_rocksdb::table_flags() const
11363 {
11364   DBUG_ENTER_FUNC();
11365 
11366   /*
11367     HA_BINLOG_STMT_CAPABLE
11368     Upstream:  MyRocks advertises itself as it supports SBR, but has additional
11369       checks in ha_rocksdb::external_lock()/ start_stmt() which will return an
11370       error if one tries to run the statement.
11371       Exceptions: @@rocksdb_unsafe_for_binlog or we are an SQL slave thread.
11372 
11373     MariaDB: Inform the upper layer we don't support SBR, so it switches to RBR
11374       if possible. The exceptions are the same as with the upstream.
11375 
11376     HA_REC_NOT_IN_SEQ
11377       If we don't set it, filesort crashes, because it assumes rowids are
11378       1..8 byte numbers
11379     HA_PRIMARY_KEY_IN_READ_INDEX
11380       This flag is always set, even for tables that:
11381       - have no PK
11382       - have some (or all) of PK that can't be decoded from the secondary
11383         index.
11384   */
11385   THD *thd= ha_thd();
11386   DBUG_RETURN(HA_BINLOG_ROW_CAPABLE |
11387               ((thd && (THDVAR(thd, unsafe_for_binlog) ||thd->rgi_slave))?
11388                 HA_BINLOG_STMT_CAPABLE : 0) |
11389               HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
11390               HA_PRIMARY_KEY_IN_READ_INDEX |
11391               HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
11392               HA_PARTIAL_COLUMN_READ |
11393               HA_TABLE_SCAN_ON_INDEX);
11394 }
11395 
11396 
11397 /**
11398   @return
11399     HA_EXIT_SUCCESS  OK
11400     other            HA_ERR error code (cannot be SE-specific)
11401 */
int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);

  int res = HA_EXIT_SUCCESS;
#if 0
  // MariaDB uses a different way to implement this, see ha_rocksdb::table_flags

  int binlog_format = my_core::thd_binlog_format(thd);
  bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);

  /*
    A note about (*) below: In fb/mysql, LOCK TABLE myrocks_table WRITE
    - produces an error when @@binlog_format='ROW'.
    - does not produce an error with @@binlog_format='STATEMENT'

    We cannot easily provide this behavior in MariaDB, because LOCK ROWS
    changes @@binlog_format for the duration of the statement.

    Instead we will make the check in ha_rocksdb::start_stmt and will produce
    an error in the first DML statement done under LOCK TABLE

    (**) - The same happens for other "DDL-type" commands, so allow them here
    as well.
  */
  if (lock_type == F_WRLCK && !thd->rgi_slave && !unsafe_for_binlog &&
      binlog_format != BINLOG_FORMAT_ROW &&
      binlog_format != BINLOG_FORMAT_UNSPEC &&
      thd->lex->sql_command != SQLCOM_LOCK_TABLES &&  // (*)
      thd->lex->sql_command != SQLCOM_ANALYZE &&   // (**)
      thd->lex->sql_command != SQLCOM_OPTIMIZE &&  // (**)
      thd->lex->sql_command != SQLCOM_FLUSH &&  // (**)
      my_core::thd_binlog_filter_ok(thd)) {
    my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }
#endif

  if (lock_type == F_UNLCK) {
    // Statement (or LOCK TABLES scope) is ending for this table.
    Rdb_transaction *const tx = get_tx_from_thd(thd);

    if (tx) {
      tx->io_perf_end_and_record(&m_io_perf);
      tx->m_n_mysql_tables_in_use--;
      if (tx->m_n_mysql_tables_in_use == 0 &&
          !my_core::thd_test_options(thd,
                                     OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
        /*
          Do like InnoDB: when we get here, it's time to commit a
          single-statement transaction.

          If the statement involved multiple tables, this code will be executed
          for each of them, but that's ok because non-first tx->commit() calls
          will be no-ops.
        */
        if (tx->commit_or_rollback()) {
          res = HA_ERR_INTERNAL_ERROR;
        }
      }
    }
  } else {
    // Lock acquisition path (F_RDLCK / F_WRLCK).
    // Only READ COMMITTED and REPEATABLE READ are supported.
    if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
        my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
      my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0),
               tx_isolation_names[my_core::thd_tx_isolation(thd)]);
      DBUG_RETURN(HA_ERR_UNSUPPORTED);
    }
    /*
      It's nice to do the following on start of every statement. The problem
      is, handler->start_stmt() is not called for INSERTs.
      So, we put this code here.
    */
    Rdb_transaction *const tx = get_or_create_tx(thd);
    read_thd_vars(thd);

    // Force recomputation of the updated-indexes bitmap for this statement.
    m_update_scope_is_valid = false;

    if (skip_unique_check()) {
      // Unique checks are being skipped; reject statements whose duplicate
      // handling (REPLACE / ON DUPLICATE KEY UPDATE) depends on them.
      if ((thd->lex->sql_command == SQLCOM_INSERT ||
           thd->lex->sql_command == SQLCOM_LOAD ||
           thd->lex->sql_command == SQLCOM_REPLACE) &&
          (thd->lex->duplicates == DUP_REPLACE ||
           thd->lex->duplicates == DUP_UPDATE)) {
        my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query());
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
    }

    if (lock_type == F_WRLCK) {
      if (tx->is_tx_read_only()) {
        my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }

#ifdef MARIADB_NOT_YET
      if (thd->get_explicit_snapshot()) {
        my_error(ER_UPDATES_WITH_EXPLICIT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
#endif

      /*
        SQL layer signals us to take a write lock. It does so when starting DML
        statement. We should put locks on the rows we're reading.

        Note: sometimes, external_lock() can be called without a prior
        ::store_lock call.  That's why we need to set lock_* members here, too.
      */
      m_lock_rows = RDB_LOCK_WRITE;

      if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
          thd->lex->sql_command == SQLCOM_DROP_INDEX ||
          thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
        tx->m_ddl_transaction = true;
      }
    }
    tx->m_n_mysql_tables_in_use++;
    rocksdb_register_tx(rocksdb_hton, thd, tx);
    tx->io_perf_start(&m_io_perf);
  }

  DBUG_RETURN(res);
}
11526 
11527 /**
11528   @note
11529   A quote from ha_innobase::start_stmt():
11530   <quote>
11531   MySQL calls this function at the start of each SQL statement inside LOCK
11532   TABLES. Inside LOCK TABLES the ::external_lock method does not work to
11533   mark SQL statement borders.
11534   </quote>
11535 
11536   @return
11537     HA_EXIT_SUCCESS  OK
11538 */
11539 
11540 int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
11541   DBUG_ENTER_FUNC();
11542 
11543   DBUG_ASSERT(thd != nullptr);
11544 
11545   Rdb_transaction *const tx = get_or_create_tx(thd);
11546   read_thd_vars(thd);
11547   rocksdb_register_tx(ht, thd, tx);
11548   tx->io_perf_start(&m_io_perf);
11549 
11550   DBUG_RETURN(HA_EXIT_SUCCESS);
11551 }
11552 
11553 rocksdb::Range get_range(uint32_t i,
11554                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11555                          int offset1, int offset2) {
11556   uchar *buf_begin = buf;
11557   uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
11558   rdb_netbuf_store_index(buf_begin, i + offset1);
11559   rdb_netbuf_store_index(buf_end, i + offset2);
11560 
11561   return rocksdb::Range(
11562       rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
11563       rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
11564 }
11565 
/* Convenience overload: build the range from a key definition's index
   number, with caller-supplied start/limit offsets. */
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
11571 
11572 rocksdb::Range get_range(const Rdb_key_def &kd,
11573                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
11574   if (kd.m_is_reverse_cf) {
11575     return myrocks::get_range(kd, buf, 1, 0);
11576   } else {
11577     return myrocks::get_range(kd, buf, 0, 1);
11578   }
11579 }
11580 
/* Range covering the i-th key of this handler's table. */
rocksdb::Range ha_rocksdb::get_range(
    const int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
11585 
11586 /*
11587  This function is called with total_order_seek=true, but
11588  upper/lower bound setting is not necessary.
11589  Boundary set is useful when there is no matching key,
11590  but in drop_index_thread's case, it means index is marked as removed,
11591  so no further seek will happen for the index id.
11592 */
11593 static bool is_myrocks_index_empty(rocksdb::ColumnFamilyHandle *cfh,
11594                                    const bool is_reverse_cf,
11595                                    const rocksdb::ReadOptions &read_opts,
11596                                    const uint index_id) {
11597   bool index_removed = false;
11598   uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
11599   rdb_netbuf_store_uint32(key_buf, index_id);
11600   const rocksdb::Slice key =
11601       rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf));
11602   std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
11603   rocksdb_smart_seek(is_reverse_cf, it.get(), key);
11604   if (!it->Valid()) {
11605     index_removed = true;
11606   } else {
11607     if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE)) {
11608       // Key does not have same prefix
11609       index_removed = true;
11610     }
11611   }
11612   return index_removed;
11613 }
11614 
11615 /*
11616   Drop index thread's main logic
11617 */
11618 
/*
  Background thread body: periodically wakes up, and for every index that is
  marked as "being dropped" in the data dictionary, physically removes its
  data (DeleteFilesInRange + CompactRange) and, once the key space is
  verified empty, marks the drop as finished.
*/
void Rdb_drop_index_thread::run() {
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);

  for (;;) {
    // The stop flag might be set by shutdown command
    // after drop_index_thread releases signal_mutex
    // (i.e. while executing expensive Seek()). To prevent drop_index_thread
    // from entering long cond_timedwait, checking if stop flag
    // is true or not is needed, with drop_index_interrupt_mutex held.
    if (m_stop) {
      break;
    }

    // Sleep long when idle; poll every minute while drops are pending.
    timespec ts;
    int sec= dict_manager.is_drop_index_empty()
                     ? 24 * 60 * 60  // no filtering
                     : 60;           // filtering
    set_timespec(ts,sec);

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Release the signal mutex while doing the (potentially slow) work below.
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    std::unordered_set<GL_INDEX_ID> indices;
    dict_manager.get_ongoing_drop_indexes(&indices);
    if (!indices.empty()) {
      std::unordered_set<GL_INDEX_ID> finished;
      rocksdb::ReadOptions read_opts;
      read_opts.total_order_seek = true;  // disable bloom filter

      for (const auto d : indices) {
        uint32 cf_flags = 0;
        if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "RocksDB: Failed to get column family flags "
              "from cf id %u. MyRocks data dictionary may "
              "get corrupted.",
              d.cf_id);
          if (rocksdb_ignore_datadic_errors)
          {
            sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
                            "trying to continue");
            continue;
          }
          abort();
        }
        rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id);
        DBUG_ASSERT(cfh);
        const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;

        uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
        rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
                                         is_reverse_cf ? 0 : 1);
        // Cheap bulk removal first: drop whole SST files inside the range.
        rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
                                                    &range.start, &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        // Then compact the remainder so partially-covered files are cleaned.
        status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start,
                                   &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) {
          finished.insert(d);
        }
      }

      if (!finished.empty()) {
        dict_manager.finish_drop_indexes(finished);
      }
    }
    // Re-acquire before checking the stop flag at the top of the loop.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
}
11708 
11709 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
11710   std::string str;
11711   if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) {
11712     // We were not passed table name?
11713     DBUG_ASSERT(0);
11714     return nullptr;
11715   }
11716 
11717   return ddl_manager.find(str);
11718 }
11719 
11720 /*
11721   Overload func for delete table ---it deletes table meta data in data
11722   dictionary immediately and delete real data in background thread(async)
11723 
11724   @param tbl       IN      MyRocks table definition
11725 
11726   @return
11727     HA_EXIT_SUCCESS  OK
11728     other            HA_ERR error code (can be SE-specific)
11729 */
11730 int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) {
11731   DBUG_ENTER_FUNC();
11732 
11733   DBUG_ASSERT(tbl != nullptr);
11734   DBUG_ASSERT(m_tbl_def == nullptr || m_tbl_def == tbl);
11735 
11736   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11737   rocksdb::WriteBatch *const batch = wb.get();
11738 
11739   dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);
11740 
11741   /*
11742     Remove the table entry in data dictionary (this will also remove it from
11743     the persistent data dictionary).
11744   */
11745   ddl_manager.remove(tbl, batch, true);
11746 
11747   int err = dict_manager.commit(batch);
11748   if (err) {
11749     DBUG_RETURN(err);
11750   }
11751 
11752   rdb_drop_idx_thread.signal();
11753   // avoid dangling pointer
11754   m_tbl_def = nullptr;
11755   DBUG_RETURN(HA_EXIT_SUCCESS);
11756 }
11757 
11758 /*
11759   Note: the following function is called when the table is not open. That is,
11760   this->table==nullptr, pk_key_descr==nullptr, etc.
11761 
11762   tablename points to line in form "./dbname/tablename".
11763 
11764   @return
11765     HA_EXIT_SUCCESS  OK
11766     other            HA_ERR error code (can be SE-specific)
11767 */
11768 
11769 int ha_rocksdb::delete_table(const char *const tablename) {
11770   DBUG_ENTER_FUNC();
11771 
11772   DBUG_ASSERT(tablename != nullptr);
11773 
11774   /* Find the table in the hash */
11775   Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
11776   if (!tbl) {
11777     DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
11778   }
11779 
11780   DBUG_RETURN(delete_table(tbl));
11781 }
11782 
/*
  Delete every row of every index of the given table, synchronously, by
  iterating each index's key space and issuing (Single)Delete per key.
  Used e.g. by TRUNCATE-style paths where data must go away immediately.
*/
int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
  const rocksdb::WriteOptions wo =
      rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  // Disable bloom-filter-based seek optimizations: we do a full scan.
  opts.total_order_seek = true;
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  ulonglong bytes_written = 0;

  uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  rocksdb::Slice lower_bound_slice;
  rocksdb::Slice upper_bound_slice;

  /*
    Remove all records in each index.
    (This is is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on rocksdb side)
  */
  for (uint i = 0; i < tbl->m_key_count; i++) {
    const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
    // Infimum key == the index-number prefix; scan starts there.
    kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
    rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
    const rocksdb::Slice table_key(key_buf, key_len);
    setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE,
                          lower_bound_buf, upper_bound_buf, &lower_bound_slice,
                          &upper_bound_slice);
    DBUG_ASSERT(key_len == Rdb_key_def::INDEX_NUMBER_SIZE);
    // Bound the iterator to this index so it stops at the index boundary.
    opts.iterate_lower_bound = &lower_bound_slice;
    opts.iterate_upper_bound = &upper_bound_slice;
    std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));

    it->Seek(table_key);
    while (it->Valid()) {
      const rocksdb::Slice key = it->key();
      if (!kd.covers_key(key)) {
        break;  // Walked past this index's key space.
      }

      rocksdb::Status s;
      // SingleDelete is cheaper but only valid when a key is written at most
      // once between deletes; can_use_single_delete() decides per index.
      if (can_use_single_delete(i)) {
        s = rdb->SingleDelete(wo, cf, key);
      } else {
        s = rdb->Delete(wo, cf, key);
      }

      if (!s.ok()) {
        return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                    m_table_handler);
      }
      bytes_written += key.size();
      it->Next();
    }
  }

  tx->update_bytes_written(bytes_written);

  return HA_EXIT_SUCCESS;
}
11845 
11846 /**
11847   @return
11848     HA_EXIT_SUCCESS  OK
11849     other            HA_ERR error code (cannot be SE-specific)
11850 */
11851 int ha_rocksdb::rename_table(const char *const from, const char *const to) {
11852   DBUG_ENTER_FUNC();
11853 
11854   std::string from_str;
11855   std::string to_str;
11856   std::string from_db;
11857   std::string to_db;
11858 
11859   int rc = rdb_normalize_tablename(from, &from_str);
11860   if (rc != HA_EXIT_SUCCESS) {
11861     DBUG_RETURN(rc);
11862   }
11863 
11864   rc = rdb_split_normalized_tablename(from_str, &from_db);
11865   if (rc != HA_EXIT_SUCCESS) {
11866     DBUG_RETURN(rc);
11867   }
11868 
11869   rc = rdb_normalize_tablename(to, &to_str);
11870   if (rc != HA_EXIT_SUCCESS) {
11871     DBUG_RETURN(rc);
11872   }
11873 
11874   rc = rdb_split_normalized_tablename(to_str, &to_db);
11875   if (rc != HA_EXIT_SUCCESS) {
11876     DBUG_RETURN(rc);
11877   }
11878 
11879   // If the user changed the database part of the name then validate that the
11880   // 'to' database exists.
11881   if (from_db != to_db && !rdb_database_exists(to_db)) {
11882     // If we return a RocksDB specific error code here we get
11883     // "error: 206 - Unknown error 206".  InnoDB gets
11884     // "error -1 - Unknown error -1" so let's match them.
11885     DBUG_RETURN(-1);
11886   }
11887 
11888   DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";);
11889 
11890   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11891   rocksdb::WriteBatch *const batch = wb.get();
11892   dict_manager.lock();
11893 
11894   if (ddl_manager.rename(from_str, to_str, batch)) {
11895     rc = HA_ERR_NO_SUCH_TABLE;
11896   } else {
11897     rc = dict_manager.commit(batch);
11898   }
11899   dict_manager.unlock();
11900 
11901   DBUG_RETURN(rc);
11902 }
11903 
11904 /**
11905   check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
11906   if new and old definition are compatible
11907 
11908   @details If there are no other explicit signs like changed number of
11909   fields this function will be called by compare_tables()
11910   (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm
11911   file.
11912 
11913 */
11914 
bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
                                            uint table_changes) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(info != nullptr);

  // this function is needed only for online alter-table
  // Always report "incompatible", forcing the SQL layer to rebuild the table.
  DBUG_RETURN(COMPATIBLE_DATA_NO);
}
11924 
11925 /**
11926   @return
11927     HA_EXIT_SUCCESS  OK
11928 */
11929 int ha_rocksdb::extra(enum ha_extra_function operation) {
11930   DBUG_ENTER_FUNC();
11931 
11932   switch (operation) {
11933     case HA_EXTRA_KEYREAD:
11934       m_keyread_only = true;
11935       break;
11936     case HA_EXTRA_NO_KEYREAD:
11937       m_keyread_only = false;
11938       break;
11939     case HA_EXTRA_FLUSH:
11940       /*
11941         If the table has blobs, then they are part of m_retrieved_record.
11942         This call invalidates them.
11943       */
11944       m_retrieved_record.Reset();
11945       break;
11946     case HA_EXTRA_INSERT_WITH_UPDATE:
11947       // INSERT ON DUPLICATE KEY UPDATE
11948       if (rocksdb_enable_insert_with_update_caching) {
11949         m_insert_with_update = true;
11950       }
11951       break;
11952     case HA_EXTRA_NO_IGNORE_DUP_KEY:
11953       // PAIRED with HA_EXTRA_INSERT_WITH_UPDATE or HA_EXTRA_WRITE_CAN_REPLACE
11954       // that indicates the end of REPLACE / INSERT ON DUPLICATE KEY
11955       m_insert_with_update = false;
11956       break;
11957 
11958     default:
11959       break;
11960   }
11961 
11962   DBUG_RETURN(HA_EXIT_SUCCESS);
11963 }
11964 
11965 /*
11966   Given a starting key and an ending key, estimate the number of rows that
11967   will exist between the two keys.
11968 */
11969 ha_rows ha_rocksdb::records_in_range(uint inx, const key_range *const min_key,
11970                                      const key_range *const max_key,
11971                                      page_range *pages) {
11972   DBUG_ENTER_FUNC();
11973 
11974   ha_rows ret = THDVAR(ha_thd(), records_in_range);
11975   if (ret) {
11976     DBUG_RETURN(ret);
11977   }
11978   if (table->force_index) {
11979     const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
11980     if (force_rows) {
11981       DBUG_RETURN(force_rows);
11982     }
11983   }
11984 
11985   const Rdb_key_def &kd = *m_key_descr_arr[inx];
11986 
11987   uint size1 = 0;
11988   if (min_key) {
11989     size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
11990                                 m_record_buffer,
11991                                 min_key->key, min_key->keypart_map);
11992     if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
11993         min_key->flag == HA_READ_PREFIX_LAST ||
11994         min_key->flag == HA_READ_AFTER_KEY) {
11995       kd.successor(m_sk_packed_tuple, size1);
11996     }
11997   } else {
11998     kd.get_infimum_key(m_sk_packed_tuple, &size1);
11999   }
12000 
12001   uint size2 = 0;
12002   if (max_key) {
12003     size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
12004                                 m_record_buffer,
12005                                 max_key->key, max_key->keypart_map);
12006     if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
12007         max_key->flag == HA_READ_PREFIX_LAST ||
12008         max_key->flag == HA_READ_AFTER_KEY) {
12009       kd.successor(m_sk_packed_tuple_old, size2);
12010     }
12011     // pad the upper key with FFFFs to make sure it is more than the lower
12012     if (size1 > size2) {
12013       memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
12014       size2 = size1;
12015     }
12016   } else {
12017     kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
12018   }
12019 
12020   const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
12021   const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);
12022 
12023   // slice1 >= slice2 means no row will match
12024   if (slice1.compare(slice2) >= 0) {
12025     DBUG_RETURN(HA_EXIT_SUCCESS);
12026   }
12027 
12028   rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
12029                    kd.m_is_reverse_cf ? slice1 : slice2);
12030 
12031   uint64_t sz = 0;
12032   auto disk_size = kd.m_stats.m_actual_disk_size;
12033   if (disk_size == 0) disk_size = kd.m_stats.m_data_size;
12034   auto rows = kd.m_stats.m_rows;
12035   if (rows == 0 || disk_size == 0) {
12036     rows = 1;
12037     disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
12038   }
12039 
12040   // Getting statistics, including from Memtables
12041   uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
12042   rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
12043   ret = rows * sz / disk_size;
12044   uint64_t memTableCount;
12045   rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
12046   ret += memTableCount;
12047 
12048   /*
12049     GetApproximateSizes() gives estimates so ret might exceed stats.records.
12050     MySQL then decides to use full index scan rather than range scan, which
12051     is not efficient for most cases.
12052     To prevent this, changing estimated records slightly smaller than
12053     stats.records.
12054   */
12055   if (ret >= stats.records) {
12056     ret = stats.records * 0.99;
12057   }
12058 
12059   if (rocksdb_debug_optimizer_n_rows > 0) {
12060     ret = rocksdb_debug_optimizer_n_rows;
12061   } else if (ret == 0) {
12062     ret = 1;
12063   }
12064 
12065   DBUG_RETURN(ret);
12066 }
12067 
12068 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
12069   DBUG_ENTER_FUNC();
12070 
12071   DBUG_ASSERT(create_info != nullptr);
12072 
12073   if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
12074     create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
12075   }
12076 
12077   DBUG_VOID_RETURN;
12078 }
12079 
12080 /**
12081   @brief
12082   Doing manual compaction on OPTIMIZE TABLE in RocksDB.
12083   Compaction itself is executed by background thread in RocksDB, but
12084   CompactRange() waits until compaction completes so this function
12085   may take a long time.
12086   Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE
12087   triggers manual compaction for all indexes of the table.
12088   @details
12089   Compaction range is from the beginning of the index id to
12090   the first row of the next index id. When using reverse order
12091   column family, the first row of the next index id should be
12092   the last row of the previous index id.
12093 
12094   @return
12095     HA_ADMIN_OK      OK
12096     other            HA_ADMIN error code
12097 */
12098 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
12099   DBUG_ENTER_FUNC();
12100 
12101   DBUG_ASSERT(thd != nullptr);
12102   DBUG_ASSERT(check_opt != nullptr);
12103 
12104   for (uint i = 0; i < table->s->keys; i++) {
12105     uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12106     auto range = get_range(i, buf);
12107     const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(),
12108                                                 m_key_descr_arr[i]->get_cf(),
12109                                                 &range.start, &range.limit);
12110     if (!s.ok()) {
12111       DBUG_RETURN(rdb_error_to_mysql(s));
12112     }
12113   }
12114 
12115   DBUG_RETURN(HA_EXIT_SUCCESS);
12116 }
12117 
/*
  Recompute index statistics for a set of indexes.

  Phases:
    1. Build, per column family, the list of key ranges covering the
       requested indexes.
    2. Read index statistics stored in SST table properties for those
       ranges and merge them per index.
    3. Optionally sample the memtables to supplement stats for indexes
       whose data is mostly unflushed.
    4. Store the result in ddl_manager and persist it.

  @param to_recalc          indexes to recalculate, keyed by global index id
  @param include_memtables  also sample memtable data (phase 3)

  @return HA_EXIT_SUCCESS or an error code from rdb_error_to_mysql
*/
static int calculate_stats(
    const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
        &to_recalc,
    bool include_memtables) {
  DBUG_ENTER_FUNC();

  // find per column family key ranges which need to be queried
  std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
      ranges;
  std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
  // One contiguous buffer holds the packed [start, limit) endpoints for
  // every range; each range consumes 2 * INDEX_NUMBER_SIZE bytes.  The
  // rocksdb::Range objects point into this buffer, so it must outlive them.
  std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);

  uchar *bufp = buf.data();
  for (const auto &it : to_recalc) {
    const GL_INDEX_ID index_id = it.first;
    auto &kd = it.second;
    ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp));
    bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE;

    // Start every requested index from empty stats; entries not present
    // in this map are ignored when merging SST data below.
    stats[index_id] = Rdb_index_stats(index_id);
    DBUG_ASSERT(kd->get_key_parts() > 0);
    stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts());
  }

  // get RocksDB table properties for these ranges
  rocksdb::TablePropertiesCollection props;
  for (const auto &it : ranges) {
    const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
    const auto status = rdb->GetPropertiesOfTablesInRange(
        it.first, &it.second[0], it.second.size(), &props);
    DBUG_ASSERT(props.size() >= old_size);
    if (!status.ok()) {
      DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql(
          status, "Could not access RocksDB properties"));
    }
  }

  int num_sst = 0;
  for (const auto &it : props) {
    std::vector<Rdb_index_stats> sst_stats;
    Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
    /*
      sst_stats is a list of index statistics for indexes that have entries
      in the current SST file.
    */
    for (const auto &it1 : sst_stats) {
      /*
        Only update statistics for indexes that belong to this SQL table.

        The reason is: We are walking through all SST files that have
        entries from this table (and so can compute good statistics). For
        other SQL tables, it can be that we're only seeing a small fraction
        of table's entries (and so we can't update statistics based on that).
      */
      if (stats.find(it1.m_gl_index_id) == stats.end()) {
        continue;
      }

      auto it_index = to_recalc.find(it1.m_gl_index_id);
      DBUG_ASSERT(it_index != to_recalc.end());
      if (it_index == to_recalc.end()) {
        continue;
      }
      stats[it1.m_gl_index_id].merge(
          it1, true, it_index->second->max_storage_fmt_length());
    }
    num_sst++;
  }

  if (include_memtables) {
    // calculate memtable cardinality
    Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
    auto read_opts = rocksdb::ReadOptions();
    // Restrict iteration to memtables only; SST data was handled above.
    read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
    for (const auto &it_kd : to_recalc) {
      const std::shared_ptr<const Rdb_key_def> &kd = it_kd.second;
      Rdb_index_stats &stat = stats[kd->get_gl_index_id()];

      uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = myrocks::get_range(*kd, r_buf);
      uint64_t memtableCount;
      uint64_t memtableSize;
      rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount,
                                       &memtableSize);
      if (memtableCount < (uint64_t)stat.m_rows / 10) {
        // skip tables that already have enough stats from SST files to reduce
        // overhead and avoid degradation of big tables stats by sampling from
        // relatively tiny (less than 10% of full data set) memtable dataset
        continue;
      }

      std::unique_ptr<rocksdb::Iterator> it =
          std::unique_ptr<rocksdb::Iterator>(
              rdb->NewIterator(read_opts, kd->get_cf()));

      rocksdb::Slice first_index_key((const char *)r_buf,
                                     Rdb_key_def::INDEX_NUMBER_SIZE);

      cardinality_collector.Reset();
      // Walk memtable entries belonging to this index and fold each key
      // into the cardinality sample.
      for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
        const rocksdb::Slice key = it->key();
        if (!kd->covers_key(key)) {
          break;  // end of this index
        }
        stat.m_rows++;

        cardinality_collector.ProcessKey(key, kd.get(), &stat);
      }
      cardinality_collector.AdjustStats(&stat);
    }
  }

  // set and persist new stats
  ddl_manager.set_stats(stats);
  ddl_manager.persist_stats(true);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12236 
12237 int ha_rocksdb::calculate_stats_for_table() {
12238   DBUG_ENTER_FUNC();
12239 
12240   std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12241       ids_to_check;
12242   for (uint i = 0; i < table->s->keys; i++) {
12243     ids_to_check.insert(std::make_pair(m_key_descr_arr[i]->get_gl_index_id(),
12244                                        m_key_descr_arr[i]));
12245   }
12246 
12247   DBUG_RETURN(calculate_stats(ids_to_check, true));
12248 }
12249 
12250 /*
12251   @return
12252     HA_ADMIN_OK      OK
12253     other            HA_ADMIN error code
12254 */
12255 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
12256   DBUG_ENTER_FUNC();
12257 
12258   if (table) {
12259     if (calculate_stats_for_table() != HA_EXIT_SUCCESS) {
12260       DBUG_RETURN(HA_ADMIN_FAILED);
12261     }
12262   }
12263 
12264   // A call to ::info is needed to repopulate some SQL level structs. This is
12265   // necessary for online analyze because we cannot rely on another ::open
12266   // call to call info for us.
12267   if (info(HA_STATUS_CONST | HA_STATUS_VARIABLE) != HA_EXIT_SUCCESS) {
12268     DBUG_RETURN(HA_ADMIN_FAILED);
12269   }
12270 
12271   DBUG_RETURN(HA_ADMIN_OK);
12272 }
12273 
/**
  Reserve the next auto-increment value for an INSERT.

  @param off                auto_increment_offset
  @param inc                auto_increment_increment
  @param nb_desired_values  estimated number of values needed (ignored here)
  @param[out] first_value   the reserved value
  @param[out] nb_reserved_values  always 1 (no batching)
*/
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed.  The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement.  Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value.  Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars2");

  // An offset larger than the increment makes the series ill-defined;
  // fall back to offset 1 in that case.
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    DBUG_ASSERT(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use CAS operation in a loop to make sure automically get the next auto
    // increment value while ensuring that we don't wrap around to a negative
    // number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    // On CAS failure, compare_exchange_weak reloads new_val with the
    // current counter, so the loop retries with fresh data.
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    if (last_val > max_val) {
      new_val = std::numeric_limits<ulonglong>::max();
    } else {
      // Loop until we can correctly update the atomic value
      do {
        DBUG_ASSERT(last_val > 0);
        // Calculate the next value in the auto increment series: offset
        // + N * increment where N is 0, 1, 2, ...
        //
        // For further information please visit:
        // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
        //
        // The following is confusing so here is an explanation:
        // To get the next number in the sequence above you subtract out the
        // offset, calculate the next sequence (N * increment) and then add the
        // offset back in.
        //
        // The additions are rearranged to avoid overflow.  The following is
        // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
        // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
        //
        // (a+b)/c
        // = (a - a%c + a%c + b - b%c + b%c) / c
        // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
        // = a/c + b/c + (a%c + b%c) / c
        //
        // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
        // following statement.
        ulonglong n =
            (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

        // Check if n * inc + off will overflow. This can only happen if we have
        // an UNSIGNED BIGINT field.
        if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
          DBUG_ASSERT(max_val == std::numeric_limits<ulonglong>::max());
          // The 'last_val' value is already equal to or larger than the largest
          // value in the sequence.  Continuing would wrap around (technically
          // the behavior would be undefined).  What should we do?
          // We could:
          //   1) set the new value to the last possible number in our sequence
          //      as described above.  The problem with this is that this
          //      number could be smaller than a value in an existing row.
          //   2) set the new value to the largest possible number.  This number
          //      may not be in our sequence, but it is guaranteed to be equal
          //      to or larger than any other value already inserted.
          //
          //  For now I'm going to take option 2.
          //
          //  Returning ULLONG_MAX from get_auto_increment will cause the SQL
          //  layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
          //  the SE API for get_auto_increment, inserts will fail with
          //  ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
          //  inserts will fail with ER_DUP_ENTRY for other types (or no failure
          //  if the column is in a non-unique SK).
          new_val = std::numeric_limits<ulonglong>::max();
          auto_incr = new_val;  // Store the largest value into auto_incr
          break;
        }

        new_val = n * inc + off;

        // Attempt to store the new value (plus 1 since m_auto_incr_val contains
        // the next available value) into the atomic value.  If the current
        // value no longer matches what we have in 'last_val' this will fail and
        // we will repeat the loop (`last_val` will automatically get updated
        // with the current value).
        //
        // See above explanation for inc == 1 for why we use std::min.
      } while (!auto_incr.compare_exchange_weak(
          last_val, std::min(new_val + 1, max_val)));
    }
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
12411 
12412 #ifndef DBUG_OFF
12413 
12414 /* Debugger help function */
12415 static char dbug_item_print_buf[512];
12416 
12417 const char *dbug_print_item(Item *const item) {
12418   char *const buf = dbug_item_print_buf;
12419   String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
12420   str.length(0);
12421   if (!item) return "(Item*)nullptr";
12422   item->print(&str, QT_ORDINARY);
12423   if (str.c_ptr() == buf) {
12424     return buf;
12425   } else {
12426     return "Couldn't fit into buffer";
12427   }
12428 }
12429 
12430 #endif /*DBUG_OFF*/
12431 
12432 /**
12433   SQL layer calls this function to push an index condition.
12434 
12435   @details
12436     The condition is for index keyno (only one condition can be pushed at a
12437     time).
12438     The caller guarantees that condition refers only to index fields; besides
12439     that, fields must have
12440 
12441       $field->part_of_key.set_bit(keyno)
12442 
12443     which means that
12444 
12445        (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
12446 
12447     which means that field value can be restored from the index tuple.
12448 
12449   @return
12450     Part of condition we couldn't check (always nullptr).
12451 */
12452 
12453 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
12454   DBUG_ENTER_FUNC();
12455 
12456   DBUG_ASSERT(keyno != MAX_KEY);
12457   DBUG_ASSERT(idx_cond != nullptr);
12458 
12459   pushed_idx_cond = idx_cond;
12460   pushed_idx_cond_keyno = keyno;
12461   in_range_check_pushed_down = TRUE;
12462 
12463   /* We will check the whole condition */
12464   DBUG_RETURN(nullptr);
12465 }
12466 
12467 /*
12468   Checks if inplace alter is supported for a given operation.
12469 */
12470 
12471 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
12472     TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
12473   DBUG_ENTER_FUNC();
12474 
12475   DBUG_ASSERT(ha_alter_info != nullptr);
12476 
12477   if (ha_alter_info->handler_flags &
12478       ~(ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
12479         ALTER_DROP_UNIQUE_INDEX |
12480         ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
12481         ALTER_PARTITIONED |
12482         ALTER_ADD_UNIQUE_INDEX |
12483         ALTER_INDEX_ORDER |
12484         ALTER_CHANGE_CREATE_OPTION)) {
12485     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12486   }
12487 
12488   /* We don't support unique keys on table w/ no primary keys */
12489   if ((ha_alter_info->handler_flags &
12490        ALTER_ADD_UNIQUE_INDEX) &&
12491       has_hidden_pk(altered_table)) {
12492     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12493   }
12494 
12495   /* We only support changing auto_increment for table options. */
12496   if ((ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) &&
12497       !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) {
12498     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12499   }
12500 
12501   /* FIXME: MDEV-16099 Use alter algorithm=nocopy or algorithm=instant
12502 		for non-InnoDB engine */
12503   DBUG_RETURN(my_core::HA_ALTER_INPLACE_COPY_LOCK);
12504 }
12505 
12506 /**
12507   Allows the storage engine to update internal structures with concurrent
12508   writes blocked. If check_if_supported_inplace_alter() returns
12509   HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_COPY_LOCK.
12510   this function is called with exclusive lock otherwise the same level of
12511   locking as for inplace_alter_table() will be used.
12512 
12513   @note Storage engines are responsible for reporting any errors by
12514   calling my_error()/print_error()
12515 
12516   @note If this function reports error, commit_inplace_alter_table()
12517   will be called with commit= false.
12518 
12519   @note For partitioning, failing to prepare one partition, means that
12520   commit_inplace_alter_table() will be called to roll back changes for
12521   all partitions. This means that commit_inplace_alter_table() might be
12522   called without prepare_inplace_alter_table() having been called first
12523   for a given partition.
12524 
12525   @param    altered_table     TABLE object for new version of table.
12526   @param    ha_alter_info     Structure describing changes to be done
12527                               by ALTER TABLE and holding data used
12528                               during in-place alter.
12529 
12530   @retval   true              Error
12531   @retval   false             Success
12532 */
12533 bool ha_rocksdb::prepare_inplace_alter_table(
12534     TABLE *const altered_table,
12535     my_core::Alter_inplace_info *const ha_alter_info) {
12536   DBUG_ENTER_FUNC();
12537 
12538   DBUG_ASSERT(altered_table != nullptr);
12539   DBUG_ASSERT(ha_alter_info != nullptr);
12540 
12541   Rdb_tbl_def *new_tdef = nullptr;
12542   std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr;
12543   std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
12544   uint old_n_keys = m_tbl_def->m_key_count;
12545   uint new_n_keys = altered_table->s->keys;
12546   std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
12547   std::unordered_set<GL_INDEX_ID> dropped_index_ids;
12548   uint n_dropped_keys = 0;
12549   uint n_added_keys = 0;
12550   ulonglong max_auto_incr = 0;
12551 
12552   if (ha_alter_info->handler_flags &
12553       (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
12554        ALTER_DROP_UNIQUE_INDEX |
12555        ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
12556        ALTER_ADD_UNIQUE_INDEX)) {
12557     if (has_hidden_pk(altered_table)) {
12558       new_n_keys += 1;
12559     }
12560 
12561     const TABLE *const old_table = table;
12562     old_key_descr = m_tbl_def->m_key_descr_arr;
12563     new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];
12564 
12565     new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
12566     new_tdef->m_key_descr_arr = new_key_descr;
12567     new_tdef->m_key_count = new_n_keys;
12568     new_tdef->m_auto_incr_val =
12569         m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
12570     new_tdef->m_hidden_pk_val =
12571         m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);
12572 
12573     if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
12574       /* Delete the new key descriptors */
12575       delete[] new_key_descr;
12576 
12577       /*
12578         Explicitly mark as nullptr so we don't accidentally remove entries
12579         from data dictionary on cleanup (or cause double delete[]).
12580         */
12581       new_tdef->m_key_descr_arr = nullptr;
12582       delete new_tdef;
12583 
12584       my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
12585       DBUG_RETURN(HA_EXIT_FAILURE);
12586     }
12587 
12588     uint i;
12589     uint j;
12590 
12591     /* Determine which(if any) key definition(s) need to be dropped */
12592     for (i = 0; i < ha_alter_info->index_drop_count; i++) {
12593       const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i];
12594       for (j = 0; j < old_n_keys; j++) {
12595         const KEY *const old_key =
12596             &old_table->key_info[old_key_descr[j]->get_keyno()];
12597 
12598         if (!compare_keys(old_key, dropped_key)) {
12599           dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
12600           break;
12601         }
12602       }
12603     }
12604 
12605     /* Determine which(if any) key definitions(s) need to be added */
12606     int identical_indexes_found = 0;
12607     for (i = 0; i < ha_alter_info->index_add_count; i++) {
12608       const KEY *const added_key =
12609           &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
12610       for (j = 0; j < new_n_keys; j++) {
12611         const KEY *const new_key =
12612             &altered_table->key_info[new_key_descr[j]->get_keyno()];
12613         if (!compare_keys(new_key, added_key)) {
12614           /*
12615             Check for cases where an 'identical' index is being dropped and
12616             re-added in a single ALTER statement.  Turn this into a no-op as the
12617             index has not changed.
12618 
12619             E.G. Unique index -> non-unique index requires no change
12620 
12621             Note that cases where the index name remains the same but the
12622             key-parts are changed is already handled in create_inplace_key_defs.
12623             In these cases the index needs to be rebuilt.
12624             */
12625           if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) {
12626             dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id());
12627             identical_indexes_found++;
12628           } else {
12629             added_indexes.insert(new_key_descr[j]);
12630           }
12631 
12632           break;
12633         }
12634       }
12635     }
12636 
12637     n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found;
12638     n_added_keys = ha_alter_info->index_add_count - identical_indexes_found;
12639     DBUG_ASSERT(dropped_index_ids.size() == n_dropped_keys);
12640     DBUG_ASSERT(added_indexes.size() == n_added_keys);
12641     DBUG_ASSERT(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys));
12642   }
12643   if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
12644     if (!new_tdef) {
12645       new_tdef = m_tbl_def;
12646     }
12647     if (table->found_next_number_field) {
12648       max_auto_incr = load_auto_incr_value_from_index();
12649     }
12650   }
12651 
12652   ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
12653       new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
12654       added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys,
12655       max_auto_incr);
12656   DBUG_RETURN(HA_EXIT_SUCCESS);
12657 }
12658 
12659 /**
12660   Alter the table structure in-place with operations specified using
12661   HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
12662   during this operation depends on the return value from
12663   check_if_supported_inplace_alter().
12664 
12665   @note Storage engines are responsible for reporting any errors by
12666   calling my_error()/print_error()
12667 
12668   @note If this function reports error, commit_inplace_alter_table()
12669   will be called with commit= false.
12670 
12671   @param    altered_table     TABLE object for new version of table.
12672   @param    ha_alter_info     Structure describing changes to be done
12673                               by ALTER TABLE and holding data used
12674                               during in-place alter.
12675 
12676   @retval   true              Error
12677   @retval   false             Success
12678 */
12679 bool ha_rocksdb::inplace_alter_table(
12680     TABLE *const altered_table,
12681     my_core::Alter_inplace_info *const ha_alter_info) {
12682   DBUG_ENTER_FUNC();
12683 
12684   DBUG_ASSERT(altered_table != nullptr);
12685   DBUG_ASSERT(ha_alter_info != nullptr);
12686   DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);
12687 
12688   Rdb_inplace_alter_ctx *const ctx =
12689       static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);
12690 
12691   if (ha_alter_info->handler_flags &
12692       (ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
12693        ALTER_ADD_UNIQUE_INDEX)) {
12694     /*
12695       Buffers need to be set up again to account for new, possibly longer
12696       secondary keys.
12697     */
12698     free_key_buffers();
12699 
12700     DBUG_ASSERT(ctx != nullptr);
12701 
12702     /*
12703       If adding unique index, allocate special buffers for duplicate checking.
12704     */
12705     int err;
12706     if ((err = alloc_key_buffers(
12707              altered_table, ctx->m_new_tdef,
12708              ha_alter_info->handler_flags &
12709                  ALTER_ADD_UNIQUE_INDEX))) {
12710       my_error(ER_OUT_OF_RESOURCES, MYF(0));
12711       DBUG_RETURN(err);
12712     }
12713 
12714     /* Populate all new secondary keys by scanning the primary key. */
12715     if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
12716       my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
12717       DBUG_RETURN(HA_EXIT_FAILURE);
12718     }
12719   }
12720 
12721   DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
12722     dbug_create_err_inplace_alter();
12723     DBUG_RETURN(HA_EXIT_FAILURE);
12724   };);
12725 
12726   DBUG_RETURN(HA_EXIT_SUCCESS);
12727 }
12728 
12729 /**
12730  Scan the Primary Key index entries and populate the new secondary keys.
12731 */
12732 int ha_rocksdb::inplace_populate_sk(
12733     TABLE *const new_table_arg,
12734     const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
12735   DBUG_ENTER_FUNC();
12736   int res = HA_EXIT_SUCCESS;
12737   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
12738   rocksdb::WriteBatch *const batch = wb.get();
12739 
12740   /* Update the data dictionary */
12741   std::unordered_set<GL_INDEX_ID> create_index_ids;
12742   for (const auto &index : indexes) {
12743     create_index_ids.insert(index->get_gl_index_id());
12744   }
12745   dict_manager.add_create_index(create_index_ids, batch);
12746   res = dict_manager.commit(batch);
12747   if (res != HA_EXIT_SUCCESS) {
12748     return res;
12749   }
12750 
12751   /*
12752     Add uncommitted key definitons to ddl_manager.  We need to do this
12753     so that the property collector can find this keydef when it needs to
12754     update stats.  The property collector looks for the keydef in the
12755     data dictionary, but it won't be there yet since this key definition
12756     is still in the creation process.
12757   */
12758   ddl_manager.add_uncommitted_keydefs(indexes);
12759 
12760   const bool hidden_pk_exists = has_hidden_pk(table);
12761 
12762   Rdb_transaction *tx = get_or_create_tx(table->in_use);
12763 
12764   /*
12765     There is one specific scenario where m_sst_info may not be nullptr. This
12766     happens if the handler we're using happens to be the handler where the PK
12767     bulk load was done on. The sequence of events that lead to this is as
12768     follows (T1 is PK bulk load, T2 is SK alter table):
12769 
12770     T1: Execute last INSERT statement
12771     T1: Return TABLE and handler object back to Table_cache_manager
12772     T1: Close connection
12773     T2: Execute ALTER statement
12774     T2: Take same TABLE/handler from Table_cache_manager
12775     T2: Call closefrm which will call finalize_bulk_load on every other open
12776         table/handler *except* the one it's on.
12777     T2: Acquire stale snapshot of PK
12778     T1: Call finalize_bulk_load
12779 
12780     This is rare because usually, closefrm will call the destructor (and thus
12781     finalize_bulk_load) on the handler where PK bulk load is done. However, if
12782     the thread ids of the bulk load thread and the alter thread differ by a
12783     multiple of table_cache_instances (8 by default), then they hash to the
12784     same bucket in Table_cache_manager and the alter thread will not not call
12785     the destructor on the handler it is holding. Thus, its m_sst_info will not
12786     be nullptr.
12787 
12788     At this point, it is safe to refresh the snapshot because we know all other
12789     open handlers have been closed at this point, and the one we're on is the
12790     only one left.
12791   */
12792   if (m_sst_info) {
12793     if ((res = finalize_bulk_load())) {
12794       DBUG_RETURN(res);
12795     }
12796     tx->commit();
12797   }
12798 
12799   const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
12800   const ulonglong rdb_merge_combine_read_size =
12801       THDVAR(ha_thd(), merge_combine_read_size);
12802   const ulonglong rdb_merge_tmp_file_removal_delay =
12803       THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
12804 
12805   for (const auto &index : indexes) {
12806     bool is_unique_index =
12807         new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
12808 
12809     Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size,
12810                               rdb_merge_combine_read_size,
12811                               rdb_merge_tmp_file_removal_delay,
12812                               index->get_cf());
12813 
12814     if ((res = rdb_merge.init())) {
12815       DBUG_RETURN(res);
12816     }
12817 
12818     /*
12819       Note: We pass in the currently existing table + tbl_def object here,
12820       as the pk index position may have changed in the case of hidden primary
12821       keys.
12822     */
12823     const uint pk = pk_index(table, m_tbl_def);
12824     ha_index_init(pk, true);
12825 
12826     /* Scan each record in the primary key in order */
12827     for (res = index_first(table->record[0]); res == 0;
12828          res = index_next(table->record[0])) {
12829       longlong hidden_pk_id = 0;
12830       if (hidden_pk_exists &&
12831           (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) {
12832         // NO_LINT_DEBUG
12833         sql_print_error("Error retrieving hidden pk id.");
12834         ha_index_end();
12835         DBUG_RETURN(res);
12836       }
12837 
12838       /* Create new secondary index entry */
12839       const int new_packed_size = index->pack_record(
12840           new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
12841           &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
12842           nullptr, m_ttl_bytes);
12843 
12844       const rocksdb::Slice key = rocksdb::Slice(
12845           reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
12846       const rocksdb::Slice val =
12847           rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
12848                          m_sk_tails.get_current_pos());
12849 
12850       /*
12851         Add record to offset tree in preparation for writing out to
12852         disk in sorted chunks.
12853       */
12854       if ((res = rdb_merge.add(key, val))) {
12855         ha_index_end();
12856         DBUG_RETURN(res);
12857       }
12858     }
12859 
12860     if (res != HA_ERR_END_OF_FILE) {
12861       // NO_LINT_DEBUG
12862       sql_print_error("Error retrieving index entry from primary key.");
12863       ha_index_end();
12864       DBUG_RETURN(res);
12865     }
12866 
12867     ha_index_end();
12868 
12869     /*
12870       Perform an n-way merge of n sorted buffers on disk, then writes all
12871       results to RocksDB via SSTFileWriter API.
12872     */
12873     rocksdb::Slice merge_key;
12874     rocksdb::Slice merge_val;
12875 
12876     struct unique_sk_buf_info sk_info;
12877     sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
12878     sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
12879 
12880     while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
12881       /* Perform uniqueness check if needed */
12882       if (is_unique_index) {
12883         if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
12884           /*
12885             Duplicate entry found when trying to create unique secondary key.
12886             We need to unpack the record into new_table_arg->record[0] as it
12887             is used inside print_keydup_error so that the error message shows
12888             the duplicate record.
12889           */
12890           if (index->unpack_record(
12891                   new_table_arg, new_table_arg->record[0], &merge_key,
12892                   &merge_val, m_converter->get_verify_row_debug_checksums())) {
12893             /* Should never reach here */
12894             DBUG_ASSERT(0);
12895           }
12896 
12897           print_keydup_error(new_table_arg,
12898                              &new_table_arg->key_info[index->get_keyno()],
12899                              MYF(0));
12900           DBUG_RETURN(ER_DUP_ENTRY);
12901         }
12902       }
12903 
12904       /*
12905         Insert key and slice to SST via SSTFileWriter API.
12906       */
12907       if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) {
12908         break;
12909       }
12910     }
12911 
12912     /*
12913       Here, res == -1 means that we are finished, while > 0 means an error
12914       occurred.
12915     */
12916     if (res > 0) {
12917       // NO_LINT_DEBUG
12918       sql_print_error("Error while bulk loading keys in external merge sort.");
12919       DBUG_RETURN(res);
12920     }
12921 
12922     bool is_critical_error;
12923     res = tx->finish_bulk_load(&is_critical_error);
12924     if (res && is_critical_error) {
12925       // NO_LINT_DEBUG
12926       sql_print_error("Error finishing bulk load.");
12927       DBUG_RETURN(res);
12928     }
12929   }
12930 
12931   /*
12932     Explicitly tell jemalloc to clean up any unused dirty pages at this point.
12933     See https://reviews.facebook.net/D63723 for more details.
12934   */
12935   purge_all_jemalloc_arenas();
12936 
12937   DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
12938   DBUG_RETURN(res);
12939 }
12940 
12941 /**
12942   Commit or rollback the changes made during prepare_inplace_alter_table()
12943   and inplace_alter_table() inside the storage engine.
12944   Note that in case of rollback the allowed level of concurrency during
12945   this operation will be the same as for inplace_alter_table() and thus
12946   might be higher than during prepare_inplace_alter_table(). (For example,
12947   concurrent writes were blocked during prepare, but might not be during
12948   rollback).
12949 
12950   @note Storage engines are responsible for reporting any errors by
12951   calling my_error()/print_error()
12952 
12953   @note If this function with commit= true reports error, it will be called
12954   again with commit= false.
12955 
12956   @note In case of partitioning, this function might be called for rollback
12957   without prepare_inplace_alter_table() having been called first.
12958   Also partitioned tables sets ha_alter_info->group_commit_ctx to a NULL
12959   terminated array of the partitions handlers and if all of them are
12960   committed as one, then group_commit_ctx should be set to NULL to indicate
12961   to the partitioning handler that all partitions handlers are committed.
12962   @see prepare_inplace_alter_table().
12963 
12964   @param    altered_table     TABLE object for new version of table.
12965   @param    ha_alter_info     Structure describing changes to be done
12966                               by ALTER TABLE and holding data used
12967                               during in-place alter.
12968   @param    commit            True => Commit, False => Rollback.
12969 
12970   @retval   true              Error
12971   @retval   false             Success
12972 */
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  /* Per-alter context built in the prepare phase; may be nullptr if the
     prepare phase never ran (e.g. rollback of a partitioned table). */
  Rdb_inplace_alter_ctx *const ctx0 =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table");

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition.  A successful commit call only executes once
    for all partitions.
  */
  if (!commit) {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0) {
      DBUG_RETURN(HA_EXIT_SUCCESS);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr) {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
        ctx0->m_new_key_descr[i] = nullptr;
      }

      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      /* Detach the (already released) key descriptor array from the new
         tdef before deleting it, so its destructor does not touch it. */
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    /* Remove uncommitted key definitons from ddl_manager */
    ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);

    /* Rollback any partially created indexes */
    dict_manager.rollback_ongoing_index_creation();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  /* Commit path: prepare must have succeeded, so the context must exist. */
  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx **ctx_array;
  inplace_alter_handler_ctx *ctx_single[2];

  if (ha_alter_info->group_commit_ctx) {
    DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
    ctx_array = ha_alter_info->group_commit_ctx;
  } else {
    /* Non-partitioned table: wrap the single context in a
       null-terminated array so the loops below work uniformly. */
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  /* Clear group_commit_ctx to signal the partitioning handler that all
     partitions are committed as one (see function comment above). */
  ha_alter_info->group_commit_ctx = nullptr;

  /* Handle index addition/removal: swap in the new table definition and
     commit the dictionary changes for every partition in one batch. */
  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    m_tbl_def = ctx0->m_new_tdef;
    m_key_descr_arr = m_tbl_def->m_key_descr_arr;
    m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    dict_manager.lock();
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (const auto &index : ctx->m_added_indexes) {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }

      /*
        Remove uncommitted key definitons from ddl_manager, as they are now
        committed into the data dictionary.
      */
      ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
    }

    if (dict_manager.commit(batch)) {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(
        create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

    /* Wake the background thread that physically removes dropped indexes. */
    rdb_drop_idx_thread.signal();
  }

  /* Persist the (possibly altered) auto_increment value, taking the max
     over all partitions so no partition's counter moves backwards. */
  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;

    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);
      auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
      dict_manager.put_auto_incr_val(
          batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
          true /* overwrite */);
      ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
    }

    if (dict_manager.commit(batch)) {
      DBUG_ASSERT(0);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
13130 
/* Name of the generated SHOW function for a given RocksDB ticker. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW function that, when the status variable is read, copies the
  given RocksDB ticker into the matching rocksdb_status_counters field and
  exposes that field as a SHOW_LONGLONG value.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = reinterpret_cast<char *>(&rocksdb_status_counters.name);  \
    return HA_EXIT_SUCCESS;                                                \
  }

/* SHOW_VAR entry backed by a generated SHOW function (see DEF_SHOW_FUNC);
   the variable name gets a "rocksdb_" prefix. */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* SHOW_VAR entry backed directly by a pointer; name string is concatenated
   after the "rocksdb_" prefix. */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* SHOW_VAR entry backed directly by a pointer; name is used verbatim. */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
13150 
/*
  Snapshot of RocksDB ticker statistics.  Each field is refreshed by its
  generated rocksdb_show_* function (see DEF_SHOW_FUNC above) whenever the
  corresponding "rocksdb_*" status variable is read.  Field names must match
  the names used in the DEF_SHOW_FUNC/DEF_STATUS_VAR invocations below.
*/
struct rocksdb_status_counters_t {
  /* Block cache hit/miss/insert counters */
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  /* Bloom filter effectiveness counters */
  uint64_t bloom_filter_useful;
  uint64_t bloom_filter_full_positive;
  uint64_t bloom_filter_full_true_positive;
  /* Read-path hit counters (memtable and per-level) */
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  /* Compaction key-drop counters */
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  /* Key/byte read-write volume counters */
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  /* Iterator operation counters */
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  /* File and stall counters */
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  /* WAL and write-path counters */
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  /* Flush/compaction byte counters */
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  /* Superversion lifecycle counters */
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};
13225 
/* Storage for the ticker snapshot exposed via SHOW STATUS. */
static rocksdb_status_counters_t rocksdb_status_counters;

/*
  Generate one SHOW function per RocksDB ticker: the first argument is the
  rocksdb_status_counters field, the second the rocksdb::Tickers enumerator
  it is read from (see DEF_SHOW_FUNC above).
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE)
DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
13300 
13301 static void myrocks_update_status() {
13302   export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
13303   export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
13304   export_stats.rows_read = global_stats.rows[ROWS_READ];
13305   export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
13306   export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
13307   export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
13308   export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];
13309 
13310   export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
13311   export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
13312   export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
13313   export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
13314 
13315   export_stats.queries_point = global_stats.queries[QUERIES_POINT];
13316   export_stats.queries_range = global_stats.queries[QUERIES_RANGE];
13317 
13318   export_stats.covered_secondary_key_lookups =
13319       global_stats.covered_secondary_key_lookups;
13320 }
13321 
13322 static void myrocks_update_memory_status() {
13323   std::vector<rocksdb::DB *> dbs;
13324   std::unordered_set<const rocksdb::Cache *> cache_set;
13325   dbs.push_back(rdb);
13326   std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
13327   rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
13328                                                        &temp_usage_by_type);
13329   memory_stats.memtable_total =
13330       temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal];
13331   memory_stats.memtable_unflushed =
13332       temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed];
13333 }
13334 
/* MyRocks-level status variables (row/query/memtable counters), served
   through show_myrocks_vars() below after the export/memory stats have
   been refreshed. */
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind", &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired", &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered", &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted",
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted",
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated",
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total", &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed", &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point", &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups",
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
13370 
13371 static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
13372   myrocks_update_status();
13373   myrocks_update_memory_status();
13374   var->type = SHOW_ARRAY;
13375   var->value = reinterpret_cast<char *>(&myrocks_status_variables);
13376 }
13377 
13378 static ulonglong io_stall_prop_value(
13379     const std::map<std::string, std::string> &props, const std::string &key) {
13380   std::map<std::string, std::string>::const_iterator iter =
13381       props.find("io_stalls." + key);
13382   if (iter != props.end()) {
13383     return std::stoull(iter->second);
13384   } else {
13385     DBUG_PRINT("warning",
13386                ("RocksDB GetMapPropery hasn't returned key=%s", key.c_str()));
13387     DBUG_ASSERT(0);
13388     return 0;
13389   }
13390 }
13391 
13392 static void update_rocksdb_stall_status() {
13393   st_io_stall_stats local_io_stall_stats;
13394   for (const auto &cf_name : cf_manager.get_cf_names()) {
13395     rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
13396     if (cfh == nullptr) {
13397       continue;
13398     }
13399 
13400     std::map<std::string, std::string> props;
13401     if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats", &props)) {
13402       continue;
13403     }
13404 
13405     local_io_stall_stats.level0_slowdown +=
13406         io_stall_prop_value(props, "level0_slowdown");
13407     local_io_stall_stats.level0_slowdown_with_compaction +=
13408         io_stall_prop_value(props, "level0_slowdown_with_compaction");
13409     local_io_stall_stats.level0_numfiles +=
13410         io_stall_prop_value(props, "level0_numfiles");
13411     local_io_stall_stats.level0_numfiles_with_compaction +=
13412         io_stall_prop_value(props, "level0_numfiles_with_compaction");
13413     local_io_stall_stats.stop_for_pending_compaction_bytes +=
13414         io_stall_prop_value(props, "stop_for_pending_compaction_bytes");
13415     local_io_stall_stats.slowdown_for_pending_compaction_bytes +=
13416         io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes");
13417     local_io_stall_stats.memtable_compaction +=
13418         io_stall_prop_value(props, "memtable_compaction");
13419     local_io_stall_stats.memtable_slowdown +=
13420         io_stall_prop_value(props, "memtable_slowdown");
13421     local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop");
13422     local_io_stall_stats.total_slowdown +=
13423         io_stall_prop_value(props, "total_slowdown");
13424   }
13425   io_stall_stats = local_io_stall_stats;
13426 }
13427 
/* Write-stall status variables, backed by io_stall_stats and served through
   show_rocksdb_stall_vars() below after the counters have been refreshed. */
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops",
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns",
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops",
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns",
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
13455 
13456 static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) {
13457   update_rocksdb_stall_status();
13458   var->type = SHOW_ARRAY;
13459   var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables);
13460 }
13461 
// Engine status variables exposed under the "rocksdb_" prefix.
// DEF_STATUS_VAR entries wrap RocksDB's internal statistics tickers;
// DEF_STATUS_VAR_PTR entries point at MyRocks' own global counters.
static SHOW_VAR rocksdb_status_vars[] = {
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_add_failures),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_index_add),
    DEF_STATUS_VAR(block_cache_index_bytes_insert),
    DEF_STATUS_VAR(block_cache_index_bytes_evict),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_filter_add),
    DEF_STATUS_VAR(block_cache_filter_bytes_insert),
    DEF_STATUS_VAR(block_cache_filter_bytes_evict),
    DEF_STATUS_VAR(block_cache_bytes_read),
    DEF_STATUS_VAR(block_cache_bytes_write),
    DEF_STATUS_VAR(block_cache_data_bytes_insert),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(block_cache_data_add),
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(bloom_filter_full_positive),
    DEF_STATUS_VAR(bloom_filter_full_true_positive),
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(get_hit_l0),
    DEF_STATUS_VAR(get_hit_l1),
    DEF_STATUS_VAR(get_hit_l2_and_up),
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    DEF_STATUS_VAR(number_db_seek),
    DEF_STATUS_VAR(number_db_seek_found),
    DEF_STATUS_VAR(number_db_next),
    DEF_STATUS_VAR(number_db_next_found),
    DEF_STATUS_VAR(number_db_prev),
    DEF_STATUS_VAR(number_db_prev_found),
    DEF_STATUS_VAR(iter_bytes_read),
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(stall_micros),
    DEF_STATUS_VAR(num_iterators),
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(getupdatessince_calls),
    DEF_STATUS_VAR(block_cachecompressed_miss),
    DEF_STATUS_VAR(block_cachecompressed_hit),
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    // MyRocks-level counters (not RocksDB tickers) start here.
    DEF_STATUS_VAR_PTR("row_lock_deadlocks", &rocksdb_row_lock_deadlocks,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("row_lock_wait_timeouts",
                       &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_processed",
                       &rocksdb_manual_compactions_processed, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_running",
                       &rocksdb_manual_compactions_running, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
#ifndef DBUG_OFF
    DEF_STATUS_VAR_PTR("num_get_for_update_calls",
                       &rocksdb_num_get_for_update_calls, SHOW_LONGLONG),
#endif
    // the variables generated by SHOW_FUNC are sorted only by prefix (first
    // arg in the tuple below), so make sure it is unique to make sorting
    // deterministic as quick sort is not stable
    {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
    {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
     SHOW_FUNC},
    {NullS, NullS, SHOW_LONG}};
13568 
13569 /*
13570   Background thread's main logic
13571 */
13572 
void Rdb_background_thread::run() {
  // How many seconds to wait till flushing the WAL next time.
  const int WAKE_UP_INTERVAL = 1;

  timespec ts_next_sync;
  set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

  for (;;) {
    // Wait until the next timeout or until we receive a signal to stop the
    // thread. Request to stop the thread should only be triggered when the
    // storage engine is being unloaded.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);

    // Check that we receive only the expected error codes.
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Snapshot the request flags while still holding m_signal_mutex, then
    // reset() them (presumably clearing them for the next round -- confirm
    // against the class definition) before releasing the mutex.
    const bool local_stop = m_stop;
    const bool local_save_stats = m_save_stats;
    reset();
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    if (local_stop) {
      // If we're here then that's because condition variable was signaled by
      // another thread and we're shutting down. Break out the loop to make
      // sure that shutdown thread can proceed.
      break;
    }

    // This path should be taken only when the timer expired.
    DBUG_ASSERT(ret == ETIMEDOUT);

    // Persist index statistics if another thread asked for it.
    if (local_save_stats) {
      ddl_manager.persist_stats();
    }

    // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
    // pthread_cond_timedwait()) to wait on.
    set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

    // Flush the WAL. Sync it for both background and never modes to copy
    // InnoDB's behavior. For mode never, the wal file isn't even written,
    // whereas background writes to the wal file, but issues the syncs in a
    // background thread.
    if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
        !rocksdb_db_options->allow_mmap_writes) {
      const rocksdb::Status s = rdb->FlushWAL(true);
      if (!s.ok()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
    }
    // Recalculate statistics for indexes.
    if (rocksdb_stats_recalc_rate) {
      std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
          to_recalc;

      // When the work queue runs dry, refill it with every index of every
      // table known to the DDL manager.
      if (rdb_indexes_to_recalc.empty()) {
        struct Rdb_index_collector : public Rdb_tables_scanner {
          int add_table(Rdb_tbl_def *tdef) override {
            for (uint i = 0; i < tdef->m_key_count; i++) {
              rdb_indexes_to_recalc.push_back(
                  tdef->m_key_descr_arr[i]->get_gl_index_id());
            }
            return HA_EXIT_SUCCESS;
          }
        } collector;
        ddl_manager.scan_for_tables(&collector);
      }

      // Take at most rocksdb_stats_recalc_rate indexes per wake-up; indexes
      // that have been dropped since they were queued are silently skipped
      // (safe_find() returns nothing for them).
      while (to_recalc.size() < rocksdb_stats_recalc_rate &&
             !rdb_indexes_to_recalc.empty()) {
        const auto index_id = rdb_indexes_to_recalc.back();
        rdb_indexes_to_recalc.pop_back();

        std::shared_ptr<const Rdb_key_def> keydef =
            ddl_manager.safe_find(index_id);

        if (keydef) {
          to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef));
        }
      }

      if (!to_recalc.empty()) {
        calculate_stats(to_recalc, false);
      }
    }

  }

  // save remaining stats which might've left unsaved
  ddl_manager.persist_stats();
}
13665 
13666 /*
13667   A background thread to handle manual compactions,
13668   except for dropping indexes/tables. Every second, it checks
13669   pending manual compactions, and it calls CompactRange if there is.
13670 */
void Rdb_manual_compaction_thread::run() {
  // m_mc_mutex guards m_requests; m_signal_mutex is used for the 1-second
  // wake-up/stop protocol and is held at the top of every loop iteration.
  mysql_mutex_init(0, &m_mc_mutex, MY_MUTEX_INIT_FAST);
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  for (;;) {
    if (m_stop) {
      break;
    }
    timespec ts;
    set_timespec(ts, 1);

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
    // Grab the first item and proceed, if not empty.
    if (m_requests.empty()) {
      RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
      RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
      continue;
    }
    // The reference stays valid after m_mc_mutex is released: std::map does
    // not invalidate references on insert/erase of other elements, and (per
    // the comment in clear_manual_compaction_request()) a RUNNING entry is
    // only ever erased by this thread.
    Manual_compaction_request &mcr = m_requests.begin()->second;
    DBUG_ASSERT(mcr.cf != nullptr);
    DBUG_ASSERT(mcr.state == Manual_compaction_request::INITED);
    mcr.state = Manual_compaction_request::RUNNING;
    RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);

    DBUG_ASSERT(mcr.state == Manual_compaction_request::RUNNING);
    // NO_LINT_DEBUG
    sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id,
                          mcr.cf->GetName().c_str());
    rocksdb_manual_compactions_running++;
    // Debug-only artificial delay, controlled by a sys_var (in seconds).
    if (rocksdb_debug_manual_compaction_delay > 0) {
      my_sleep(rocksdb_debug_manual_compaction_delay * 1000000);
    }
    // CompactRange may take a very long time. On clean shutdown,
    // it is cancelled by CancelAllBackgroundWork, then status is
    // set to shutdownInProgress.
    const rocksdb::Status s = rdb->CompactRange(
        getCompactRangeOptions(mcr.concurrency), mcr.cf, mcr.start, mcr.limit);
    rocksdb_manual_compactions_running--;
    if (s.ok()) {
      // NO_LINT_DEBUG
      sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id,
                            mcr.cf->GetName().c_str());
    } else {
      // NO_LINT_DEBUG
      sql_print_information("Manual Compaction id %d cf %s aborted. %s",
                            mcr.mc_id, mcr.cf->GetName().c_str(), s.getState());
      if (!s.IsShutdownInProgress()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      } else {
        DBUG_ASSERT(m_requests.size() == 1);
      }
    }
    rocksdb_manual_compactions_processed++;
    // Remove the finished request unconditionally (init_only == false).
    clear_manual_compaction_request(mcr.mc_id, false);
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }
  // Shutdown path: drop any requests that never ran.
  clear_all_manual_compaction_requests();
  DBUG_ASSERT(m_requests.empty());
  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
  mysql_mutex_destroy(&m_mc_mutex);
}
13740 
// Drop every queued manual compaction request. Called from run() after the
// worker loop exits on shutdown; m_mc_mutex serializes access to m_requests.
void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() {
  RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
  m_requests.clear();
  RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
}
13746 
13747 void Rdb_manual_compaction_thread::clear_manual_compaction_request(
13748     int mc_id, bool init_only) {
13749   bool erase = true;
13750   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13751   auto it = m_requests.find(mc_id);
13752   if (it != m_requests.end()) {
13753     if (init_only) {
13754       Manual_compaction_request mcr = it->second;
13755       if (mcr.state != Manual_compaction_request::INITED) {
13756         erase = false;
13757       }
13758     }
13759     if (erase) {
13760       m_requests.erase(it);
13761     }
13762   } else {
13763     // Current code path guarantees that erasing by the same mc_id happens
13764     // at most once. INITED state may be erased by a thread that requested
13765     // the compaction. RUNNING state is erased by mc thread only.
13766     DBUG_ASSERT(0);
13767   }
13768   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13769 }
13770 
13771 int Rdb_manual_compaction_thread::request_manual_compaction(
13772     rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice *start,
13773     rocksdb::Slice *limit, int concurrency) {
13774   int mc_id = -1;
13775   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13776   if (m_requests.size() >= rocksdb_max_manual_compactions) {
13777     RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13778     return mc_id;
13779   }
13780   Manual_compaction_request mcr;
13781   mc_id = mcr.mc_id = ++m_latest_mc_id;
13782   mcr.state = Manual_compaction_request::INITED;
13783   mcr.cf = cf;
13784   mcr.start = start;
13785   mcr.limit = limit;
13786   mcr.concurrency = concurrency;
13787   m_requests.insert(std::make_pair(mcr.mc_id, mcr));
13788   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13789   return mc_id;
13790 }
13791 
13792 bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) {
13793   bool finished = false;
13794   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13795   if (m_requests.count(mc_id) == 0) {
13796     finished = true;
13797   }
13798   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13799   return finished;
13800 }
13801 
13802 /**
13803  * Locking read + Not Found + Read Committed occurs if we accessed
13804  * a row by Seek, tried to lock it, failed, released and reacquired the
13805  * snapshot (because of READ COMMITTED mode) and the row was deleted by
13806  * someone else in the meantime.
13807  * If so, we either just skipping the row, or re-creating a snapshot
13808  * and seek again. In both cases, Read Committed constraint is not broken.
13809  */
13810 bool ha_rocksdb::should_skip_invalidated_record(const int rc) {
13811   if ((m_lock_rows != RDB_LOCK_NONE && rc == HA_ERR_KEY_NOT_FOUND &&
13812        my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED)) {
13813     return true;
13814   }
13815   return false;
13816 }
13817 /**
13818  * Indicating snapshot needs to be re-created and retrying seek again,
13819  * instead of returning errors or empty set. This is normally applicable
13820  * when hitting kBusy when locking the first row of the transaction,
13821  * with Repeatable Read isolation level.
13822  */
13823 bool ha_rocksdb::should_recreate_snapshot(const int rc,
13824                                           const bool is_new_snapshot) {
13825   if (should_skip_invalidated_record(rc) ||
13826       (rc == HA_ERR_ROCKSDB_STATUS_BUSY && is_new_snapshot)) {
13827     return true;
13828   }
13829   return false;
13830 }
13831 
13832 /**
13833  * If calling put/delete/singledelete without locking the row,
13834  * it is necessary to pass assume_tracked=false to RocksDB TX API.
13835  * Read Free Replication and Blind Deletes are the cases when
13836  * using TX API and skipping row locking.
13837  */
13838 bool ha_rocksdb::can_assume_tracked(THD *thd) {
13839   if (/* MARIAROCKS_NOT_YET use_read_free_rpl() ||*/ (THDVAR(thd, blind_delete_primary_key))) {
13840     return false;
13841   }
13842   return true;
13843 }
13844 
13845 bool ha_rocksdb::check_bloom_and_set_bounds(
13846     THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
13847     const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
13848     uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
13849     rocksdb::Slice *upper_bound_slice) {
13850   bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
13851   if (!can_use_bloom) {
13852     setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound,
13853                           lower_bound_slice, upper_bound_slice);
13854   }
13855   return can_use_bloom;
13856 }
13857 
13858 /**
13859   Deciding if it is possible to use bloom filter or not.
13860 
13861   @detail
13862    Even if bloom filter exists, it is not always possible
13863    to use bloom filter. If using bloom filter when you shouldn't,
13864    false negative may happen -- fewer rows than expected may be returned.
13865    It is users' responsibility to use bloom filter correctly.
13866 
13867    If bloom filter does not exist, return value does not matter because
13868    RocksDB does not use bloom filter internally.
13869 
13870   @param kd
13871   @param eq_cond      Equal condition part of the key. This always includes
13872                       system index id (4 bytes).
13873   @param use_all_keys True if all key parts are set with equal conditions.
13874                       This is aware of extended keys.
13875 */
13876 bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
13877                                       const rocksdb::Slice &eq_cond,
13878                                       const bool use_all_keys) {
13879   bool can_use = false;
13880 
13881   if (THDVAR(thd, skip_bloom_filter_on_read)) {
13882     return can_use;
13883   }
13884 
13885   const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
13886   if (prefix_extractor) {
13887     /*
13888       This is an optimized use case for CappedPrefixTransform.
13889       If eq_cond length >= prefix extractor length and if
13890       all keys are used for equal lookup, it is
13891       always possible to use bloom filter.
13892 
13893       Prefix bloom filter can't be used on descending scan with
13894       prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
13895       RocksDB's limitation. On ascending (or not sorting) scan,
13896       keys longer than the capped prefix length will be truncated down
13897       to the capped length and the resulting key is added to the bloom filter.
13898 
13899       Keys shorter than the capped prefix length will be added to
13900       the bloom filter. When keys are looked up, key conditionals
13901       longer than the capped length can be used; key conditionals
13902       shorter require all parts of the key to be available
13903       for the short key match.
13904     */
13905     if ((use_all_keys && prefix_extractor->InRange(eq_cond)) ||
13906         prefix_extractor->SameResultWhenAppended(eq_cond)) {
13907       can_use = true;
13908     } else {
13909       can_use = false;
13910     }
13911   } else {
13912     /*
13913       if prefix extractor is not defined, all key parts have to be
13914       used by eq_cond.
13915     */
13916     if (use_all_keys) {
13917       can_use = true;
13918     } else {
13919       can_use = false;
13920     }
13921   }
13922 
13923   return can_use;
13924 }
13925 
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return *rocksdb_tbl_options;
}

// TTL feature toggles, backed by the rocksdb_enable_ttl* globals.
bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
// Debug-build-only hooks exposing the TTL timestamp-shifting globals.
#ifndef DBUG_OFF
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
13947 
13948 void rdb_update_global_stats(const operation_type &type, uint count,
13949                              bool is_system_table) {
13950   DBUG_ASSERT(type < ROWS_MAX);
13951 
13952   if (count == 0) {
13953     return;
13954   }
13955 
13956   if (is_system_table) {
13957     global_stats.system_rows[type].add(count);
13958   } else {
13959     global_stats.rows[type].add(count);
13960   }
13961 }
13962 
13963 int rdb_get_table_perf_counters(const char *const tablename,
13964                                 Rdb_perf_counters *const counters) {
13965   DBUG_ASSERT(tablename != nullptr);
13966 
13967   Rdb_table_handler *table_handler;
13968   table_handler = rdb_open_tables.get_table_handler(tablename);
13969   if (table_handler == nullptr) {
13970     return HA_ERR_ROCKSDB_INVALID_TABLE;
13971   }
13972 
13973   counters->load(table_handler->m_table_perf_context);
13974 
13975   rdb_open_tables.release_table_handler(table_handler);
13976   return HA_EXIT_SUCCESS;
13977 }
13978 
// Map an RDB_IO_ERROR_TYPE value to its symbolic name for logging.
const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) {
  // If this assertion fails then this means that a member has been either added
  // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be
  // changed to return the appropriate value.
  static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types.");

  switch (err_type) {
    case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT:
      return "RDB_IO_ERROR_TX_COMMIT";
    case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT:
      return "RDB_IO_ERROR_DICT_COMMIT";
    case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD:
      return "RDB_IO_ERROR_BG_THREAD";
    case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL:
      return "RDB_IO_ERROR_GENERAL";
    default:
      // Unreachable as long as the static_assert above is kept in sync.
      DBUG_ASSERT(false);
      return "(unknown)";
  }
}
13999 
14000 // In case of core dump generation we want this function NOT to be optimized
14001 // so that we can capture as much data as possible to debug the root cause
14002 // more efficiently.
14003 #ifdef __GNUC__
14004 #endif
14005 void rdb_handle_io_error(const rocksdb::Status status,
14006                          const RDB_IO_ERROR_TYPE err_type) {
14007   if (status.IsIOError()) {
14008     /* skip dumping core if write failed and we are allowed to do so */
14009 #ifdef MARIAROCKS_NOT_YET
14010     if (skip_core_dump_on_error) {
14011       opt_core_file = false;
14012     }
14013 #endif
14014     switch (err_type) {
14015       case RDB_IO_ERROR_TX_COMMIT:
14016       case RDB_IO_ERROR_DICT_COMMIT: {
14017         rdb_log_status_error(status, "failed to write to WAL");
14018         /* NO_LINT_DEBUG */
14019         sql_print_error("MyRocks: aborting on WAL write error.");
14020         abort();
14021         break;
14022       }
14023       case RDB_IO_ERROR_BG_THREAD: {
14024         rdb_log_status_error(status, "BG thread failed to write to RocksDB");
14025         /* NO_LINT_DEBUG */
14026         sql_print_error("MyRocks: aborting on BG write error.");
14027         abort();
14028         break;
14029       }
14030       case RDB_IO_ERROR_GENERAL: {
14031         rdb_log_status_error(status, "failed on I/O");
14032         /* NO_LINT_DEBUG */
14033         sql_print_error("MyRocks: aborting on I/O error.");
14034         abort();
14035         break;
14036       }
14037       default:
14038         DBUG_ASSERT(0);
14039         break;
14040     }
14041   } else if (status.IsCorruption()) {
14042     rdb_log_status_error(status, "data corruption detected!");
14043     rdb_persist_corruption_marker();
14044     /* NO_LINT_DEBUG */
14045     sql_print_error("MyRocks: aborting because of data corruption.");
14046     abort();
14047   } else if (!status.ok()) {
14048     switch (err_type) {
14049       case RDB_IO_ERROR_DICT_COMMIT: {
14050         rdb_log_status_error(status, "Failed to write to WAL (dictionary)");
14051         /* NO_LINT_DEBUG */
14052         sql_print_error("MyRocks: aborting on WAL write error.");
14053         abort();
14054         break;
14055       }
14056       default:
14057         rdb_log_status_error(status, "Failed to read/write in RocksDB");
14058         break;
14059     }
14060   }
14061 }
14062 #ifdef __GNUC__
14063 #endif
// Accessors for the global manager singletons (dictionary, DDL, binlog).
Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }

Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }

Rdb_binlog_manager *rdb_get_binlog_manager(void) { return &binlog_manager; }
14069 
14070 void rocksdb_set_compaction_options(
14071     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14072     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14073     void *const var_ptr, const void *const save) {
14074   if (var_ptr && save) {
14075     *(uint64_t *)var_ptr = *(const uint64_t *)save;
14076   }
14077   const Rdb_compact_params params = {
14078       (uint64_t)rocksdb_compaction_sequential_deletes,
14079       (uint64_t)rocksdb_compaction_sequential_deletes_window,
14080       (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
14081   if (properties_collector_factory) {
14082     properties_collector_factory->SetCompactionParams(params);
14083   }
14084 }
14085 
14086 void rocksdb_set_table_stats_sampling_pct(
14087     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14088     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14089     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14090   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14091 
14092   const uint32_t new_val = *static_cast<const uint32_t *>(save);
14093 
14094   if (new_val != rocksdb_table_stats_sampling_pct) {
14095     rocksdb_table_stats_sampling_pct = new_val;
14096 
14097     if (properties_collector_factory) {
14098       properties_collector_factory->SetTableStatsSamplingPct(
14099           rocksdb_table_stats_sampling_pct);
14100     }
14101   }
14102 
14103   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14104 }
14105 
14106 /*
14107   This function allows setting the rate limiter's bytes per second value
14108   but only if the rate limiter is turned on which has to be done at startup.
14109   If the rate is already 0 (turned off) or we are changing it to 0 (trying
14110   to turn it off) this function will push a warning to the client and do
14111   nothing.
14112   This is similar to the code in innodb_doublewrite_update (found in
14113   storage/innobase/handler/ha_innodb.cc).
14114 */
14115 void rocksdb_set_rate_limiter_bytes_per_sec(
14116     my_core::THD *const thd,
14117     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14118     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14119   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14120   if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
14121     /*
14122       If a rate_limiter was not enabled at startup we can't change it nor
14123       can we disable it if one was created at startup
14124     */
14125     push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
14126                         "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
14127                         "be dynamically changed to or from 0.  Do a clean "
14128                         "shutdown if you want to change it from or to 0.");
14129   } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
14130     /* Apply the new value to the rate limiter and store it locally */
14131     DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
14132     rocksdb_rate_limiter_bytes_per_sec = new_val;
14133     rocksdb_rate_limiter->SetBytesPerSecond(new_val);
14134   }
14135 }
14136 
14137 void rocksdb_set_sst_mgr_rate_bytes_per_sec(
14138     my_core::THD *const thd,
14139     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14140     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14141   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14142 
14143   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14144 
14145   if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
14146     rocksdb_sst_mgr_rate_bytes_per_sec = new_val;
14147 
14148     rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
14149         rocksdb_sst_mgr_rate_bytes_per_sec);
14150   }
14151 
14152   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14153 }
14154 
14155 void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
14156                                     void *var_ptr, const void *save) {
14157   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14158   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14159   if (rocksdb_delayed_write_rate != new_val) {
14160     rocksdb_delayed_write_rate = new_val;
14161     rocksdb::Status s =
14162         rdb->SetDBOptions({{"delayed_write_rate", std::to_string(new_val)}});
14163 
14164     if (!s.ok()) {
14165       /* NO_LINT_DEBUG */
14166       sql_print_warning(
14167           "MyRocks: failed to update delayed_write_rate. "
14168           "status code = %d, status = %s",
14169           s.code(), s.ToString().c_str());
14170     }
14171   }
14172   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14173 }
14174 
14175 void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
14176                                       void *var_ptr, const void *save) {
14177   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14178   const uint32_t new_val = *static_cast<const uint32_t *>(save);
14179   if (rocksdb_max_latest_deadlocks != new_val) {
14180     rocksdb_max_latest_deadlocks = new_val;
14181     rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
14182   }
14183   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14184 }
14185 
14186 void rdb_set_collation_exception_list(const char *const exception_list) {
14187   DBUG_ASSERT(rdb_collation_exceptions != nullptr);
14188 
14189   if (!rdb_collation_exceptions->set_patterns(exception_list)) {
14190     my_core::warn_about_bad_patterns(rdb_collation_exceptions,
14191                                      "strict_collation_exceptions");
14192   }
14193 }
14194 
// sys_var update handler for rocksdb_strict_collation_exceptions.
void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  // 'save' holds a pointer to the validated string value (may be nullptr).
  const char *const val = *static_cast<const char *const *>(save);

  rdb_set_collation_exception_list(val == nullptr ? "" : val);

  //psergey-todo: what is the purpose of the below??
  // NOTE(review): this appears to free the previous heap copy held in the
  // sys_var backing store and replace it with a fresh my_strdup() copy, so
  // the variable owns its own string -- confirm against the server's sys_var
  // memory-management conventions.
  const char *val_copy= val? my_strdup(PSI_INSTRUMENT_ME, val, MYF(0)): nullptr;
  my_free(*static_cast<char**>(var_ptr));
  *static_cast<const char**>(var_ptr) = val_copy;
}
14208 
14209 int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) {
14210   int new_value_type = value->value_type(value);
14211   if (new_value_type == MYSQL_VALUE_TYPE_STRING) {
14212     char buf[16];
14213     int len = sizeof(buf);
14214     const char *str = value->val_str(value, buf, &len);
14215     if (str && (my_strcasecmp(system_charset_info, "true", str) == 0 ||
14216                 my_strcasecmp(system_charset_info, "on", str) == 0)) {
14217       *return_value = TRUE;
14218     } else if (str && (my_strcasecmp(system_charset_info, "false", str) == 0 ||
14219                        my_strcasecmp(system_charset_info, "off", str) == 0)) {
14220       *return_value = FALSE;
14221     } else {
14222       return 1;
14223     }
14224   } else if (new_value_type == MYSQL_VALUE_TYPE_INT) {
14225     long long intbuf;
14226     value->val_int(value, &intbuf);
14227     if (intbuf > 1) return 1;
14228     *return_value = intbuf > 0 ? TRUE : FALSE;
14229   } else {
14230     return 1;
14231   }
14232 
14233   return 0;
14234 }
14235 
14236 int rocksdb_check_bulk_load(
14237     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
14238     void *save, struct st_mysql_value *value) {
14239   my_bool new_value;
14240   if (mysql_value_to_bool(value, &new_value) != 0) {
14241     return 1;
14242   }
14243 
14244   Rdb_transaction *tx = get_tx_from_thd(thd);
14245   if (tx != nullptr) {
14246     bool is_critical_error;
14247     const int rc = tx->finish_bulk_load(&is_critical_error);
14248     if (rc != 0 && is_critical_error) {
14249       // NO_LINT_DEBUG
14250       sql_print_error(
14251           "RocksDB: Error %d finalizing last SST file while "
14252           "setting bulk loading variable",
14253           rc);
14254       THDVAR(thd, bulk_load) = 0;
14255       return 1;
14256     }
14257   }
14258 
14259   *static_cast<bool *>(save) = new_value;
14260   return 0;
14261 }
14262 
14263 int rocksdb_check_bulk_load_allow_unsorted(
14264     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
14265     void *save, struct st_mysql_value *value) {
14266   my_bool new_value;
14267   if (mysql_value_to_bool(value, &new_value) != 0) {
14268     return 1;
14269   }
14270 
14271   if (THDVAR(thd, bulk_load)) {
14272     my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET",
14273              "Cannot change this setting while bulk load is enabled");
14274 
14275     return 1;
14276   }
14277 
14278   *static_cast<bool *>(save) = new_value;
14279   return 0;
14280 }
14281 
14282 static void rocksdb_set_max_background_jobs(THD *thd,
14283                                             struct st_mysql_sys_var *const var,
14284                                             void *const var_ptr,
14285                                             const void *const save) {
14286   DBUG_ASSERT(save != nullptr);
14287   DBUG_ASSERT(rocksdb_db_options != nullptr);
14288   DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14289 
14290   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14291 
14292   const int new_val = *static_cast<const int *>(save);
14293 
14294   if (rocksdb_db_options->max_background_jobs != new_val) {
14295     rocksdb_db_options->max_background_jobs = new_val;
14296     rocksdb::Status s =
14297         rdb->SetDBOptions({{"max_background_jobs", std::to_string(new_val)}});
14298 
14299     if (!s.ok()) {
14300       /* NO_LINT_DEBUG */
14301       sql_print_warning(
14302           "MyRocks: failed to update max_background_jobs. "
14303           "Status code = %d, status = %s.",
14304           s.code(), s.ToString().c_str());
14305     }
14306   }
14307 
14308   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14309 }
14310 
14311 static void rocksdb_set_bytes_per_sync(
14312     THD *thd MY_ATTRIBUTE((__unused__)),
14313     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14314     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14315   DBUG_ASSERT(save != nullptr);
14316   DBUG_ASSERT(rocksdb_db_options != nullptr);
14317   DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14318 
14319   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14320 
14321   const ulonglong new_val = *static_cast<const ulonglong *>(save);
14322 
14323   if (rocksdb_db_options->bytes_per_sync != new_val) {
14324     rocksdb_db_options->bytes_per_sync = new_val;
14325     rocksdb::Status s =
14326         rdb->SetDBOptions({{"bytes_per_sync", std::to_string(new_val)}});
14327 
14328     if (!s.ok()) {
14329       /* NO_LINT_DEBUG */
14330       sql_print_warning(
14331           "MyRocks: failed to update max_background_jobs. "
14332           "Status code = %d, status = %s.",
14333           s.code(), s.ToString().c_str());
14334     }
14335   }
14336 
14337   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14338 }
14339 
14340 static void rocksdb_set_wal_bytes_per_sync(
14341     THD *thd MY_ATTRIBUTE((__unused__)),
14342     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14343     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14344   DBUG_ASSERT(save != nullptr);
14345   DBUG_ASSERT(rocksdb_db_options != nullptr);
14346   DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14347 
14348   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14349 
14350   const ulonglong new_val = *static_cast<const ulonglong *>(save);
14351 
14352   if (rocksdb_db_options->wal_bytes_per_sync != new_val) {
14353     rocksdb_db_options->wal_bytes_per_sync = new_val;
14354     rocksdb::Status s =
14355         rdb->SetDBOptions({{"wal_bytes_per_sync", std::to_string(new_val)}});
14356 
14357     if (!s.ok()) {
14358       /* NO_LINT_DEBUG */
14359       sql_print_warning(
14360           "MyRocks: failed to update max_background_jobs. "
14361           "Status code = %d, status = %s.",
14362           s.code(), s.ToString().c_str());
14363     }
14364   }
14365 
14366   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14367 }
14368 
14369 /*
14370   Validating and updating block cache size via sys_var::check path.
14371   SetCapacity may take seconds when reducing block cache, and
14372   sys_var::update holds LOCK_global_system_variables mutex, so
14373   updating block cache size is done at check path instead.
14374 */
14375 static int rocksdb_validate_set_block_cache_size(
14376     THD *thd MY_ATTRIBUTE((__unused__)),
14377     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14378     void *var_ptr, struct st_mysql_value *value) {
14379   DBUG_ASSERT(value != nullptr);
14380 
14381   long long new_value;
14382 
14383   /* value is NULL */
14384   if (value->val_int(value, &new_value)) {
14385     return HA_EXIT_FAILURE;
14386   }
14387 
14388   if (new_value < RDB_MIN_BLOCK_CACHE_SIZE ||
14389       (uint64_t)new_value > (uint64_t)LLONG_MAX) {
14390     return HA_EXIT_FAILURE;
14391   }
14392 
14393   RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex);
14394   const rocksdb::BlockBasedTableOptions &table_options =
14395       rdb_get_table_options();
14396 
14397   if (rocksdb_block_cache_size != new_value && table_options.block_cache) {
14398     table_options.block_cache->SetCapacity(new_value);
14399   }
14400   *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
14401   RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex);
14402   return HA_EXIT_SUCCESS;
14403 }
14404 
14405 static int rocksdb_validate_update_cf_options(
14406     THD * /* unused */, struct st_mysql_sys_var * /*unused*/, void *save,
14407     struct st_mysql_value *value) {
14408   char buff[STRING_BUFFER_USUAL_SIZE];
14409   const char *str;
14410   int length;
14411   length = sizeof(buff);
14412   str = value->val_str(value, buff, &length);
14413   // In some cases, str can point to buff in the stack.
14414   // This can cause invalid memory access after validation is finished.
14415   // To avoid this kind case, let's alway duplicate the str if str is not
14416   // nullptr
14417   *(const char **)save = (str == nullptr) ? nullptr : my_strdup(PSI_INSTRUMENT_ME, str, MYF(0));
14418 
14419   if (str == nullptr) {
14420     return HA_EXIT_SUCCESS;
14421   }
14422 
14423   Rdb_cf_options::Name_to_config_t option_map;
14424 
14425   // Basic sanity checking and parsing the options into a map. If this fails
14426   // then there's no point to proceed.
14427   if (!Rdb_cf_options::parse_cf_options(str, &option_map)) {
14428     my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str);
14429     // Free what we've copied with my_strdup above.
14430     my_free((void*)(*(const char **)save));
14431     return HA_EXIT_FAILURE;
14432   }
14433   // Loop through option_map and create missing column families
14434   for (Rdb_cf_options::Name_to_config_t::iterator it = option_map.begin();
14435        it != option_map.end(); ++it) {
14436     cf_manager.get_or_create_cf(rdb, it->first);
14437   }
14438   return HA_EXIT_SUCCESS;
14439 }
14440 
14441 static void rocksdb_set_update_cf_options(
14442     THD *const /* unused */, struct st_mysql_sys_var *const /* unused */,
14443     void *const var_ptr, const void *const save) {
14444   const char *const val = *static_cast<const char *const *>(save);
14445 
14446   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14447 
14448   my_free(*reinterpret_cast<char **>(var_ptr));
14449 
14450   if (!val) {
14451     *reinterpret_cast<char **>(var_ptr) = nullptr;
14452     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14453     return;
14454   }
14455 
14456   DBUG_ASSERT(val != nullptr);
14457 
14458   // Reset the pointers regardless of how much success we had with updating
14459   // the CF options. This will results in consistent behavior and avoids
14460   // dealing with cases when only a subset of CF-s was successfully updated.
14461   *reinterpret_cast<const char **>(var_ptr) = val;
14462 
14463   // Do the real work of applying the changes.
14464   Rdb_cf_options::Name_to_config_t option_map;
14465 
14466   // This should never fail, because of rocksdb_validate_update_cf_options
14467   if (!Rdb_cf_options::parse_cf_options(val, &option_map)) {
14468     my_free(*reinterpret_cast<char**>(var_ptr));
14469     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14470     return;
14471   }
14472 
14473   // For each CF we have, see if we need to update any settings.
14474   for (const auto &cf_name : cf_manager.get_cf_names()) {
14475     DBUG_ASSERT(!cf_name.empty());
14476 
14477     rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
14478     DBUG_ASSERT(cfh != nullptr);
14479 
14480     const auto it = option_map.find(cf_name);
14481     std::string per_cf_options = (it != option_map.end()) ? it->second : "";
14482 
14483     if (!per_cf_options.empty()) {
14484       Rdb_cf_options::Name_to_config_t opt_map;
14485       rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map);
14486 
14487       if (s != rocksdb::Status::OK()) {
14488         // NO_LINT_DEBUG
14489         sql_print_warning(
14490             "MyRocks: failed to convert the options for column "
14491             "family '%s' to a map. %s",
14492             cf_name.c_str(), s.ToString().c_str());
14493       } else {
14494         DBUG_ASSERT(rdb != nullptr);
14495 
14496         // Finally we can apply the options.
14497         s = rdb->SetOptions(cfh, opt_map);
14498 
14499         if (s != rocksdb::Status::OK()) {
14500           // NO_LINT_DEBUG
14501           sql_print_warning(
14502               "MyRocks: failed to apply the options for column "
14503               "family '%s'. %s",
14504               cf_name.c_str(), s.ToString().c_str());
14505         } else {
14506           // NO_LINT_DEBUG
14507           sql_print_information(
14508               "MyRocks: options for column family '%s' "
14509               "have been successfully updated.",
14510               cf_name.c_str());
14511 
14512           // Make sure that data is internally consistent as well and update
14513           // the CF options. This is necessary also to make sure that the CF
14514           // options will be correctly reflected in the relevant table:
14515           // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA.
14516           rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh);
14517           std::string updated_options;
14518 
14519           s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options,
14520                                                         cf_options);
14521 
14522           DBUG_ASSERT(s == rocksdb::Status::OK());
14523           DBUG_ASSERT(!updated_options.empty());
14524 
14525           cf_manager.update_options_map(cf_name, updated_options);
14526         }
14527       }
14528     }
14529   }
14530 
14531   // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to
14532   // free up resources used before.
14533 
14534   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14535 }
14536 
14537 void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
14538 
14539 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
14540 
// Set the flag indicating a row-based replication Delete_rows event is
// about to be applied on this handler. Cleared by rpl_after_delete_rows().
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
14548 
// Clear the flag set by rpl_before_delete_rows() once the replication
// Delete_rows event has been applied.
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
14556 
// Set the flag indicating a row-based replication Update_rows event is
// about to be applied on this handler. Cleared by rpl_after_update_rows().
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
14564 
// Clear the flag set by rpl_before_update_rows() once the replication
// Update_rows event has been applied.
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
14572 
#if 0
// Disabled along with the rest of read-free replication (see the
// MARIAROCKS_NOT_YET / MDEV-10976 guard above): reports whether this
// table is flagged for read-free replication in its table definition.
bool ha_rocksdb::is_read_free_rpl_table() const {
  return table->s && m_tbl_def->m_is_read_free_rpl_table;
}
#endif
14578 
14579 /**
14580   @brief
14581   Read Free Replication can be used or not. Returning true means
14582   Read Free Replication can be used.
14583 */
14584 bool ha_rocksdb::use_read_free_rpl() const {
14585   DBUG_ENTER_FUNC();
14586 
14587   if (!ha_thd()->rli_slave || table->triggers || /* !is_read_free_rpl_table()*/ ) {
14588     DBUG_RETURN(false);
14589   }
14590 
14591 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
14592   switch (rocksdb_read_free_rpl) {
14593     case read_free_rpl_type::OFF:
14594       DBUG_RETURN(false);
14595     case read_free_rpl_type::PK_ONLY:
14596       DBUG_RETURN(!has_hidden_pk(table) && table->s->keys == 1);
14597     case read_free_rpl_type::PK_SK:
14598       DBUG_RETURN(!has_hidden_pk(table));
14599   }
14600 #else
14601   DBUG_RETURN(false);
14602 #endif
14603 
14604   DBUG_ASSERT(false);
14605   DBUG_RETURN(false);
14606 }
14607 #endif // MARIAROCKS_NOT_YET
14608 
14609 double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
14610   DBUG_ENTER_FUNC();
14611 
14612   if (index != table->s->primary_key) {
14613     /* Non covering index range scan */
14614     DBUG_RETURN(handler::read_time(index, ranges, rows));
14615   }
14616 
14617   DBUG_RETURN((rows / 20.0) + 1);
14618 }
14619 
14620 void ha_rocksdb::print_error(int error, myf errflag) {
14621   if (error == HA_ERR_ROCKSDB_STATUS_BUSY) {
14622     error = HA_ERR_LOCK_DEADLOCK;
14623   }
14624   handler::print_error(error, errflag);
14625 }
14626 
14627 std::string rdb_corruption_marker_file_name() {
14628   std::string ret(rocksdb_datadir);
14629   ret.append("/ROCKSDB_CORRUPTED");
14630   return ret;
14631 }
14632 
14633 void sql_print_verbose_info(const char *format, ...)
14634 {
14635   va_list args;
14636 
14637   if (global_system_variables.log_warnings > 2) {
14638     va_start(args, format);
14639     sql_print_information_v(format, args);
14640     va_end(args);
14641   }
14642 }
14643 
14644 }  // namespace myrocks
14645 
14646 
14647 /**
14648   Construct and emit duplicate key error message using information
14649   from table's record buffer.
14650 
14651   @sa print_keydup_error(table, key, msg, errflag, thd, org_table_name).
14652 */
14653 
14654 void print_keydup_error(TABLE *table, KEY *key, myf errflag,
14655                         const THD *thd, const char *org_table_name)
14656 {
14657   print_keydup_error(table, key, ER(ER_DUP_ENTRY_WITH_KEY_NAME), errflag);
14658 }
14659 
14660 /*
14661   Register the storage engine plugin outside of myrocks namespace
14662   so that mysql_declare_plugin does not get confused when it does
14663   its name generation.
14664 */
14665 
14666 
// Storage-engine descriptor referenced by the plugin declaration below.
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};

maria_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
    &rocksdb_storage_engine,           /* Plugin Descriptor */
    "ROCKSDB",                         /* Plugin Name */
    "Monty Program Ab",                /* Plugin Author */
    "RocksDB storage engine",          /* Plugin Description */
    PLUGIN_LICENSE_GPL,                /* Plugin Licence */
    myrocks::rocksdb_init_func,        /* Plugin Entry Point */
    myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
    0x0001,                            /* version number (0.1) */
    myrocks::rocksdb_status_vars,      /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
  "1.0",                                        /* string version */
  myrocks::MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL /* maturity level */
},
    /* The remaining entries register the MyRocks INFORMATION_SCHEMA
       tables (each is a full plugin descriptor defined elsewhere). */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map,
    myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info
maria_declare_plugin_end;
14693