1 /*
2    Copyright (c) 2012, Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
16 
17 #ifdef USE_PRAGMA_IMPLEMENTATION
18 #pragma implementation  // gcc: Class implementation
19 #endif
20 
21 #define MYSQL_SERVER 1
22 
23 /* For use of 'PRIu64': */
24 #define __STDC_FORMAT_MACROS
25 
26 #include <my_global.h>
27 
28 #include <inttypes.h>
29 
30 /* The C++ file's header */
31 #include "./ha_rocksdb.h"
32 
33 /* C++ standard header files */
34 #include <inttypes.h>
35 #include <algorithm>
36 #include <limits>
37 #include <map>
38 #include <queue>
39 #include <set>
40 #include <string>
41 #include <vector>
42 
43 /* MySQL includes */
44 #include <mysql/psi/mysql_table.h>
45 // #include <thread_pool_priv.h>
46 #include <mysys_err.h>
47 #include "./debug_sync.h"
48 #include "./my_bit.h"
49 #include "./my_stacktrace.h"
50 #include "./my_sys.h"
51 #include "./sql_audit.h"
52 #include "./sql_table.h"
53 #include "./sql_hset.h"
54 #ifdef MARIAROCKS_NOT_YET
55 #endif
56 
57 /* RocksDB includes */
58 #include "monitoring/histogram.h"
59 #include "rocksdb/compaction_filter.h"
60 #include "rocksdb/env.h"
61 #include "rocksdb/memory_allocator.h"
62 #include "rocksdb/persistent_cache.h"
63 #include "rocksdb/rate_limiter.h"
64 #include "rocksdb/slice_transform.h"
65 #include "rocksdb/thread_status.h"
66 #include "rocksdb/utilities/checkpoint.h"
67 #include "rocksdb/utilities/convenience.h"
68 #include "rocksdb/utilities/memory_util.h"
69 #include "rocksdb/utilities/sim_cache.h"
70 #include "rocksdb/utilities/write_batch_with_index.h"
71 #include "util/stop_watch.h"
72 #include "./rdb_source_revision.h"
73 
74 // MariaRocks: this is needed to access RocksDB debug syncpoints:
75 #include "test_util/sync_point.h"
76 
77 /* MyRocks includes */
78 #include "./event_listener.h"
79 #include "./ha_rocksdb_proto.h"
80 #include "./logger.h"
81 #include "./nosql_access.h"
82 #include "./rdb_cf_manager.h"
83 #include "./rdb_cf_options.h"
84 #include "./rdb_converter.h"
85 #include "./rdb_datadic.h"
86 #include "./rdb_i_s.h"
87 #include "./rdb_index_merge.h"
88 #include "./rdb_mutex_wrapper.h"
89 #include "./rdb_psi.h"
90 #include "./rdb_threads.h"
91 #include "./rdb_mariadb_server_port.h"
92 
93 // Internal MySQL APIs not exposed in any header.
94 extern "C" {
95 /**
96   Mark transaction to rollback and mark error as fatal to a sub-statement.
97   @param  thd   Thread handle
98   @param  all   TRUE <=> rollback main transaction.
99 */
100 void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
101 
102 /**
103  *   Get the user thread's binary logging format
104  *   @param thd  user thread
105  *   @return Value to be used as index into the binlog_format_names array
106  */
107 int thd_binlog_format(const MYSQL_THD thd);
108 
109 /**
110  *   Check if binary logging is filtered for thread's current db.
111  *   @param  thd   Thread handle
112  *   @retval 1 the query is not filtered, 0 otherwise.
113  */
114 bool thd_binlog_filter_ok(const MYSQL_THD thd);
115 }
116 
117 extern my_bool opt_core_file;
118 
119 // Needed in rocksdb_init_func
120 void ignore_db_dirs_append(const char *dirname_arg);
121 
122 
123 namespace myrocks {
124 
125 static st_global_stats global_stats;
126 static st_export_stats export_stats;
127 static st_memory_stats memory_stats;
128 static st_io_stall_stats io_stall_stats;
129 
130 const std::string DEFAULT_CF_NAME("default");
131 const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
132 const std::string PER_INDEX_CF_NAME("$per_index_cf");
133 
134 static std::vector<GL_INDEX_ID> rdb_indexes_to_recalc;
135 
136 #ifdef MARIADB_NOT_YET
class Rdb_explicit_snapshot : public explicit_snapshot {
 public:
  /*
    Wrap an existing RocksDB snapshot in a ManagedSnapshot, assign it the
    next global snapshot id (stored into ss_info->snapshot_id), and register
    it in the global registry so it can later be looked up by id.

    @param ss_info   snapshot metadata; snapshot_id is filled in here
    @param db        RocksDB instance owning the snapshot
    @param snapshot  the raw snapshot to take ownership of
    @return shared_ptr to the new object, or nullptr on allocation failure
  */
  static std::shared_ptr<Rdb_explicit_snapshot> create(
      snapshot_info_st *ss_info, rocksdb::DB *db,
      const rocksdb::Snapshot *snapshot) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto s = std::unique_ptr<rocksdb::ManagedSnapshot>(
        new rocksdb::ManagedSnapshot(db, snapshot));
    if (!s) {
      return nullptr;
    }
    ss_info->snapshot_id = ++explicit_snapshot_counter;
    auto ret = std::make_shared<Rdb_explicit_snapshot>(*ss_info, std::move(s));
    if (!ret) {
      return nullptr;
    }
    // The registry stores weak_ptrs; the destructor removes the entry.
    explicit_snapshots[ss_info->snapshot_id] = ret;
    return ret;
  }

  /* Build a human-readable dump of all live explicit snapshots. */
  static std::string dump_snapshots() {
    std::string str;
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    for (const auto &elem : explicit_snapshots) {
      const auto &ss = elem.second.lock();
      // Entries are erased in the destructor, so a registered weak_ptr
      // should always be lockable while we hold the mutex.
      DBUG_ASSERT(ss != nullptr);
      const auto &info = ss->ss_info;
      str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) +
             "\nBinlog File: " + info.binlog_file +
             "\nBinlog Pos: " + std::to_string(info.binlog_pos) +
             "\nGtid Executed: " + info.gtid_executed + "\n";
    }

    return str;
  }

  /*
    Look up a live snapshot by id.
    @return the snapshot, or nullptr if the id is unknown or already gone.
  */
  static std::shared_ptr<Rdb_explicit_snapshot> get(
      const ulonglong snapshot_id) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto elem = explicit_snapshots.find(snapshot_id);
    if (elem == explicit_snapshots.end()) {
      return nullptr;
    }
    return elem->second.lock();
  }

  /* Non-owning accessor for the wrapped ManagedSnapshot. */
  rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); }

  Rdb_explicit_snapshot(snapshot_info_st ss_info,
                        std::unique_ptr<rocksdb::ManagedSnapshot> &&snapshot)
      : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {}

  /* Deregister from the global registry on destruction. */
  virtual ~Rdb_explicit_snapshot() {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    explicit_snapshots.erase(ss_info.snapshot_id);
  }

 private:
  std::unique_ptr<rocksdb::ManagedSnapshot> snapshot;

  /* Guards the counter and the registry below. */
  static std::mutex explicit_snapshot_mutex;
  static ulonglong explicit_snapshot_counter;
  /* snapshot_id -> snapshot; weak_ptrs so the registry never keeps one alive. */
  static std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
      explicit_snapshots;
};

std::mutex Rdb_explicit_snapshot::explicit_snapshot_mutex;
ulonglong Rdb_explicit_snapshot::explicit_snapshot_counter = 0;
std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
    Rdb_explicit_snapshot::explicit_snapshots;
207 #endif
208 
209 /**
210   Updates row counters based on the table type and operation type.
211 */
update_row_stats(const operation_type & type)212 void ha_rocksdb::update_row_stats(const operation_type &type) {
213   DBUG_ASSERT(type < ROWS_MAX);
214   // Find if we are modifying system databases.
215   if (table->s && m_tbl_def->m_is_mysql_system_table) {
216     global_stats.system_rows[type].inc();
217   } else {
218     global_stats.rows[type].inc();
219   }
220 }
221 
222 void dbug_dump_database(rocksdb::DB *db);
223 static handler *rocksdb_create_handler(my_core::handlerton *hton,
224                                        my_core::TABLE_SHARE *table_arg,
225                                        my_core::MEM_ROOT *mem_root);
226 
getCompactRangeOptions(int concurrency=0)227 static rocksdb::CompactRangeOptions getCompactRangeOptions(
228     int concurrency = 0) {
229   rocksdb::CompactRangeOptions compact_range_options;
230   compact_range_options.bottommost_level_compaction =
231       rocksdb::BottommostLevelCompaction::kForce;
232   compact_range_options.exclusive_manual_compaction = false;
233   if (concurrency > 0) {
234     compact_range_options.max_subcompactions = concurrency;
235   }
236   return compact_range_options;
237 }
238 
239 ///////////////////////////////////////////////////////////
240 // Parameters and settings
241 ///////////////////////////////////////////////////////////
242 static char *rocksdb_default_cf_options = nullptr;
243 static char *rocksdb_override_cf_options = nullptr;
244 static char *rocksdb_update_cf_options = nullptr;
245 
246 ///////////////////////////////////////////////////////////
247 // Globals
248 ///////////////////////////////////////////////////////////
249 handlerton *rocksdb_hton;
250 
251 rocksdb::TransactionDB *rdb = nullptr;
252 rocksdb::HistogramImpl *commit_latency_stats = nullptr;
253 
254 static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
255 static std::unique_ptr<rocksdb::Env> flashcache_aware_env;
256 static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;
257 
258 Rdb_dict_manager dict_manager;
259 Rdb_cf_manager cf_manager;
260 Rdb_ddl_manager ddl_manager;
261 Rdb_binlog_manager binlog_manager;
262 
263 #if !defined(_WIN32) && !defined(__APPLE__)
264 Rdb_io_watchdog *io_watchdog = nullptr;
265 #endif
266 /**
267   MyRocks background thread control
268   N.B. This is besides RocksDB's own background threads
269        (@see rocksdb::CancelAllBackgroundWork())
270 */
271 
272 static Rdb_background_thread rdb_bg_thread;
273 
274 static Rdb_manual_compaction_thread rdb_mc_thread;
275 
276 // List of table names (using regex) that are exceptions to the strict
277 // collation check requirement.
278 Regex_list_handler *rdb_collation_exceptions;
279 
280 static const char **rdb_get_error_messages(int nr);
281 
rocksdb_flush_all_memtables()282 static void rocksdb_flush_all_memtables() {
283   const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
284   for (const auto &cf_handle : cf_manager.get_all_cf()) {
285     rdb->Flush(rocksdb::FlushOptions(), cf_handle);
286   }
287 }
288 
/*
  Update callback for the column-family-delete sysvar.  Intentionally empty:
  the work happens in the check callback (rocksdb_delete_column_family);
  the presence of an update callback marks the variable as not read-only.
*/
static void rocksdb_delete_column_family_stub(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, const void *const /* save */) {}
292 
/*
  Check callback meant to drop the column family named by 'value'.

  @return HA_EXIT_SUCCESS when no CF name was supplied or the drop
          succeeded; otherwise a non-zero error code.

  NOTE: currently disabled via the unconditional return below; the
  remaining (unreachable) code is kept intentionally for when the
  create-CF/drop-CF race is resolved.
*/
static int rocksdb_delete_column_family(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, struct st_mysql_value *const value) {
  // Return failure for now until the race condition between creating
  // CF and deleting CF is resolved
  return HA_EXIT_FAILURE;

  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto &cf_manager = rdb_get_cf_manager();
    auto ret = cf_manager.drop_cf(cf);
    if (ret == HA_EXIT_SUCCESS) {
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Dropped column family: %s\n", cf);
    } else {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Failed to drop column family: %s, error: %d\n",
                      cf, ret);
    }

    return ret;
  }

  return HA_EXIT_SUCCESS;
}
322 
323 ///////////////////////////////////////////////////////////
324 // Hash map: table name => open table handler
325 ///////////////////////////////////////////////////////////
326 
327 namespace  // anonymous namespace = not visible outside this source file
328 {
329 
330 typedef Hash_set<Rdb_table_handler> Rdb_table_set;
331 
/*
  Tracks the shared Rdb_table_handler objects of currently open tables,
  keyed by table name.  All mutations go through the internal mutex.
*/
class Rdb_open_tables_map {
 private:
  /* Hash table used to track the handlers of open tables */
  std::unordered_map<std::string, Rdb_table_handler *> m_table_map;

  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

 public:
  /* Initialize the (empty) map and its protecting mutex. */
  void init() {
    m_table_map.clear();
    mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &m_mutex, MY_MUTEX_INIT_FAST);
  }

  /* Clear the map and destroy the mutex (shutdown path). */
  void free() {
    m_table_map.clear();
    mysql_mutex_destroy(&m_mutex);
  }

  /* Number of tables currently tracked.  NOTE(review): reads the map
     without taking m_mutex -- presumably tolerated as approximate. */
  size_t count() { return m_table_map.size(); }

  /* Acquire the handler for the given table name (defined elsewhere). */
  Rdb_table_handler *get_table_handler(const char *const table_name);
  /* Release a handler obtained via get_table_handler (defined elsewhere). */
  void release_table_handler(Rdb_table_handler *const table_handler);

  /* Snapshot of all tracked table names. */
  std::vector<std::string> get_table_names(void) const;
};
357 
358 }  // anonymous namespace
359 
360 static Rdb_open_tables_map rdb_open_tables;
361 
/*
  Normalize a directory path by stripping any trailing '/' characters.
  "/a/b///" -> "/a/b"; a string of only slashes becomes empty.
*/
static std::string rdb_normalize_dir(std::string dir) {
  while (!dir.empty() && dir.back() == '/') {
    dir.pop_back();
  }
  return dir;
}
368 
/*
  Check callback for the checkpoint-creation sysvar: create a RocksDB
  checkpoint in the directory named by 'value'.

  @return HA_EXIT_SUCCESS if the checkpoint was created,
          HA_EXIT_FAILURE otherwise (including when no directory was
          given or the DB is not yet open).
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      // Strip trailing slashes before handing the path to RocksDB.
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propagate up to the client.
      int rc __attribute__((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        delete checkpoint;
        if (status.ok()) {
          // NO_LINT_DEBUG
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
408 
/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only.
   Intentionally empty: all work is done in rocksdb_create_checkpoint(). */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
415 
/* Update stub for the force-flush sysvar.  Intentionally empty: the flush
   itself is performed by rocksdb_force_flush_memtable_now(). */
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
419 
/*
  Check callback for the force-flush sysvar: flush every column family's
  memtable.  Always reports success.
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
428 
/* Update stub for the memtable-and-L0 flush sysvar.  Intentionally empty:
   the work is done in rocksdb_force_flush_memtable_and_lzero_now(). */
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
432 
/*
  Check callback for the memtable-and-L0 flush sysvar: flush all memtables,
  then compact every column family's L0 files down one level.

  For each CF the L0 compaction is retried up to max_attempts times,
  because CompactFiles can race with automatic compaction and fail with
  InvalidArgument (file no longer present).

  @return HA_EXIT_SUCCESS if every CF was handled, HA_EXIT_FAILURE if any
          CF exhausted its retries (or on an unexpected compaction error).
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  int i, max_attempts = 3, num_errors = 0;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    for (i = 0; i < max_attempts; i++) {
      // Re-read metadata each attempt: the L0 file set may have changed.
      rdb->GetColumnFamilyMetaData(cf_handle, &metadata);
      cf_handle->GetDescriptor(&cf_descr);
      c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

      DBUG_ASSERT(metadata.levels[0].level == 0);
      std::vector<std::string> file_names;
      for (auto &file : metadata.levels[0].files) {
        file_names.emplace_back(file.db_path + file.name);
      }

      // No L0 files left for this CF -- nothing to compact.
      if (file_names.empty()) {
        break;
      }

      rocksdb::Status s;
      // Compact the collected L0 files into level 1.
      s = rdb->CompactFiles(c_options, cf_handle, file_names, 1);

      // Due to a race, it's possible for CompactFiles to collide
      // with auto compaction, causing an error to return
      // regarding file not found. In that case, retry.
      if (s.IsInvalidArgument()) {
        continue;
      }

      if (!s.ok() && !s.IsAborted()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
        return HA_EXIT_FAILURE;
      }
      break;
    }
    // Loop index reaching max_attempts means every retry failed.
    if (i == max_attempts) {
      num_errors++;
    }
  }

  return num_errors == 0 ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
}
486 
487 static void rocksdb_drop_index_wakeup_thread(
488     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
489     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
490     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save);
491 
492 static my_bool rocksdb_pause_background_work = 0;
493 static mysql_mutex_t rdb_sysvars_mutex;
494 static mysql_mutex_t rdb_block_cache_resize_mutex;
495 
rocksdb_set_pause_background_work(my_core::THD * const,struct st_mysql_sys_var * const,void * const,const void * const save)496 static void rocksdb_set_pause_background_work(
497     my_core::THD *const,
498     struct st_mysql_sys_var *const,
499     void *const, const void *const save) {
500   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
501   const my_bool pause_requested = *static_cast<const my_bool *>(save);
502   if (rocksdb_pause_background_work != pause_requested) {
503     if (pause_requested) {
504       rdb->PauseBackgroundWork();
505     } else {
506       rdb->ContinueBackgroundWork();
507     }
508     rocksdb_pause_background_work = pause_requested;
509   }
510   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
511 }
512 
513 static void rocksdb_set_compaction_options(THD *thd,
514                                            struct st_mysql_sys_var *var,
515                                            void *var_ptr, const void *save);
516 
517 static void rocksdb_set_table_stats_sampling_pct(THD *thd,
518                                                  struct st_mysql_sys_var *var,
519                                                  void *var_ptr,
520                                                  const void *save);
521 
522 static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
523                                                    struct st_mysql_sys_var *var,
524                                                    void *var_ptr,
525                                                    const void *save);
526 
527 static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd,
528                                                    struct st_mysql_sys_var *var,
529                                                    void *var_ptr,
530                                                    const void *save);
531 
532 static void rocksdb_set_delayed_write_rate(THD *thd,
533                                            struct st_mysql_sys_var *var,
534                                            void *var_ptr, const void *save);
535 
536 static void rocksdb_set_max_latest_deadlocks(THD *thd,
537                                              struct st_mysql_sys_var *var,
538                                              void *var_ptr, const void *save);
539 
540 static void rdb_set_collation_exception_list(const char *exception_list);
541 static void rocksdb_set_collation_exception_list(THD *thd,
542                                                  struct st_mysql_sys_var *var,
543                                                  void *var_ptr,
544                                                  const void *save);
545 
546 static int rocksdb_validate_update_cf_options(THD *thd,
547                                               struct st_mysql_sys_var *var,
548                                               void *save,
549                                               st_mysql_value *value);
550 
551 static void rocksdb_set_update_cf_options(THD *thd,
552                                           struct st_mysql_sys_var *var,
553                                           void *var_ptr, const void *save);
554 
555 static int rocksdb_check_bulk_load(
556     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
557     void *save, struct st_mysql_value *value);
558 
559 static int rocksdb_check_bulk_load_allow_unsorted(
560     THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
561     void *save, struct st_mysql_value *value);
562 
563 static void rocksdb_set_max_background_jobs(THD *thd,
564                                             struct st_mysql_sys_var *const var,
565                                             void *const var_ptr,
566                                             const void *const save);
567 static void rocksdb_set_bytes_per_sync(THD *thd,
568                                        struct st_mysql_sys_var *const var,
569                                        void *const var_ptr,
570                                        const void *const save);
571 static void rocksdb_set_wal_bytes_per_sync(THD *thd,
572                                            struct st_mysql_sys_var *const var,
573                                            void *const var_ptr,
574                                            const void *const save);
575 static int rocksdb_validate_set_block_cache_size(
576     THD *thd, struct st_mysql_sys_var *const var, void *var_ptr,
577     struct st_mysql_value *value);
578 //////////////////////////////////////////////////////////////////////////////
579 // Options definitions
580 //////////////////////////////////////////////////////////////////////////////
581 static long long rocksdb_block_cache_size;
582 static long long rocksdb_sim_cache_size;
583 static my_bool rocksdb_use_clock_cache;
584 static double rocksdb_cache_high_pri_pool_ratio;
585 static my_bool rocksdb_cache_dump;
586 /* Use unsigned long long instead of uint64_t because of MySQL compatibility */
587 static unsigned long long  // NOLINT(runtime/int)
588     rocksdb_rate_limiter_bytes_per_sec;
589 static unsigned long long  // NOLINT(runtime/int)
590     rocksdb_sst_mgr_rate_bytes_per_sec;
591 static unsigned long long rocksdb_delayed_write_rate;
592 static uint32_t rocksdb_max_latest_deadlocks;
593 static unsigned long  // NOLINT(runtime/int)
594     rocksdb_persistent_cache_size_mb;
595 static ulong rocksdb_info_log_level;
596 static char *rocksdb_wal_dir;
597 static char *rocksdb_persistent_cache_path;
598 static ulong rocksdb_index_type;
599 static uint32_t rocksdb_flush_log_at_trx_commit;
600 static uint32_t rocksdb_debug_optimizer_n_rows;
601 static my_bool rocksdb_force_compute_memtable_stats;
602 static uint32_t rocksdb_force_compute_memtable_stats_cachetime;
603 static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
604 static uint32_t rocksdb_wal_recovery_mode;
605 static uint32_t rocksdb_stats_level;
606 static uint32_t rocksdb_access_hint_on_compaction_start;
607 static char *rocksdb_compact_cf_name;
608 static char *rocksdb_delete_cf_name;
609 static char *rocksdb_checkpoint_name;
610 static my_bool rocksdb_signal_drop_index_thread;
611 static my_bool rocksdb_signal_remove_mariabackup_checkpoint;
612 static my_bool rocksdb_strict_collation_check = 1;
613 static my_bool rocksdb_ignore_unknown_options = 1;
614 static my_bool rocksdb_enable_2pc = 0;
615 static char *rocksdb_strict_collation_exceptions;
616 static my_bool rocksdb_collect_sst_properties = 1;
617 static my_bool rocksdb_force_flush_memtable_now_var = 0;
618 static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = 0;
619 static my_bool rocksdb_enable_ttl = 1;
620 static my_bool rocksdb_enable_ttl_read_filtering = 1;
621 static int rocksdb_debug_ttl_rec_ts = 0;
622 static int rocksdb_debug_ttl_snapshot_ts = 0;
623 static int rocksdb_debug_ttl_read_filter_ts = 0;
624 static my_bool rocksdb_debug_ttl_ignore_pk = 0;
625 static my_bool rocksdb_reset_stats = 0;
626 static uint32_t rocksdb_io_write_timeout_secs = 0;
627 static uint32_t rocksdb_seconds_between_stat_computes = 3600;
628 static long long rocksdb_compaction_sequential_deletes = 0l;
629 static long long rocksdb_compaction_sequential_deletes_window = 0l;
630 static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
631 static uint32_t rocksdb_validate_tables = 1;
632 static char *rocksdb_datadir;
633 static uint32_t rocksdb_table_stats_sampling_pct;
634 static my_bool rocksdb_enable_bulk_load_api = 1;
635 static my_bool rocksdb_print_snapshot_conflict_queries = 0;
636 static my_bool rocksdb_large_prefix = 0;
637 static my_bool rocksdb_allow_to_start_after_corruption = 0;
638 static char* rocksdb_git_hash;
639 
640 uint32_t rocksdb_ignore_datadic_errors = 0;
641 
642 char *compression_types_val=
643   const_cast<char*>(get_rocksdb_supported_compression_types());
644 static unsigned long rocksdb_write_policy =
645     rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;
646 
647 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
648 char *rocksdb_read_free_rpl_tables;
649 std::mutex rocksdb_read_free_rpl_tables_mutex;
650 #if defined(HAVE_PSI_INTERFACE)
651 Regex_list_handler rdb_read_free_regex_handler(key_rwlock_read_free_rpl_tables);
652 #else
653 Regex_list_handler rdb_read_free_regex_handler;
654 #endif
655 enum read_free_rpl_type { OFF = 0, PK_ONLY, PK_SK };
656 static unsigned long rocksdb_read_free_rpl = read_free_rpl_type::OFF;
657 #endif
658 
659 static my_bool rocksdb_error_on_suboptimal_collation = 1;
660 static uint32_t rocksdb_stats_recalc_rate = 0;
661 static uint32_t rocksdb_debug_manual_compaction_delay = 0;
662 static uint32_t rocksdb_max_manual_compactions = 0;
663 static my_bool rocksdb_rollback_on_timeout = FALSE;
664 static my_bool rocksdb_enable_insert_with_update_caching = TRUE;
665 
666 std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
667 std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
668 std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
669 std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
670 std::atomic<uint64_t> rocksdb_manual_compactions_processed(0);
671 std::atomic<uint64_t> rocksdb_manual_compactions_running(0);
672 #ifndef DBUG_OFF
673 std::atomic<uint64_t> rocksdb_num_get_for_update_calls(0);
674 #endif
675 
676 
677 
678 /*
679   Remove directory with files in it.
680   Used to remove checkpoint created by mariabackup.
681 */
682 #ifdef _WIN32
683 #include <direct.h> /* unlink*/
684 #ifndef F_OK
685 #define F_OK 0
686 #endif
687 #endif
688 
rmdir_force(const char * dir)689 static int rmdir_force(const char *dir) {
690   if (access(dir, F_OK))
691     return true;
692 
693   char path[FN_REFLEN];
694   char sep[] = {FN_LIBCHAR, 0};
695   int err = 0;
696 
697   MY_DIR *dir_info = my_dir(dir, MYF(MY_DONT_SORT | MY_WANT_STAT));
698   if (!dir_info)
699     return 1;
700 
701   for (uint i = 0; i < dir_info->number_of_files; i++) {
702     FILEINFO *file = dir_info->dir_entry + i;
703 
704     strxnmov(path, sizeof(path), dir, sep, file->name, NULL);
705 
706     err = my_delete(path, 0);
707 
708     if (err) {
709       break;
710     }
711   }
712 
713   my_dirend(dir_info);
714 
715   if (!err)
716     err = rmdir(dir);
717 
718   return (err == 0) ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
719 }
720 
721 
rocksdb_remove_mariabackup_checkpoint(my_core::THD * const,struct st_mysql_sys_var * const,void * const var_ptr,const void * const)722 static void rocksdb_remove_mariabackup_checkpoint(
723     my_core::THD *const,
724     struct st_mysql_sys_var *const ,
725     void *const var_ptr, const void *const) {
726   std::string mariabackup_checkpoint_dir(rocksdb_datadir);
727 
728   mariabackup_checkpoint_dir.append("/mariabackup-checkpoint");
729 
730   if (unlink(mariabackup_checkpoint_dir.c_str())  == 0)
731     return;
732 
733   rmdir_force(mariabackup_checkpoint_dir.c_str());
734 }
735 
736 
rdb_init_rocksdb_db_options(void)737 static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
738   auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());
739 
740   o->create_if_missing = true;
741   o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
742   o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
743   o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
744   o->max_open_files = -2;  // auto-tune to 50% open_files_limit
745 
746   o->two_write_queues = true;
747   o->manual_wal_flush = true;
748   return o;
749 }
750 
751 /* DBOptions contains Statistics and needs to be destructed last */
752 static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options =
753     std::unique_ptr<rocksdb::BlockBasedTableOptions>(
754         new rocksdb::BlockBasedTableOptions());
755 static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options =
756     rdb_init_rocksdb_db_options();
757 
758 static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;
759 
760 /* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */
761 static const char *write_policy_names[] = {"write_committed", "write_prepared",
762                                            "write_unprepared", NullS};
763 
764 static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1,
765                                        "write_policy_typelib",
766                                        write_policy_names, nullptr};
767 
#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
/* This array needs to be kept up to date with myrocks::read_free_rpl_type */
static const char *read_free_rpl_names[] = {"OFF", "PK_ONLY", "PK_SK", NullS};

static TYPELIB read_free_rpl_typelib = {array_elements(read_free_rpl_names) - 1,
                                        "read_free_rpl_typelib",
                                        read_free_rpl_names, nullptr};
#endif

/* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
static const char *info_log_level_names[] = {"debug_level", "info_level",
                                             "warn_level",  "error_level",
                                             "fatal_level", NullS};

// TYPELIB backing the rocksdb_info_log_level enum sysvar declared below.
static TYPELIB info_log_level_typelib = {
    array_elements(info_log_level_names) - 1, "info_log_level_typelib",
    info_log_level_names, nullptr};
785 
/*
  Sysvar update callback for rocksdb_info_log_level: stores the new level in
  the global and pushes it into the live DBOptions info_log object so RocksDB
  starts filtering at the new threshold immediately.
*/
static void rocksdb_set_rocksdb_info_log_level(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {
  DBUG_ASSERT(save != nullptr);

  // Serialize with other sysvar updates that touch rocksdb_db_options.
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
  rocksdb_db_options->info_log->SetInfoLogLevel(
      static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
797 
/*
  Sysvar update callback for rocksdb_stats_level: forwards the new level to
  the statistics object inside DBOptions, then reads it back so the global
  reflects whatever value the statistics object actually adopted.
*/
static void rocksdb_set_rocksdb_stats_level(THD *const thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_db_options->statistics->set_stats_level(
      static_cast<rocksdb::StatsLevel>(
          *static_cast<const uint64_t *>(save)));
  // Actual stats level is defined at rocksdb dbopt::statistics::stats_level_
  // so adjusting rocksdb_stats_level here to make sure it points to
  // the correct stats level.
  rocksdb_stats_level = rocksdb_db_options->statistics->get_stats_level();
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
814 
/*
  Sysvar update callback for rocksdb_reset_stats: when the stored value is
  true, resets both the DB-level stats (rdb->ResetStats()) and the shared
  rocksdb_stats statistics object.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
  DBUG_ASSERT(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // var_ptr points at the rocksdb_reset_stats global checked just below.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    DBUG_ASSERT(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    DBUG_ASSERT(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
840 
/*
  Sysvar update callback for rocksdb_io_write_timeout_secs: stores the new
  timeout and, on platforms where the I/O watchdog exists (everything except
  Windows and macOS), re-arms the watchdog with it.
*/
static void rocksdb_set_io_write_timeout(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
#if !defined(_WIN32) && !defined(__APPLE__)
  DBUG_ASSERT(io_watchdog != nullptr);
#endif

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  rocksdb_io_write_timeout_secs = new_val;
#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
861 
/*
  Values for the rocksdb_flush_log_at_trx_commit sysvar.
  FLUSH_LOG_MAX is a sentinel for range checks and must stay last.
*/
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,
  FLUSH_LOG_SYNC,
  FLUSH_LOG_BACKGROUND,
  FLUSH_LOG_MAX /* must be last */
};
868 
rocksdb_validate_flush_log_at_trx_commit(THD * const thd,struct st_mysql_sys_var * const var,void * var_ptr,struct st_mysql_value * const value)869 static int rocksdb_validate_flush_log_at_trx_commit(
870     THD *const thd,
871     struct st_mysql_sys_var *const var, /* in: pointer to system variable */
872     void *var_ptr, /* out: immediate result for update function */
873     struct st_mysql_value *const value /* in: incoming value */) {
874   long long new_value;
875 
876   /* value is NULL */
877   if (value->val_int(value, &new_value)) {
878     return HA_EXIT_FAILURE;
879   }
880 
881   if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) {
882     return HA_EXIT_FAILURE;
883   }
884 
885   *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value);
886   return HA_EXIT_SUCCESS;
887 }
/*
  Intentionally empty update callback for the compact_cf sysvar.
  NOTE(review): the real work appears to happen in the companion check
  function rocksdb_compact_column_family (forward-declared below) — confirm
  at its definition.
*/
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}

static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value);

static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};

// TYPELIB backing the index_type enum sysvar.
static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
                                     "index_type_typelib", index_type_names,
                                     nullptr};

/* Limits and defaults used by the sysvar declarations below. */
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_MAX_ROW_LOCKS = 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024;
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;
const ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
918 
// Session-scoped (THDVAR) and global (SYSVAR) tunables for locking and
// bulk-load behavior.

// TODO: 0 means don't wait at all, and we don't support it yet?
static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
                          "Number of seconds to wait for lock", nullptr,
                          nullptr, /*default*/ 1, /*min*/ 1,
                          /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);

static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
                         "Enables deadlock detection", nullptr, nullptr, FALSE);

static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG,
                          "Number of transactions deadlock detection will "
                          "traverse through before assuming deadlock",
                          nullptr, nullptr,
                          /*default*/ RDB_DEADLOCK_DETECT_DEPTH,
                          /*min*/ 2,
                          /*max*/ ULONG_MAX, 0);

static MYSQL_THDVAR_BOOL(
    commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG,
    "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr,
    nullptr, TRUE);

static MYSQL_THDVAR_BOOL(
    trace_sst_api, PLUGIN_VAR_RQCMDARG,
    "Generate trace output in the log for each call to the SstFileWriter",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    bulk_load, PLUGIN_VAR_RQCMDARG,
    "Use bulk-load mode for inserts. This disables "
    "unique_checks and enables rocksdb_commit_in_the_middle.",
    rocksdb_check_bulk_load, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG,
                         "Allow bulk loading of sk keys during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         /* Intentionally reuse unsorted's check function */
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         FALSE);

static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG,
                         "Allow unsorted input during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         FALSE);

static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables using SstFileWriter for bulk loading",
                         nullptr, nullptr, rocksdb_enable_bulk_load_api);

static MYSQL_SYSVAR_STR(git_hash, rocksdb_git_hash,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "Git revision of the RocksDB library used by MyRocks",
                        nullptr, nullptr, ROCKSDB_GIT_HASH);

static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Directory for temporary files during DDL operations.",
                        nullptr, nullptr, "");

// Default regex matches every table name.
#define DEFAULT_SKIP_UNIQUE_CHECK_TABLES ".*"
static MYSQL_THDVAR_STR(
    skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Skip unique constraint checking for the specified tables", nullptr,
    nullptr, DEFAULT_SKIP_UNIQUE_CHECK_TABLES);

static MYSQL_THDVAR_BOOL(
    commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
    "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
    "update and delete",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    blind_delete_primary_key, PLUGIN_VAR_RQCMDARG,
    "Deleting rows by primary key lookup, without reading rows (Blind Deletes)."
    " Blind delete is disabled if the table has secondary key",
    nullptr, nullptr, FALSE);
996 
#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported

static const char *DEFAULT_READ_FREE_RPL_TABLES = ".*";

/*
  Sysvar check callback for rocksdb_read_free_rpl_tables: rejects the new
  pattern list if it does not compile; otherwise stores a strdup'ed copy in
  *save for the update callback to install.
*/
static int rocksdb_validate_read_free_rpl_tables(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
    struct st_mysql_value *value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int length = sizeof(buff);
  const char *wlist_buf = value->val_str(value, buff, &length);
  // NULL value (e.g. SET ... = DEFAULT) falls back to the default pattern.
  const auto wlist = wlist_buf ? wlist_buf : DEFAULT_READ_FREE_RPL_TABLES;

#if defined(HAVE_PSI_INTERFACE)
  Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables);
#else
  Regex_list_handler regex_handler;
#endif

  if (!regex_handler.set_patterns(wlist)) {
    warn_about_bad_patterns(&regex_handler, "rocksdb_read_free_rpl_tables");
    return HA_EXIT_FAILURE;
  }

  *static_cast<const char **>(save) = my_strdup(wlist, MYF(MY_WME));
  return HA_EXIT_SUCCESS;
}

/*
  Sysvar update callback for rocksdb_read_free_rpl_tables: installs the
  already-validated pattern list and re-evaluates the read-free flag on
  every table definition known to the DDL manager.
*/
static void rocksdb_update_read_free_rpl_tables(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr,
    const void *save) {
  const auto wlist = *static_cast<const char *const *>(save);
  DBUG_ASSERT(wlist != nullptr);

  // This is bound to succeed since we've already checked for bad patterns in
  // rocksdb_validate_read_free_rpl_tables
  rdb_read_free_regex_handler.set_patterns(wlist);

  // update all table defs
  struct Rdb_read_free_rpl_updater : public Rdb_tables_scanner {
    int add_table(Rdb_tbl_def *tdef) override {
      tdef->check_and_set_read_free_rpl_table();
      return HA_EXIT_SUCCESS;
    }
  } updater;
  ddl_manager.scan_for_tables(&updater);

  if (wlist == DEFAULT_READ_FREE_RPL_TABLES) {
    // If running SET var = DEFAULT, then rocksdb_validate_read_free_rpl_tables
    // isn't called, and memory is never allocated for the value. Allocate it
    // here.
    *static_cast<const char **>(var_ptr) = my_strdup(wlist, MYF(MY_WME));
  } else {
    // Otherwise, we just reuse the value allocated from
    // rocksdb_validate_read_free_rpl_tables.
    *static_cast<const char **>(var_ptr) = wlist;
  }
}

static MYSQL_SYSVAR_STR(
    read_free_rpl_tables, rocksdb_read_free_rpl_tables,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC /*| PLUGIN_VAR_ALLOCATED*/,
    "List of tables that will use read-free replication on the slave "
    "(i.e. not lookup a row during replication)",
    rocksdb_validate_read_free_rpl_tables, rocksdb_update_read_free_rpl_tables,
    DEFAULT_READ_FREE_RPL_TABLES);

static MYSQL_SYSVAR_ENUM(
    read_free_rpl, rocksdb_read_free_rpl,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Use read-free replication on the slave (i.e. no row lookup during "
    "replication). Default is OFF, PK_SK will enable it on all tables with "
    "primary key. PK_ONLY will enable it on tables where the only key is the "
    "primary key (i.e. no secondary keys).",
    nullptr, nullptr, read_free_rpl_type::OFF, &read_free_rpl_typelib);
#endif
1074 
static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
                         "Skip using bloom filter for reads", nullptr, nullptr,
                         FALSE);

static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have",
                          nullptr, nullptr,
                          /*default*/ RDB_DEFAULT_MAX_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_ULONGLONG(
    write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
    "Maximum size of write batch in bytes. 0 means no limit.", nullptr, nullptr,
    /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated", nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode", nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation.",
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of "
    "external sort during fast index creation.",
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
    "Fast index creation creates a large tmp file on disk during index "
    "creation.  Removing this large file all at once when index creation is "
    "complete can cause trim stalls on Flash.  This variable specifies a "
    "duration to sleep (in milliseconds) between calling chsize() to truncate "
    "the file in chunks.  The chunk size is  the same as merge_buf_size.",
    nullptr, nullptr,
    /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_INT(
    manual_compaction_threads, PLUGIN_VAR_RQCMDARG,
    "How many rocksdb threads to run for manual compactions", nullptr, nullptr,
    /* default rocksdb.dboption max_subcompactions */ 0,
    /* min */ 0, /* max */ 128, 0);

// The SYSVARs below expose rocksdb::DBOptions fields directly (the sysvar
// storage aliases the field inside *rocksdb_db_options); the read-only ones
// can only be set at server startup.

static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_if_missing);

static MYSQL_SYSVAR_BOOL(
    two_write_queues,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::two_write_queues for RocksDB", nullptr, nullptr,
    rocksdb_db_options->two_write_queues);

static MYSQL_SYSVAR_BOOL(
    manual_wal_flush,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr,
    rocksdb_db_options->manual_wal_flush);

static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::write_policy for RocksDB", nullptr,
                         nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED,
                         &write_policy_typelib);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
    rocksdb_db_options->error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
    rocksdb_db_options->paranoid_checks);

static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ULONGLONG(
    sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB", nullptr,
    rocksdb_set_sst_mgr_rate_bytes_per_sec,
    /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
    /* min */ 0L, /* max */ UINT64_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
                              PLUGIN_VAR_RQCMDARG,
                              "DBOptions::delayed_write_rate", nullptr,
                              rocksdb_set_delayed_write_rate,
                              rocksdb_db_options->delayed_write_rate, 0,
                              UINT64_MAX, 0);

static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
                         PLUGIN_VAR_RQCMDARG,
                         "Maximum number of recent "
                         "deadlocks to store",
                         nullptr, rocksdb_set_max_latest_deadlocks,
                         rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
1216 
1217 static MYSQL_SYSVAR_ENUM(
1218     info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
1219     "Filter level for info logs to be written mysqld error log. "
1220     "Valid values include 'debug_level', 'info_level', 'warn_level'"
1221     "'error_level' and 'fatal_level'.",
1222     nullptr, rocksdb_set_rocksdb_info_log_level,
1223     rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);
1224 
1225 static MYSQL_THDVAR_INT(
1226     perf_context_level, PLUGIN_VAR_RQCMDARG,
1227     "Perf Context Level for rocksdb internal timer stat collection", nullptr,
1228     nullptr,
1229     /* default */ rocksdb::PerfLevel::kUninitialized,
1230     /* min */ rocksdb::PerfLevel::kUninitialized,
1231     /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);
1232 
1233 static MYSQL_SYSVAR_UINT(
1234     wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
1235     "DBOptions::wal_recovery_mode for RocksDB. Default is kAbsoluteConsistency",
1236     nullptr, nullptr,
1237     /* default */ (uint)rocksdb::WALRecoveryMode::kAbsoluteConsistency,
1238     /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
1239     /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);
1240 
1241 static MYSQL_SYSVAR_UINT(
1242     stats_level, rocksdb_stats_level, PLUGIN_VAR_RQCMDARG,
1243     "Statistics Level for RocksDB. Default is 0 (kExceptHistogramOrTimers)",
1244     nullptr, rocksdb_set_rocksdb_stats_level,
1245     /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
1246     /* min */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
1247     /* max */ (uint)rocksdb::StatsLevel::kAll, 0);
1248 
1249 static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size,
1250                           rocksdb_db_options->compaction_readahead_size,
1251                           PLUGIN_VAR_RQCMDARG,
1252                           "DBOptions::compaction_readahead_size for RocksDB",
1253                           nullptr, nullptr,
1254                           rocksdb_db_options->compaction_readahead_size,
1255                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1256 
1257 static MYSQL_SYSVAR_BOOL(
1258     new_table_reader_for_compaction_inputs,
1259     *reinterpret_cast<my_bool *>(
1260         &rocksdb_db_options->new_table_reader_for_compaction_inputs),
1261     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1262     "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
1263     nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);
1264 
1265 static MYSQL_SYSVAR_UINT(
1266     access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
1267     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1268     "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
1269     /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
1270     /* min */ (uint)rocksdb::Options::AccessHint::NONE,
1271     /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);
1272 
1273 static MYSQL_SYSVAR_BOOL(
1274     allow_concurrent_memtable_write,
1275     *reinterpret_cast<my_bool *>(
1276         &rocksdb_db_options->allow_concurrent_memtable_write),
1277     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1278     "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
1279     false);
1280 
1281 static MYSQL_SYSVAR_BOOL(
1282     enable_write_thread_adaptive_yield,
1283     *reinterpret_cast<my_bool *>(
1284         &rocksdb_db_options->enable_write_thread_adaptive_yield),
1285     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1286     "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
1287     nullptr, false);
1288 
1289 static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
1290                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1291                         "DBOptions::max_open_files for RocksDB", nullptr,
1292                         nullptr, rocksdb_db_options->max_open_files,
1293                         /* min */ -2, /* max */ INT_MAX, 0);
1294 
1295 static MYSQL_SYSVAR_UINT64_T(max_total_wal_size,
1296                           rocksdb_db_options->max_total_wal_size,
1297                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1298                           "DBOptions::max_total_wal_size for RocksDB", nullptr,
1299                           nullptr, rocksdb_db_options->max_total_wal_size,
1300                           /* min */ 0, /* max */ LONGLONG_MAX, 0);
1301 
1302 static MYSQL_SYSVAR_BOOL(
1303     use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
1304     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1305     "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
1306     rocksdb_db_options->use_fsync);
1307 
1308 static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
1309                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1310                         "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
1311                         rocksdb_db_options->wal_dir.c_str());
1312 
1313 static MYSQL_SYSVAR_STR(
1314     persistent_cache_path, rocksdb_persistent_cache_path,
1315     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1316     "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
1317     nullptr, "");
1318 
1319 static MYSQL_SYSVAR_ULONG(
1320     persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
1321     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1322     "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
1323     "for RocksDB",
1324     nullptr, nullptr, rocksdb_persistent_cache_size_mb,
1325     /* min */ 0L, /* max */ ULONG_MAX, 0);
1326 
1327 static MYSQL_SYSVAR_UINT64_T(
1328     delete_obsolete_files_period_micros,
1329     rocksdb_db_options->delete_obsolete_files_period_micros,
1330     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1331     "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
1332     nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
1333   /* min */ 0, /* max */ LONGLONG_MAX, 0);
1334 
1335 static MYSQL_SYSVAR_INT(max_background_jobs,
1336                         rocksdb_db_options->max_background_jobs,
1337                         PLUGIN_VAR_RQCMDARG,
1338                         "DBOptions::max_background_jobs for RocksDB", nullptr,
1339                         rocksdb_set_max_background_jobs,
1340                         rocksdb_db_options->max_background_jobs,
1341                         /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);
1342 
1343 static MYSQL_SYSVAR_UINT(max_subcompactions,
1344                          rocksdb_db_options->max_subcompactions,
1345                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1346                          "DBOptions::max_subcompactions for RocksDB", nullptr,
1347                          nullptr, rocksdb_db_options->max_subcompactions,
1348                          /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);
1349 
1350 static MYSQL_SYSVAR_SIZE_T(max_log_file_size,
1351                           rocksdb_db_options->max_log_file_size,
1352                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1353                           "DBOptions::max_log_file_size for RocksDB", nullptr,
1354                           nullptr, rocksdb_db_options->max_log_file_size,
1355                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1356 
1357 static MYSQL_SYSVAR_SIZE_T(log_file_time_to_roll,
1358                           rocksdb_db_options->log_file_time_to_roll,
1359                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1360                           "DBOptions::log_file_time_to_roll for RocksDB",
1361                           nullptr, nullptr,
1362                           rocksdb_db_options->log_file_time_to_roll,
1363                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1364 
/*
  Sysvars that map 1:1 onto rocksdb::DBOptions log/WAL/manifest settings.
  All of these are PLUGIN_VAR_READONLY, i.e. settable only at server startup;
  the default shown to the user is whatever value the already-constructed
  rocksdb_db_options object holds.  The two nullptr arguments are the
  check/update callbacks (none needed for read-only variables).
*/
static MYSQL_SYSVAR_SIZE_T(keep_log_file_num,
                          rocksdb_db_options->keep_log_file_num,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::keep_log_file_num for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->keep_log_file_num,
                          /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_manifest_file_size,
                          rocksdb_db_options->max_manifest_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_manifest_file_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->max_manifest_file_size,
                          /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options->table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options->table_cache_numshardbits,
                        // LRUCache limits this to 19 bits, anything greater
                        // fails to create a cache and returns a nullptr
                        /* min */ 0, /* max */ 19, 0);

// NOTE(review): WAL_ttl_seconds/WAL_size_limit_MB are uint64_t on the RocksDB
// side but capped here at LONGLONG_MAX rather than ULONGLONG_MAX — presumably
// to stay within the signed range used elsewhere; confirm before widening.
static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->WAL_ttl_seconds,
                          /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_size_limit_mb,
                          rocksdb_db_options->WAL_size_limit_MB,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->WAL_size_limit_MB,
                          /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(manifest_preallocation_size,
                          rocksdb_db_options->manifest_preallocation_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::manifest_preallocation_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->manifest_preallocation_size,
                          /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1410 
/*
  Boolean DBOptions flags.  rocksdb::DBOptions stores these as C++ 'bool'
  while MYSQL_SYSVAR_BOOL expects an lvalue of type my_bool, hence the
  reinterpret_cast of the member's address.  This assumes sizeof(bool) ==
  sizeof(my_bool) with compatible representation — TODO(review): confirm this
  holds on all supported platforms/compilers.
*/
static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_io_for_flush_and_compaction,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_io_for_flush_and_compaction),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_io_for_flush_and_compaction);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
    rocksdb_db_options->is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options->stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB",
                         nullptr, nullptr,
                         rocksdb_db_options->stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
    rocksdb_db_options->advise_random_on_open);

static MYSQL_SYSVAR_SIZE_T(db_write_buffer_size,
                          rocksdb_db_options->db_write_buffer_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::db_write_buffer_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->db_write_buffer_size,
                          /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_adaptive_mutex);
1475 
/*
  bytes_per_sync / wal_bytes_per_sync are dynamic (no PLUGIN_VAR_READONLY):
  changes at runtime are pushed into the running DB via the
  rocksdb_set_*_per_sync update callbacks.
*/
static MYSQL_SYSVAR_UINT64_T(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::bytes_per_sync for RocksDB", nullptr,
                          rocksdb_set_bytes_per_sync,
                          rocksdb_db_options->bytes_per_sync,
                          /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_bytes_per_sync,
                          rocksdb_db_options->wal_bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
                          rocksdb_set_wal_bytes_per_sync,
                          rocksdb_db_options->wal_bytes_per_sync,
                          /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

// Note: unlike its siblings, the default here is a hard-coded 'true' rather
// than the value stored in rocksdb_db_options — thread tracking is enabled
// by default regardless of how the options object was initialized.
static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true);

// Dynamic: validated and applied by rocksdb_validate_set_block_cache_size
// (the check callback does both jobs, so the update callback is nullptr).
static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG,
                             "block_cache size for RocksDB",
                             rocksdb_validate_set_block_cache_size, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);

static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "Simulated cache size for RocksDB", nullptr,
                             nullptr,
                             /* default */ 0,
                             /* min */ 0,
                             /* max */ LLONG_MAX,
                             /* Block size */ 0);

static MYSQL_SYSVAR_BOOL(
    use_clock_cache, rocksdb_use_clock_cache,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Use ClockCache instead of default LRUCache for RocksDB", nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(cache_dump, rocksdb_cache_dump,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Include RocksDB block cache content in core dump.",
                         nullptr, nullptr, true);

static MYSQL_SYSVAR_DOUBLE(cache_high_pri_pool_ratio,
                           rocksdb_cache_high_pri_pool_ratio,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "Specify the size of block cache high-pri pool",
                           nullptr, nullptr, /* default */ 0.0, /* min */ 0.0,
                           /* max */ 1.0, 0);
1532 
/*
  Sysvars mapping onto rocksdb::BlockBasedTableOptions (rocksdb_tbl_options).
  All read-only at runtime; they shape the SST/block format and block cache
  interaction and are applied when the DB is opened.
*/
static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
    nullptr, nullptr, true);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_with_high_priority,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks_with_high_priority),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "cache_index_and_filter_blocks_with_high_priority for RocksDB", nullptr,
    nullptr, true);

// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index block's handle checked
// out (=won't call ShardedLRUCache::Release); the parsed-out objects are
// never flushed out of the LRU cache, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);

static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB",
                         nullptr, nullptr,
                         (ulong)rocksdb_tbl_options->index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->no_block_cache);

// Minimum of 1: a zero block size would be meaningless.
static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "BlockBasedTableOptions::block_size for RocksDB",
                          nullptr, nullptr, rocksdb_tbl_options->block_size,
                          /* min */ 1L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options->block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options->block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->whole_key_filtering);
1613 
/* Column-family option strings, parsed at startup (read-only). */
static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");

// Dynamic: validated by rocksdb_validate_update_cf_options and applied by
// rocksdb_set_update_cf_options.  Default is nullptr (not the empty string),
// and PLUGIN_VAR_MEMALLOC makes the server own/free the string value.
static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC
                        /* psergey-merge: need this? :  PLUGIN_VAR_ALLOCATED*/,
                        "Option updates per column family for RocksDB",
                        rocksdb_validate_update_cf_options,
                        rocksdb_set_update_cf_options, nullptr);

static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
                         rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
                         "Sync on transaction commit. Similar to "
                         "innodb_flush_log_at_trx_commit. 1: sync on commit, "
                         "0,2: not sync on commit",
                         rocksdb_validate_flush_log_at_trx_commit, nullptr,
                         /* default */ FLUSH_LOG_SYNC,
                         /* min */ FLUSH_LOG_NEVER,
                         /* max */ FLUSH_LOG_BACKGROUND, 0);

/* Per-session (THDVAR) write behaviour knobs; defaults come straight from a
   default-constructed rocksdb::WriteOptions. */
static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB", nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests", nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency",
    nullptr, nullptr, FALSE);

/* Test/override knobs: 0 means "not set, use the real estimate". */
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used.",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable",
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
                         rocksdb_force_compute_memtable_stats,
                         PLUGIN_VAR_RQCMDARG,
                         "Force to always compute memtable stats", nullptr,
                         nullptr, TRUE);

// Default: cache memtable estimates for 60 seconds (value is in usecs).
static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime,
                         rocksdb_force_compute_memtable_stats_cachetime,
                         PLUGIN_VAR_RQCMDARG,
                         "Time in usecs to cache memtable estimates", nullptr,
                         nullptr, /* default */ 60 * 1000 * 1000,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value", nullptr,
    nullptr, TRUE);
1694 
/*
  "Action" variables: assigning to them triggers an operation via the check
  callback, while the *_stub update callback intentionally does nothing (the
  work is already done during the check phase).
*/
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family",
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(delete_cf, rocksdb_delete_cf_name, PLUGIN_VAR_RQCMDARG,
                        "Delete column family", rocksdb_delete_column_family,
                        rocksdb_delete_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(remove_mariabackup_checkpoint,
                         rocksdb_signal_remove_mariabackup_checkpoint,
                         PLUGIN_VAR_RQCMDARG, "Remove mariabackup checkpoint",
                         nullptr, rocksdb_remove_mariabackup_checkpoint, FALSE);

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread", nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations", nullptr,
                         rocksdb_set_pause_background_work, FALSE);

/* TTL feature switches and debug-only clock-skew knobs (+/- one hour). */
static MYSQL_SYSVAR_BOOL(
    enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
    "Enable expired TTL records to be dropped during compaction.", nullptr,
    nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(
    enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
    PLUGIN_VAR_RQCMDARG,
    "For tables with TTL, expired records are skipped/filtered out during "
    "processing and in query results. Disabling this will allow these records "
    "to be seen, but as a result rows may disappear in the middle of "
    "transactions as they are dropped during compaction. Use with caution.",
    nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_INT(
    debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only.  Overrides the TTL of records to "
    "now() + debug_ttl_rec_ts.  The value can be +/- to simulate "
    "a record inserted in the past vs a record inserted in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only.  Sets the snapshot during compaction to "
    "now() + debug_set_ttl_snapshot_ts.  The value can be +/- to simulate "
    "a snapshot in the past vs a snapshot created in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only.  Overrides the TTL read filtering time to "
    "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
    "is not set. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);
1763 
static MYSQL_SYSVAR_BOOL(
    debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. If true, compaction filtering will not occur "
    "on PK TTL data. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_UINT(
    max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG,
    "Maximum number of pending + ongoing number of manual compactions.",
    nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    rollback_on_timeout, rocksdb_rollback_on_timeout, PLUGIN_VAR_OPCMDARG,
    "Whether to roll back the complete transaction or a single statement on "
    "lock wait timeout (a single statement by default)",
    NULL, NULL, FALSE);

static MYSQL_SYSVAR_UINT(
    debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sleeping specified seconds "
    "for simulating long running compactions.",
    nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0);

// Action variable: setting it triggers rocksdb_set_reset_stats.
static MYSQL_SYSVAR_BOOL(
    reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
    "Reset the RocksDB internal statistics without restarting the DB.", nullptr,
    rocksdb_set_reset_stats, FALSE);

// 0 (the default) disables the I/O watchdog.
static MYSQL_SYSVAR_UINT(io_write_timeout, rocksdb_io_write_timeout_secs,
                         PLUGIN_VAR_RQCMDARG,
                         "Timeout for experimental I/O watchdog.", nullptr,
                         rocksdb_set_io_write_timeout, /* default */ 0,
                         /* min */ 0L,
                         /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks", nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Enable ignoring unknown options passed to RocksDB",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "List of tables (using regex) that are excluded "
                        "from the case sensitive collation enforcement",
                        nullptr, rocksdb_set_collation_exception_list, "");

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush",
                         nullptr, nullptr, rocksdb_collect_sst_properties);

// Action variable: the check callback performs the flush; the stub update
// callback is a no-op.
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful",
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);
1832 
// Action variable, like force_flush_memtable_now, but additionally compacts
// all L0 files after the flush.
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
    "Acts similar to force_flush_memtable_now, but also compacts all L0 files.",
    rocksdb_force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_stub, FALSE);

static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed.",
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

/* Sequential-delete compaction triggers — all applied dynamically through the
   shared rocksdb_set_compaction_options update callback. */
static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window",
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes",
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

// min of -1: presumably a sentinel meaning "no file-size requirement" —
// TODO(review): confirm against the compaction-options parsing code.
static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes", nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log", nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

/* Per-session row checksum debugging knobs. */
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed",
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG,
                         "Skipping holding any lock on row access. "
                         "Not effective on slave.",
                         nullptr, nullptr, false);
1903 
1904 static MYSQL_SYSVAR_UINT(
1905     validate_tables, rocksdb_validate_tables,
1906     PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1907     "Verify all .frm files match all RocksDB tables (0 means no verification, "
1908     "1 means verify and fail on error, and 2 means verify but continue",
1909     nullptr, nullptr, 1 /* default value */, 0 /* min value */,
1910     2 /* max value */, 0);
1911 
// Repair-mode escape hatch: 1 lets the server start despite data-dictionary
// errors.  Intentionally limited to 0/1.
static MYSQL_SYSVAR_UINT(
    ignore_datadic_errors, rocksdb_ignore_datadic_errors,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Ignore MyRocks' data directory errors. "
    "(CAUTION: Use only to start the server and perform repairs. Do NOT use "
    "for regular operation)",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    1 /* max value */, 0);

// Default data directory is relative to the server datadir; the '#' prefix
// keeps it from being mistaken for a database directory.
static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./#rocksdb");

// Read-only informational variable; compression_types_val is both the
// backing store and the default (computed elsewhere at startup).
static MYSQL_SYSVAR_STR(supported_compression_types,
  compression_types_val,
  PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
  "Compression algorithms supported by RocksDB",
  nullptr, nullptr,
  compression_types_val);

// The strange line-wrapping below is clang-format splitting a single help
// string built from STRINGIFY_ARG() token concatenation — the pieces are
// adjacent string literals joined by the preprocessor/compiler.
static MYSQL_SYSVAR_UINT(
    table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
    PLUGIN_VAR_RQCMDARG,
    "Percentage of entries to sample when collecting statistics about table "
    "properties. Specify either 0 to sample everything or percentage "
    "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG(
        RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
                                      "By default " STRINGIFY_ARG(
                                          RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% "
                                                                            "of"
                                                                            " e"
                                                                            "nt"
                                                                            "ri"
                                                                            "es"
                                                                            " a"
                                                                            "re"
                                                                            " "
                                                                            "sa"
                                                                            "mp"
                                                                            "le"
                                                                            "d"
                                                                            ".",
    nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
    RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
    /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG,
    "The number of indexes per second to recalculate statistics for. 0 to "
    "disable background recalculation.",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    UINT_MAX /* max value */, 0);

static MYSQL_SYSVAR_BOOL(
    large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
    "Support large index prefix length of 3072 bytes. If off, the maximum "
    "index prefix length is 767.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(
    allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption,
    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
    "Allow server still to start successfully even if RocksDB corruption is "
    "detected.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation,
                         rocksdb_error_on_suboptimal_collation,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Raise an error instead of warning if a sub-optimal "
                         "collation is used",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(
    enable_insert_with_update_caching,
    rocksdb_enable_insert_with_update_caching, PLUGIN_VAR_OPCMDARG,
    "Whether to enable optimization where we cache the read from a failed "
    "insertion attempt in INSERT ON DUPLICATE KEY UPDATE",
    nullptr, nullptr, TRUE);

// Rough per-row disk footprint assumed by cost estimation when no better
// statistic is available.
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
1994 
1995 static struct st_mysql_sys_var *rocksdb_system_variables[] = {
1996     MYSQL_SYSVAR(lock_wait_timeout),
1997     MYSQL_SYSVAR(deadlock_detect),
1998     MYSQL_SYSVAR(deadlock_detect_depth),
1999     MYSQL_SYSVAR(commit_time_batch_for_recovery),
2000     MYSQL_SYSVAR(max_row_locks),
2001     MYSQL_SYSVAR(write_batch_max_bytes),
2002     MYSQL_SYSVAR(lock_scanned_rows),
2003     MYSQL_SYSVAR(bulk_load),
2004     MYSQL_SYSVAR(bulk_load_allow_sk),
2005     MYSQL_SYSVAR(bulk_load_allow_unsorted),
2006     MYSQL_SYSVAR(skip_unique_check_tables),
2007     MYSQL_SYSVAR(trace_sst_api),
2008     MYSQL_SYSVAR(commit_in_the_middle),
2009     MYSQL_SYSVAR(blind_delete_primary_key),
2010 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
2011     MYSQL_SYSVAR(read_free_rpl_tables),
2012     MYSQL_SYSVAR(read_free_rpl),
2013 #endif
2014     MYSQL_SYSVAR(bulk_load_size),
2015     MYSQL_SYSVAR(merge_buf_size),
2016     MYSQL_SYSVAR(enable_bulk_load_api),
2017     MYSQL_SYSVAR(tmpdir),
2018     MYSQL_SYSVAR(merge_combine_read_size),
2019     MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
2020     MYSQL_SYSVAR(skip_bloom_filter_on_read),
2021 
2022     MYSQL_SYSVAR(create_if_missing),
2023     MYSQL_SYSVAR(two_write_queues),
2024     MYSQL_SYSVAR(manual_wal_flush),
2025     MYSQL_SYSVAR(write_policy),
2026     MYSQL_SYSVAR(create_missing_column_families),
2027     MYSQL_SYSVAR(error_if_exists),
2028     MYSQL_SYSVAR(paranoid_checks),
2029     MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
2030     MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
2031     MYSQL_SYSVAR(delayed_write_rate),
2032     MYSQL_SYSVAR(max_latest_deadlocks),
2033     MYSQL_SYSVAR(info_log_level),
2034     MYSQL_SYSVAR(max_open_files),
2035     MYSQL_SYSVAR(max_total_wal_size),
2036     MYSQL_SYSVAR(use_fsync),
2037     MYSQL_SYSVAR(wal_dir),
2038     MYSQL_SYSVAR(persistent_cache_path),
2039     MYSQL_SYSVAR(persistent_cache_size_mb),
2040     MYSQL_SYSVAR(delete_obsolete_files_period_micros),
2041     MYSQL_SYSVAR(max_background_jobs),
2042     MYSQL_SYSVAR(max_log_file_size),
2043     MYSQL_SYSVAR(max_subcompactions),
2044     MYSQL_SYSVAR(log_file_time_to_roll),
2045     MYSQL_SYSVAR(keep_log_file_num),
2046     MYSQL_SYSVAR(max_manifest_file_size),
2047     MYSQL_SYSVAR(table_cache_numshardbits),
2048     MYSQL_SYSVAR(wal_ttl_seconds),
2049     MYSQL_SYSVAR(wal_size_limit_mb),
2050     MYSQL_SYSVAR(manifest_preallocation_size),
2051     MYSQL_SYSVAR(use_direct_reads),
2052     MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
2053     MYSQL_SYSVAR(allow_mmap_reads),
2054     MYSQL_SYSVAR(allow_mmap_writes),
2055     MYSQL_SYSVAR(is_fd_close_on_exec),
2056     MYSQL_SYSVAR(stats_dump_period_sec),
2057     MYSQL_SYSVAR(advise_random_on_open),
2058     MYSQL_SYSVAR(db_write_buffer_size),
2059     MYSQL_SYSVAR(use_adaptive_mutex),
2060     MYSQL_SYSVAR(bytes_per_sync),
2061     MYSQL_SYSVAR(wal_bytes_per_sync),
2062     MYSQL_SYSVAR(enable_thread_tracking),
2063     MYSQL_SYSVAR(perf_context_level),
2064     MYSQL_SYSVAR(wal_recovery_mode),
2065     MYSQL_SYSVAR(stats_level),
2066     MYSQL_SYSVAR(access_hint_on_compaction_start),
2067     MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
2068     MYSQL_SYSVAR(compaction_readahead_size),
2069     MYSQL_SYSVAR(allow_concurrent_memtable_write),
2070     MYSQL_SYSVAR(enable_write_thread_adaptive_yield),
2071 
2072     MYSQL_SYSVAR(block_cache_size),
2073     MYSQL_SYSVAR(sim_cache_size),
2074     MYSQL_SYSVAR(use_clock_cache),
2075     MYSQL_SYSVAR(cache_high_pri_pool_ratio),
2076     MYSQL_SYSVAR(cache_dump),
2077     MYSQL_SYSVAR(cache_index_and_filter_blocks),
2078     MYSQL_SYSVAR(cache_index_and_filter_with_high_priority),
2079     MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
2080     MYSQL_SYSVAR(index_type),
2081     MYSQL_SYSVAR(hash_index_allow_collision),
2082     MYSQL_SYSVAR(no_block_cache),
2083     MYSQL_SYSVAR(block_size),
2084     MYSQL_SYSVAR(block_size_deviation),
2085     MYSQL_SYSVAR(block_restart_interval),
2086     MYSQL_SYSVAR(whole_key_filtering),
2087 
2088     MYSQL_SYSVAR(default_cf_options),
2089     MYSQL_SYSVAR(override_cf_options),
2090     MYSQL_SYSVAR(update_cf_options),
2091 
2092     MYSQL_SYSVAR(flush_log_at_trx_commit),
2093     MYSQL_SYSVAR(write_disable_wal),
2094     MYSQL_SYSVAR(write_ignore_missing_column_families),
2095 
2096     MYSQL_SYSVAR(skip_fill_cache),
2097     MYSQL_SYSVAR(unsafe_for_binlog),
2098 
2099     MYSQL_SYSVAR(records_in_range),
2100     MYSQL_SYSVAR(force_index_records_in_range),
2101     MYSQL_SYSVAR(debug_optimizer_n_rows),
2102     MYSQL_SYSVAR(force_compute_memtable_stats),
2103     MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
2104     MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),
2105 
2106     MYSQL_SYSVAR(compact_cf),
2107     MYSQL_SYSVAR(delete_cf),
2108     MYSQL_SYSVAR(signal_drop_index_thread),
2109     MYSQL_SYSVAR(pause_background_work),
2110     MYSQL_SYSVAR(enable_2pc),
2111     MYSQL_SYSVAR(ignore_unknown_options),
2112     MYSQL_SYSVAR(strict_collation_check),
2113     MYSQL_SYSVAR(strict_collation_exceptions),
2114     MYSQL_SYSVAR(collect_sst_properties),
2115     MYSQL_SYSVAR(force_flush_memtable_now),
2116     MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
2117     MYSQL_SYSVAR(enable_ttl),
2118     MYSQL_SYSVAR(enable_ttl_read_filtering),
2119     MYSQL_SYSVAR(debug_ttl_rec_ts),
2120     MYSQL_SYSVAR(debug_ttl_snapshot_ts),
2121     MYSQL_SYSVAR(debug_ttl_read_filter_ts),
2122     MYSQL_SYSVAR(debug_ttl_ignore_pk),
2123     MYSQL_SYSVAR(reset_stats),
2124     MYSQL_SYSVAR(io_write_timeout),
2125     MYSQL_SYSVAR(seconds_between_stat_computes),
2126 
2127     MYSQL_SYSVAR(compaction_sequential_deletes),
2128     MYSQL_SYSVAR(compaction_sequential_deletes_window),
2129     MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
2130     MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
2131     MYSQL_SYSVAR(print_snapshot_conflict_queries),
2132 
2133     MYSQL_SYSVAR(datadir),
2134   MYSQL_SYSVAR(supported_compression_types),
2135     MYSQL_SYSVAR(create_checkpoint),
2136     MYSQL_SYSVAR(remove_mariabackup_checkpoint),
2137     MYSQL_SYSVAR(checksums_pct),
2138     MYSQL_SYSVAR(store_row_debug_checksums),
2139     MYSQL_SYSVAR(verify_row_debug_checksums),
2140     MYSQL_SYSVAR(master_skip_tx_api),
2141 
2142     MYSQL_SYSVAR(validate_tables),
2143     MYSQL_SYSVAR(table_stats_sampling_pct),
2144 
2145     MYSQL_SYSVAR(large_prefix),
2146     MYSQL_SYSVAR(allow_to_start_after_corruption),
2147     MYSQL_SYSVAR(git_hash),
2148     MYSQL_SYSVAR(error_on_suboptimal_collation),
2149     MYSQL_SYSVAR(stats_recalc_rate),
2150     MYSQL_SYSVAR(debug_manual_compaction_delay),
2151     MYSQL_SYSVAR(max_manual_compactions),
2152     MYSQL_SYSVAR(manual_compaction_threads),
2153     MYSQL_SYSVAR(rollback_on_timeout),
2154 
2155     MYSQL_SYSVAR(enable_insert_with_update_caching),
2156 
2157     MYSQL_SYSVAR(ignore_datadic_errors),
2158     nullptr};
2159 
rdb_get_rocksdb_write_options(my_core::THD * const thd)2160 static rocksdb::WriteOptions rdb_get_rocksdb_write_options(
2161     my_core::THD *const thd) {
2162   rocksdb::WriteOptions opt;
2163 
2164   opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
2165   opt.disableWAL = THDVAR(thd, write_disable_wal);
2166   opt.ignore_missing_column_families =
2167       THDVAR(thd, write_ignore_missing_column_families);
2168 
2169   return opt;
2170 }
2171 
rocksdb_compact_column_family(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,struct st_mysql_value * const value)2172 static int rocksdb_compact_column_family(THD *const thd,
2173                                          struct st_mysql_sys_var *const var,
2174                                          void *const var_ptr,
2175                                          struct st_mysql_value *const value) {
2176   char buff[STRING_BUFFER_USUAL_SIZE];
2177   int len = sizeof(buff);
2178 
2179   DBUG_ASSERT(value != nullptr);
2180 
2181   if (const char *const cf = value->val_str(value, buff, &len)) {
2182     auto cfh = cf_manager.get_cf(cf);
2183     if (cfh != nullptr && rdb != nullptr) {
2184       int mc_id = rdb_mc_thread.request_manual_compaction(
2185           cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads));
2186       if (mc_id == -1) {
2187         my_error(ER_INTERNAL_ERROR, MYF(0),
2188                  "Can't schedule more manual compactions. "
2189                  "Increase rocksdb_max_manual_compactions or stop issuing "
2190                  "more manual compactions.");
2191         return HA_EXIT_FAILURE;
2192       } else if (mc_id < 0) {
2193         return HA_EXIT_FAILURE;
2194       }
2195       // NO_LINT_DEBUG
2196       sql_print_information("RocksDB: Manual compaction of column family: %s\n",
2197                             cf);
2198       // Checking thd state every short cycle (100ms). This is for allowing to
2199       // exiting this function without waiting for CompactRange to finish.
2200       do {
2201         my_sleep(100000);
2202       } while (!thd->killed &&
2203                !rdb_mc_thread.is_manual_compaction_finished(mc_id));
2204 
2205       if (thd->killed) {
2206         // This cancels if requested compaction state is INITED.
2207         // TODO(yoshinorim): Cancel running compaction as well once
2208         // it is supported in RocksDB.
2209         rdb_mc_thread.clear_manual_compaction_request(mc_id, true);
2210       }
2211     }
2212   }
2213   return HA_EXIT_SUCCESS;
2214 }
2215 
2216 ///////////////////////////////////////////////////////////////////////////////////////////
2217 
/*
  Drop index thread's control
*/

static Rdb_drop_index_thread rdb_drop_idx_thread;

// Update handler for @@rocksdb_signal_drop_index_thread: signal the
// background drop-index thread when the variable is set to a true value.
static void rocksdb_drop_index_wakeup_thread(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  if (*static_cast<const bool *>(save)) {
    rdb_drop_idx_thread.signal();
  }
}
2232 
rocksdb_perf_context_level(THD * const thd)2233 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
2234   DBUG_ASSERT(thd != nullptr);
2235 
2236   const int session_perf_context_level = THDVAR(thd, perf_context_level);
2237   if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2238     return session_perf_context_level;
2239   }
2240 
2241   /*
2242     Fallback to global thdvar, if session specific one was not set to a valid
2243     value.
2244   */
2245 
2246   const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
2247   if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2248     return global_perf_context_level;
2249   }
2250 
2251   return rocksdb::PerfLevel::kDisable;
2252 }
2253 
/*
  Very short (functor-like) interface to be passed to
  Rdb_transaction::walk_tx_list()
*/

// NOTE(review): "interface" is presumably a macro expanding to a struct-like
// declaration (it is not a C++ keyword) — confirm against the MySQL headers.
interface Rdb_tx_list_walker {
  virtual ~Rdb_tx_list_walker() {}
  // Invoked once per live transaction by walk_tx_list().
  virtual void process_tran(const Rdb_transaction *const) = 0;
};
2263 
/*
  This is a helper class that is passed to RocksDB to get notifications when
  a snapshot gets created.
*/

class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
  // Back-pointer to the owning transaction; nullptr after detach().
  Rdb_transaction *m_owning_tx;

  // RocksDB callback fired when a snapshot is created (defined elsewhere).
  void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;

 public:
  // Non-copyable: the notifier is tied to a single owning transaction.
  Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
  Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;

  explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
      : m_owning_tx(owning_tx) {}

  // If the owning Rdb_transaction gets destructed we need to not reference
  // it anymore.
  void detach() { m_owning_tx = nullptr; }
};
2285 
2286 
2287 #ifdef MARIAROCKS_NOT_YET
2288 // ER_LOCK_WAIT_TIMEOUT error also has a reason in facebook/mysql-5.6
2289 #endif
timeout_message(const char * command,const char * name1,const char * name2)2290 String timeout_message(const char *command, const char *name1,
2291                        const char *name2)
2292 {
2293     String msg;
2294     msg.append("Timeout on ");
2295     msg.append(command);
2296     msg.append(": ");
2297     msg.append(name1);
2298     if (name2 && name2[0])
2299     {
2300       msg.append(".");
2301       msg.append(name2);
2302     }
2303     return msg;
2304 }
2305 
2306 
2307 /* This is the base class for transactions when interacting with rocksdb.
2308  */
2309 class Rdb_transaction {
2310  protected:
2311   ulonglong m_write_count = 0;
2312   ulonglong m_insert_count = 0;
2313   ulonglong m_update_count = 0;
2314   ulonglong m_delete_count = 0;
2315   ulonglong m_lock_count = 0;
2316   std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;
2317 
2318   bool m_is_delayed_snapshot = false;
2319   bool m_is_two_phase = false;
2320 
2321   std::unordered_set<Rdb_tbl_def*> modified_tables;
2322 
2323  private:
2324   /*
2325     Number of write operations this transaction had when we took the last
2326     savepoint (the idea is not to take another savepoint if we haven't made
2327     any changes)
2328   */
2329   ulonglong m_writes_at_last_savepoint;
2330 
2331  protected:
2332 
2333 protected:
2334   THD *m_thd = nullptr;
2335 
2336   static std::multiset<Rdb_transaction *> s_tx_list;
2337   static mysql_mutex_t s_tx_list_mutex;
2338 
2339   Rdb_io_perf *m_tbl_io_perf;
2340 
2341   bool m_tx_read_only = false;
2342 
2343   int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */
2344 
2345   /* Maximum number of locks the transaction can have */
2346   ulonglong m_max_row_locks;
2347 
2348   bool m_is_tx_failed = false;
2349   bool m_rollback_only = false;
2350 
2351   std::shared_ptr<Rdb_snapshot_notifier> m_notifier;
2352 
2353   // This should be used only when updating binlog information.
2354   virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
2355   virtual bool commit_no_binlog() = 0;
2356   virtual rocksdb::Iterator *get_iterator(
2357       const rocksdb::ReadOptions &options,
2358       rocksdb::ColumnFamilyHandle *column_family) = 0;
2359 
2360 protected:
2361   /*
2362     The following two are helper functions to be overloaded by child classes.
2363     They should provide RocksDB's savepoint semantics.
2364   */
2365   virtual void do_set_savepoint() = 0;
2366   virtual void do_rollback_to_savepoint() = 0;
2367 
2368   /*
2369     @detail
2370       This function takes in the WriteBatch of the transaction to add
2371       all the AUTO_INCREMENT merges. It does so by iterating through
2372       m_auto_incr_map and then constructing key/value pairs to call merge upon.
2373 
2374     @param wb
2375    */
merge_auto_incr_map(rocksdb::WriteBatchBase * const wb)2376   rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) {
2377     DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", return rocksdb::Status::OK(););
2378 
2379     // Iterate through the merge map merging all keys into data dictionary.
2380     rocksdb::Status s;
2381     for (auto &it : m_auto_incr_map) {
2382       s = dict_manager.put_auto_incr_val(wb, it.first, it.second);
2383       if (!s.ok()) {
2384         return s;
2385       }
2386     }
2387     m_auto_incr_map.clear();
2388     return s;
2389   }
2390 
 public:
  // Per-transaction read options (snapshot, fill-cache, etc.).
  rocksdb::ReadOptions m_read_opts;
  // Binlog coordinates captured at commit (see commit()).
  const char *m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
#ifdef MARIAROCKS_NOT_YET
  // TODO: MariaDB probably doesn't need these at all:
  const char *m_mysql_gtid;
  const char *m_mysql_max_gtid;
#endif
  // Extra error detail filled by set_status_error() (timeout/conflict text).
  String m_detailed_error;
  // Wall-clock time (from rdb env) when the current snapshot was taken.
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;
#ifdef MARIAROCKS_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> m_explicit_snapshot;
#endif

  /*
    Tracks the number of tables in use through external_lock.
    This should not be reset during start_tx().
  */
  int64_t m_n_mysql_tables_in_use = 0;

  /*
    MariaDB's group commit:
  */
  bool commit_ordered_done;
  bool commit_ordered_res;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;
2424 
  // Initialize the mutex protecting the global transaction list; called once
  // during plugin startup.
  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  // Destroy the transaction-list mutex; all transactions must already have
  // been removed from the list.
  static void term_mutex() {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }

  // Invoke walker->process_tran() on every live transaction while holding
  // s_tx_list_mutex.
  static void walk_tx_list(Rdb_tx_list_walker *walker) {
    DBUG_ASSERT(walker != nullptr);

    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);

    for (auto it : s_tx_list) {
      walker->process_tran(it);
    }

    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
2445 
  /*
    Translate a non-OK RocksDB status into a MySQL handler error code,
    updating per-table and global counters and m_detailed_error on the way.

    @param thd            connection that hit the error
    @param s              RocksDB status; must not be OK
    @param kd             key definition involved in the failed operation
    @param tbl_def        table definition; must not be nullptr
    @param table_handler  per-table stats object whose counters are bumped
    @return               HA_ERR_* code for the SQL layer
  */
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
                       Rdb_table_handler *const table_handler) {
    DBUG_ASSERT(!s.ok());
    DBUG_ASSERT(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because it's
        an IGNORE command!) but then will fail an assert, because "error code
        was returned, but no error happened".  Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
        */
      my_core::thd_mark_transaction_to_rollback(
          thd, static_cast<bool>(rocksdb_rollback_on_timeout));
      // Record which index timed out for SHOW-style diagnostics.
      m_detailed_error.copy(timeout_message(
          "index", tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
      table_handler->m_lock_wait_timeout_counter.inc();
      rocksdb_row_lock_wait_timeouts++;

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      // Deadlocks always roll back the whole transaction.
      my_core::thd_mark_transaction_to_rollback(thd,
                                                true /* whole transaction */);
      m_detailed_error = String();
      table_handler->m_deadlock_counter.inc();
      rocksdb_row_lock_deadlocks++;
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      // Snapshot conflict; optionally log the offending user/query.
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd, user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning(
            "Got snapshot conflict errors: User: %s "
            "Query: %s",
            user_host_buff, thd->query());
      }
      m_detailed_error = String(" (snapshot conflict)", system_charset_info);
      table_handler->m_deadlock_counter.inc();
      return HA_ERR_ROCKSDB_STATUS_BUSY;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }

    // All other statuses: generic RocksDB-to-MySQL error mapping.
    return ha_rocksdb::rdb_error_to_mysql(s);
  }
2500 
  THD *get_thd() const { return m_thd; }

  /* Used for tracking io_perf counters */
  void io_perf_start(Rdb_io_perf *const io_perf) {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that hit
      multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    // Only the first table to start tracking becomes the perf owner.
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd))) {
      m_tbl_io_perf = io_perf;
    }
  }

  // Record accumulated perf-context stats (if any) and stop tracking.
  void io_perf_end_and_record(void) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
      m_tbl_io_perf = nullptr;
    }
  }

  // Same as above, but only if io_perf is the currently-tracked owner.
  void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
    if (m_tbl_io_perf == io_perf) {
      io_perf_end_and_record();
    }
  }

  // Attribute written bytes to the currently-tracked table's perf counters.
  void update_bytes_written(ulonglong bytes_written) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd),
                                          bytes_written);
    }
  }

  // Cache the session's lock-wait timeout and row-lock limit, and push the
  // timeout down to the underlying RocksDB transaction.
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }

  virtual void set_lock_timeout(int timeout_sec_arg) = 0;
2553 
  // Accessors for the per-transaction modification/lock counters.
  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_insert_count() const { return m_insert_count; }

  ulonglong get_update_count() const { return m_update_count; }

  ulonglong get_delete_count() const { return m_delete_count; }

  void incr_insert_count() { ++m_insert_count; }

  void incr_update_count() { ++m_update_count; }

  void incr_delete_count() { ++m_delete_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;
2578 
commit_or_rollback()2579   bool commit_or_rollback() {
2580     bool res;
2581     if (m_is_tx_failed) {
2582       rollback();
2583       res = false;
2584     } else {
2585       res = commit();
2586     }
2587     return res;
2588   }
2589 
  // Commit the transaction. Empty transactions are turned into rollbacks;
  // rollback-only transactions raise ER_ROLLBACK_ONLY and roll back.
  // Returns the commit_no_binlog() result (true means failure path for
  // rollback-only; see below).
  bool commit() {
    // Nothing was written: a rollback is cheaper and equivalent.
    if (get_write_count() == 0) {
      rollback();
      return false;
    } else if (m_rollback_only) {
      /*
        Transactions marked as rollback_only are expected to be rolled back at
        prepare(). But there are some exceptions like below that prepare() is
        never called and commit() is called instead.
         1. Binlog is disabled
         2. No modification exists in binlog cache for the transaction (#195)
        In both cases, rolling back transaction is safe. Nothing is written to
        binlog.
       */
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      rollback();
      return true;
    } else {
#ifdef MARIAROCKS_NOT_YET
      /*
        Storing binlog position inside MyRocks is needed only for restoring
        MyRocks from backups. This feature is not supported yet.
      */
      mysql_bin_log_commit_pos(m_thd, &m_mysql_log_offset,
                               &m_mysql_log_file_name);
      binlog_manager.update(m_mysql_log_file_name, m_mysql_log_offset,
                            get_write_batch());
#endif
      return commit_no_binlog();
    }
  }
2621 
  virtual void rollback() = 0;

  // Callback from Rdb_snapshot_notifier: remember the snapshot in the read
  // options and timestamp it.
  void snapshot_created(const rocksdb::Snapshot *const snapshot) {
    DBUG_ASSERT(snapshot != nullptr);

    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }

  virtual void acquire_snapshot(bool acquire_now) = 0;
  virtual void release_snapshot() = 0;

  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }

 private:
  // The Rdb_sst_info structures we are currently loading.  In a partitioned
  // table this can have more than one entry
  std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
  std::string m_curr_bulk_load_tablename;

  /* External merge sorts for bulk load: key ID -> merge sort instance */
  std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;

 public:
get_key_merge(GL_INDEX_ID kd_gl_id,rocksdb::ColumnFamilyHandle * cf,Rdb_index_merge ** key_merge)2647   int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf,
2648                     Rdb_index_merge **key_merge) {
2649     int res;
2650     auto it = m_key_merge.find(kd_gl_id);
2651     if (it == m_key_merge.end()) {
2652       m_key_merge.emplace(
2653           std::piecewise_construct, std::make_tuple(kd_gl_id),
2654           std::make_tuple(
2655               get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size),
2656               THDVAR(get_thd(), merge_combine_read_size),
2657               THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf));
2658       it = m_key_merge.find(kd_gl_id);
2659       if ((res = it->second.init()) != 0) {
2660         return res;
2661       }
2662     }
2663     *key_merge = &it->second;
2664     return HA_EXIT_SUCCESS;
2665   }
2666 
2667   /* Finish bulk loading for all table handlers belongs to one connection */
finish_bulk_load(bool * is_critical_error=nullptr,int print_client_error=true)2668   int finish_bulk_load(bool *is_critical_error = nullptr,
2669                        int print_client_error = true) {
2670     Ensure_cleanup cleanup([&]() {
2671       // Always clear everything regardless of success/failure
2672       m_curr_bulk_load.clear();
2673       m_curr_bulk_load_tablename.clear();
2674       m_key_merge.clear();
2675     });
2676 
2677     int rc = 0;
2678     if (is_critical_error) {
2679       *is_critical_error = true;
2680     }
2681 
2682     // PREPARE phase: finish all on-going bulk loading Rdb_sst_info and
2683     // collect all Rdb_sst_commit_info containing (SST files, cf)
2684     int rc2 = 0;
2685     std::vector<Rdb_sst_info::Rdb_sst_commit_info> sst_commit_list;
2686     sst_commit_list.reserve(m_curr_bulk_load.size());
2687 
2688     for (auto &sst_info : m_curr_bulk_load) {
2689       Rdb_sst_info::Rdb_sst_commit_info commit_info;
2690 
2691       // Commit the list of SST files and move it to the end of
2692       // sst_commit_list, effectively transfer the ownership over
2693       rc2 = sst_info->finish(&commit_info, print_client_error);
2694       if (rc2 && rc == 0) {
2695         // Don't return yet - make sure we finish all the SST infos
2696         rc = rc2;
2697       }
2698 
2699       // Make sure we have work to do - we might be losing the race
2700       if (rc2 == 0 && commit_info.has_work()) {
2701         sst_commit_list.emplace_back(std::move(commit_info));
2702         DBUG_ASSERT(!commit_info.has_work());
2703       }
2704     }
2705 
2706     if (rc) {
2707       return rc;
2708     }
2709 
2710     // MERGING Phase: Flush the index_merge sort buffers into SST files in
2711     // Rdb_sst_info and collect all Rdb_sst_commit_info containing
2712     // (SST files, cf)
2713     if (!m_key_merge.empty()) {
2714       Ensure_cleanup malloc_cleanup([]() {
2715         /*
2716           Explicitly tell jemalloc to clean up any unused dirty pages at this
2717           point.
2718           See https://reviews.facebook.net/D63723 for more details.
2719         */
2720         purge_all_jemalloc_arenas();
2721       });
2722 
2723       rocksdb::Slice merge_key;
2724       rocksdb::Slice merge_val;
2725       for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
2726         GL_INDEX_ID index_id = it->first;
2727         std::shared_ptr<const Rdb_key_def> keydef =
2728             ddl_manager.safe_find(index_id);
2729         std::string table_name = ddl_manager.safe_get_table_name(index_id);
2730 
2731         // Unable to find key definition or table name since the
2732         // table could have been dropped.
2733         // TODO(herman): there is a race here between dropping the table
2734         // and detecting a drop here. If the table is dropped while bulk
2735         // loading is finishing, these keys being added here may
2736         // be missed by the compaction filter and not be marked for
2737         // removal. It is unclear how to lock the sql table from the storage
2738         // engine to prevent modifications to it while bulk load is occurring.
2739         if (keydef == nullptr) {
2740           if (is_critical_error) {
2741             // We used to set the error but simply ignores it. This follows
2742             // current behavior and we should revisit this later
2743             *is_critical_error = false;
2744           }
2745           return HA_ERR_KEY_NOT_FOUND;
2746         } else if (table_name.empty()) {
2747           if (is_critical_error) {
2748             // We used to set the error but simply ignores it. This follows
2749             // current behavior and we should revisit this later
2750             *is_critical_error = false;
2751           }
2752           return HA_ERR_NO_SUCH_TABLE;
2753         }
2754         const std::string &index_name = keydef->get_name();
2755         Rdb_index_merge &rdb_merge = it->second;
2756 
2757         // Rdb_sst_info expects a denormalized table name in the form of
2758         // "./database/table"
2759         std::replace(table_name.begin(), table_name.end(), '.', '/');
2760         table_name = "./" + table_name;
2761         auto sst_info = std::make_shared<Rdb_sst_info>(
2762             rdb, table_name, index_name, rdb_merge.get_cf(),
2763             *rocksdb_db_options, THDVAR(get_thd(), trace_sst_api));
2764 
2765         while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
2766           if ((rc2 = sst_info->put(merge_key, merge_val)) != 0) {
2767             rc = rc2;
2768 
2769             // Don't return yet - make sure we finish the sst_info
2770             break;
2771           }
2772         }
2773 
2774         // -1 => no more items
2775         if (rc2 != -1 && rc != 0) {
2776           rc = rc2;
2777         }
2778 
2779         Rdb_sst_info::Rdb_sst_commit_info commit_info;
2780         rc2 = sst_info->finish(&commit_info, print_client_error);
2781         if (rc2 != 0 && rc == 0) {
2782           // Only set the error from sst_info->finish if finish failed and we
2783           // didn't fail before. In other words, we don't have finish's
2784           // success mask earlier failures
2785           rc = rc2;
2786         }
2787 
2788         if (rc) {
2789           return rc;
2790         }
2791 
2792         if (commit_info.has_work()) {
2793           sst_commit_list.emplace_back(std::move(commit_info));
2794           DBUG_ASSERT(!commit_info.has_work());
2795         }
2796       }
2797     }
2798 
2799     // Early return in case we lost the race completely and end up with no
2800     // work at all
2801     if (sst_commit_list.size() == 0) {
2802       return rc;
2803     }
2804 
2805     // INGEST phase: Group all Rdb_sst_commit_info by cf (as they might
2806     // have the same cf across different indexes) and call out to RocksDB
2807     // to ingest all SST files in one atomic operation
2808     rocksdb::IngestExternalFileOptions options;
2809     options.move_files = true;
2810     options.snapshot_consistency = false;
2811     options.allow_global_seqno = false;
2812     options.allow_blocking_flush = false;
2813 
2814     std::map<rocksdb::ColumnFamilyHandle *, rocksdb::IngestExternalFileArg>
2815         arg_map;
2816 
2817     // Group by column_family
2818     for (auto &commit_info : sst_commit_list) {
2819       if (arg_map.find(commit_info.get_cf()) == arg_map.end()) {
2820         rocksdb::IngestExternalFileArg arg;
2821         arg.column_family = commit_info.get_cf(),
2822         arg.external_files = commit_info.get_committed_files(),
2823         arg.options = options;
2824 
2825         arg_map.emplace(commit_info.get_cf(), arg);
2826       } else {
2827         auto &files = arg_map[commit_info.get_cf()].external_files;
2828         files.insert(files.end(), commit_info.get_committed_files().begin(),
2829                      commit_info.get_committed_files().end());
2830       }
2831     }
2832 
2833     std::vector<rocksdb::IngestExternalFileArg> args;
2834     size_t file_count = 0;
2835     for (auto &cf_files_pair : arg_map) {
2836       args.push_back(cf_files_pair.second);
2837       file_count += cf_files_pair.second.external_files.size();
2838     }
2839 
2840     const rocksdb::Status s = rdb->IngestExternalFiles(args);
2841     if (THDVAR(m_thd, trace_sst_api)) {
2842       // NO_LINT_DEBUG
2843       sql_print_information(
2844           "SST Tracing: IngestExternalFile '%zu' files returned %s", file_count,
2845           s.ok() ? "ok" : "not ok");
2846     }
2847 
2848     if (!s.ok()) {
2849       if (print_client_error) {
2850         Rdb_sst_info::report_error_msg(s, nullptr);
2851       }
2852       return HA_ERR_ROCKSDB_BULK_LOAD;
2853     }
2854 
2855     // COMMIT phase: mark everything as completed. This avoids SST file
2856     // deletion kicking in. Otherwise SST files would get deleted if this
2857     // entire operation is aborted
2858     for (auto &commit_info : sst_commit_list) {
2859       commit_info.commit();
2860     }
2861 
2862     return rc;
2863   }
2864 
start_bulk_load(ha_rocksdb * const bulk_load,std::shared_ptr<Rdb_sst_info> sst_info)2865   int start_bulk_load(ha_rocksdb *const bulk_load,
2866                       std::shared_ptr<Rdb_sst_info> sst_info) {
2867     /*
2868      If we already have an open bulk load of a table and the name doesn't
2869      match the current one, close out the currently running one.  This allows
2870      multiple bulk loads to occur on a partitioned table, but then closes
2871      them all out when we switch to another table.
2872     */
2873     DBUG_ASSERT(bulk_load != nullptr);
2874 
2875     if (!m_curr_bulk_load.empty() &&
2876         bulk_load->get_table_basename() != m_curr_bulk_load_tablename) {
2877       const auto res = finish_bulk_load();
2878       if (res != HA_EXIT_SUCCESS) {
2879         return res;
2880       }
2881     }
2882 
2883     /*
2884      This used to track ha_rocksdb handler objects, but those can be
2885      freed by the table cache while this was referencing them. Instead
2886      of tracking ha_rocksdb handler objects, this now tracks the
2887      Rdb_sst_info allocated, and both the ha_rocksdb handler and the
2888      Rdb_transaction both have shared pointers to them.
2889 
2890      On transaction complete, it will commit each Rdb_sst_info structure found.
2891      If the ha_rocksdb object is freed, etc., it will also commit
2892      the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent.
2893     */
2894     m_curr_bulk_load.push_back(sst_info);
2895     m_curr_bulk_load_tablename = bulk_load->get_table_basename();
2896     return HA_EXIT_SUCCESS;
2897   }
2898 
num_ongoing_bulk_load() const2899   int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
2900 
get_rocksdb_tmpdir() const2901   const char *get_rocksdb_tmpdir() const {
2902     const char *tmp_dir = THDVAR(get_thd(), tmpdir);
2903 
2904     /*
2905       We want to treat an empty string as nullptr, in these cases DDL operations
2906       will use the default --tmpdir passed to mysql instead.
2907     */
2908     if (tmp_dir != nullptr && *tmp_dir == '\0') {
2909       tmp_dir = nullptr;
2910     }
2911     return (tmp_dir);
2912   }
2913 
2914   /*
2915     Flush the data accumulated so far. This assumes we're doing a bulk insert.
2916 
2917     @detail
2918       This should work like transaction commit, except that we don't
2919       synchronize with the binlog (there is no API that would allow to have
2920       binlog flush the changes accumulated so far and return its current
2921       position)
2922 
2923     @todo
2924       Add test coverage for what happens when somebody attempts to do bulk
2925       inserts while inside a multi-statement transaction.
2926   */
flush_batch()2927   bool flush_batch() {
2928     if (get_write_count() == 0) return false;
2929 
2930     /* Commit the current transaction */
2931     if (commit_no_binlog()) return true;
2932 
2933     /* Start another one */
2934     start_tx();
2935     return false;
2936   }
2937 
set_auto_incr(const GL_INDEX_ID & gl_index_id,ulonglong curr_id)2938   void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) {
2939     m_auto_incr_map[gl_index_id] =
2940         std::max(m_auto_incr_map[gl_index_id], curr_id);
2941   }
2942 
2943 #ifndef DBUG_OFF
get_auto_incr(const GL_INDEX_ID & gl_index_id)2944   ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) {
2945     if (m_auto_incr_map.count(gl_index_id) > 0) {
2946       return m_auto_incr_map[gl_index_id];
2947     }
2948     return 0;
2949   }
2950 #endif
2951 
  // Write a key/value pair through the transaction. 'assume_tracked' is
  // forwarded to RocksDB: true promises the key is already tracked/locked
  // by this transaction.
  virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value,
                              const bool assume_tracked) = 0;
  // Delete a key (regular tombstone).
  virtual rocksdb::Status delete_key(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;
  // SingleDelete variant of delete; per RocksDB's contract it is only valid
  // when the key was written at most once since the previous deletion --
  // callers are presumed to uphold this (TODO confirm).
  virtual rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;

  // True when the transaction has buffered at least one write.
  virtual bool has_modifications() const = 0;

  // Return a WriteBatch one can write to. The writes skip transaction
  // locking but WILL be visible to the transaction (it is the indexed
  // batch; contrast with get_blind_write_batch() below).
  virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
2966   /*
2967     Return a WriteBatch that one can write to. The writes will skip any
2968     transaction locking. The writes will NOT be visible to the transaction.
2969   */
get_blind_write_batch()2970   rocksdb::WriteBatchBase *get_blind_write_batch() {
2971     return get_indexed_write_batch()->GetWriteBatch();
2972   }
2973 
  // Point lookup through the transaction (honoring its snapshot/batch).
  virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              rocksdb::PinnableSlice *const value) const = 0;
  // Locking read: acquires a shared or exclusive row lock depending on
  // 'exclusive'; 'do_validate' controls snapshot validation in the
  // implementation.
  virtual rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool exclusive, const bool do_validate) = 0;
2981 
get_iterator(rocksdb::ColumnFamilyHandle * const column_family,bool skip_bloom_filter,bool fill_cache,const rocksdb::Slice & eq_cond_lower_bound,const rocksdb::Slice & eq_cond_upper_bound,bool read_current=false,bool create_snapshot=true)2982   rocksdb::Iterator *get_iterator(
2983       rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
2984       bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound,
2985       const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
2986       bool create_snapshot = true) {
2987     // Make sure we are not doing both read_current (which implies we don't
2988     // want a snapshot) and create_snapshot which makes sure we create
2989     // a snapshot
2990     DBUG_ASSERT(column_family != nullptr);
2991     DBUG_ASSERT(!read_current || !create_snapshot);
2992 
2993     if (create_snapshot) acquire_snapshot(true);
2994 
2995     rocksdb::ReadOptions options = m_read_opts;
2996 
2997     if (skip_bloom_filter) {
2998       options.total_order_seek = true;
2999       options.iterate_lower_bound = &eq_cond_lower_bound;
3000       options.iterate_upper_bound = &eq_cond_upper_bound;
3001     } else {
3002       // With this option, Iterator::Valid() returns false if key
3003       // is outside of the prefix bloom filter range set at Seek().
3004       // Must not be set to true if not using bloom filter.
3005       options.prefix_same_as_start = true;
3006     }
3007     options.fill_cache = fill_cache;
3008     if (read_current) {
3009       options.snapshot = nullptr;
3010     }
3011     return get_iterator(options, column_family);
3012   }
3013 
  // True if a transaction object is currently active.
  virtual bool is_tx_started() const = 0;
  // Begin a new transaction.
  virtual void start_tx() = 0;
  // Per-statement setup inside a multi-statement transaction.
  virtual void start_stmt() = 0;
3017 
3018  protected:
3019   // Non-virtual functions with actions to be done on transaction start and
3020   // commit.
on_commit()3021   void on_commit() {
3022     time_t tm;
3023     tm = time(nullptr);
3024     for (auto &it : modified_tables) {
3025       it->m_update_time = tm;
3026     }
3027     modified_tables.clear();
3028   }
  void on_rollback() {
    // Drop the set of tables tracked for this transaction; their
    // m_update_time is left untouched.
    modified_tables.clear();
  }
3032  public:
3033   // Inform the transaction that this table was modified
  void log_table_write_op(Rdb_tbl_def *tbl) {
    // Remembered until transaction end; on commit each tracked table's
    // m_update_time is refreshed (see on_commit()).
    modified_tables.insert(tbl);
  }
3037 
  void set_initial_savepoint() {
    /*
      Set the initial savepoint. If the first statement in the transaction
      fails, we need something to roll back to, without rolling back the
      entire transaction.
    */
    do_set_savepoint();
    // Record the write count at this savepoint so later code can tell
    // whether a statement made any changes (see
    // make_stmt_savepoint_permanent() / rollback_to_stmt_savepoint()).
    m_writes_at_last_savepoint = m_write_count;
  }
3047 
3048   /*
3049     Called when a "top-level" statement inside a transaction completes
3050     successfully and its changes become part of the transaction's changes.
3051   */
make_stmt_savepoint_permanent()3052   int make_stmt_savepoint_permanent() {
3053     // Take another RocksDB savepoint only if we had changes since the last
3054     // one. This is very important for long transactions doing lots of
3055     // SELECTs.
3056     if (m_writes_at_last_savepoint != m_write_count) {
3057       rocksdb::WriteBatchBase *batch = get_write_batch();
3058       rocksdb::Status status = rocksdb::Status::NotFound();
3059       while ((status = batch->PopSavePoint()) == rocksdb::Status::OK()) {
3060       }
3061 
3062       if (status != rocksdb::Status::NotFound()) {
3063         return HA_EXIT_FAILURE;
3064       }
3065 
3066       do_set_savepoint();
3067       m_writes_at_last_savepoint = m_write_count;
3068     }
3069 
3070     return HA_EXIT_SUCCESS;
3071   }
3072 
3073   /*
3074     Rollback to the savepoint we've set before the last statement
3075   */
rollback_to_stmt_savepoint()3076   void rollback_to_stmt_savepoint() {
3077     if (m_writes_at_last_savepoint != m_write_count) {
3078       do_rollback_to_savepoint();
3079       /*
3080         RollbackToSavePoint "removes the most recent SetSavePoint()", so
3081         we need to set it again so that next statement can roll back to this
3082         stage.
3083         It's ok to do it here at statement end (instead of doing it at next
3084         statement start) because setting a savepoint is cheap.
3085       */
3086       do_set_savepoint();
3087       m_writes_at_last_savepoint = m_write_count;
3088     }
3089   }
3090 
  // Roll back only the current statement's changes; the transaction itself
  // stays open.
  virtual void rollback_stmt() = 0;

  // Mark (or clear) the transaction-failed flag.
  void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
3094 
can_prepare() const3095   bool can_prepare() const {
3096     if (m_rollback_only) {
3097       my_error(ER_ROLLBACK_ONLY, MYF(0));
3098       return false;
3099     }
3100     return true;
3101   }
3102 
rollback_to_savepoint(void * const savepoint)3103   int rollback_to_savepoint(void *const savepoint) {
3104     if (has_modifications()) {
3105       my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0));
3106       m_rollback_only = true;
3107       return HA_EXIT_FAILURE;
3108     }
3109     return HA_EXIT_SUCCESS;
3110   }
3111 
3112   /*
3113     This is used by transactions started with "START TRANSACTION WITH "
3114     "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
3115     snapshot has to be created via DB::GetSnapshot(), not via Transaction
3116     API.
3117   */
  // True when the snapshot must come via DB::GetSnapshot() rather than the
  // Transaction API (see the comment above).
  bool is_tx_read_only() const { return m_tx_read_only; }

  // True when this transaction goes through two-phase commit.
  bool is_two_phase() const { return m_is_two_phase; }

  void set_tx_read_only(bool val) { m_tx_read_only = val; }
3123 
  // Registers the new transaction in the process-wide list (s_tx_list)
  // under its mutex; the destructor removes it again.
  explicit Rdb_transaction(THD *const thd)
      : m_thd(thd), m_tbl_io_perf(nullptr) {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.insert(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
3130 
  // Deregisters this transaction from the process-wide list.
  virtual ~Rdb_transaction() {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.erase(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
3136 };
3137 
3138 /*
3139   This is a rocksdb transaction. Its members represent the current transaction,
3140   which consists of:
3141   - the snapshot
3142   - the changes we've made but are not seeing yet.
3143 
3144   The changes are made to individual tables, which store them here and then
3145   this object commits them on commit.
3146 */
class Rdb_transaction_impl : public Rdb_transaction {
  // The active RocksDB transaction; null when no transaction is started.
  rocksdb::Transaction *m_rocksdb_tx = nullptr;
  // A finished transaction object preserved so BeginTransaction() can reuse
  // it instead of allocating a new one (see release_tx()/start_tx()).
  rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;

 public:
  // NOTE(review): timeout_sec_arg is not read here -- the value applied
  // comes from m_timeout_sec, which the caller presumably updates before
  // invoking this. TODO confirm against the call sites.
  void set_lock_timeout(int timeout_sec_arg) override {
    if (m_rocksdb_tx) {
      m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
    }
  }

  // Toggle synchronous WAL writes on the underlying transaction.
  void set_sync(bool sync) override {
    if (m_rocksdb_tx)
      m_rocksdb_tx->GetWriteOptions()->sync = sync;
  }

  // Undo the row lock taken by a previous GetForUpdate() on this key,
  // unless the session asked to keep locks on scanned rows.
  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    if (!THDVAR(m_thd, lock_scanned_rows)) {
      m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
    }
  }

  virtual bool is_writebatch_trx() const override { return false; }

 private:
  void release_tx(void) {
    // We are done with the current active transaction object.  Preserve it
    // for later reuse.
    DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }

  // First phase of 2PC: name the transaction, persist the auto-increment
  // map into the write batch, then Prepare(). Returns false (after
  // reporting the IO error) on any failure.
  bool prepare(const rocksdb::TransactionName &name) override {
    rocksdb::Status s;
    s = m_rocksdb_tx->SetName(name);
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = m_rocksdb_tx->Prepare();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }
    return true;
  }

  // Commit without binlog coordination. Returns true on error. Note that
  // the code after the error: label runs on BOTH the success and failure
  // paths -- it only clears per-transaction bookkeeping and stashes the
  // transaction object for reuse.
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();
    s = m_rocksdb_tx->Commit();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    on_commit();
  error:
    on_rollback();
    /* Save the transaction object to be reused */
    release_tx();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

 public:
  // Roll back the whole transaction (releasing all row locks) and reset
  // every per-transaction counter and flag.
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }

  // Acquire a read snapshot immediately, or (when acquire_now is false)
  // arrange for one to be taken lazily on the next operation. A no-op if a
  // snapshot is already in place.
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) {
#ifdef MARIAROCKS_NOT_YET
      const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
          m_thd->get_explicit_snapshot());
      if (thd_ss) {
        m_explicit_snapshot = thd_ss;
      }
      if (m_explicit_snapshot) {
        auto snapshot = m_explicit_snapshot->get_snapshot()->snapshot();
        snapshot_created(snapshot);
      } else
#endif
      if (is_tx_read_only()) {
        // Read-only transactions take the snapshot from the DB directly,
        // not through the Transaction API (see is_tx_read_only()).
        snapshot_created(rdb->GetSnapshot());
      } else if (acquire_now) {
        m_rocksdb_tx->SetSnapshot();
        snapshot_created(m_rocksdb_tx->GetSnapshot());
      } else if (!m_is_delayed_snapshot) {
        // Defer snapshot creation until the first operation; m_notifier
        // will call back when it actually happens.
        m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
        m_is_delayed_snapshot = true;
      }
    }
  }

  // Release the current snapshot, returning it to whichever facility
  // created it (DB for read-only transactions, the Transaction otherwise).
  void release_snapshot() override {
    bool need_clear = m_is_delayed_snapshot;

    if (m_read_opts.snapshot != nullptr) {
      m_snapshot_timestamp = 0;
#ifdef MARIAROCKS_NOT_YET
      if (m_explicit_snapshot) {
        m_explicit_snapshot.reset();
        need_clear = false;
      } else
#endif
      if (is_tx_read_only()) {
        rdb->ReleaseSnapshot(m_read_opts.snapshot);
        need_clear = false;
      } else {
        need_clear = true;
      }
      m_read_opts.snapshot = nullptr;
    }

    if (need_clear && m_rocksdb_tx != nullptr) m_rocksdb_tx->ClearSnapshot();
  }

  bool has_snapshot() { return m_read_opts.snapshot != nullptr; }

  // Tracked write: enforces the max_row_locks limit before forwarding the
  // Put to the RocksDB transaction.
  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key, const rocksdb::Slice &value,
                      const bool assume_tracked) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }
    return m_rocksdb_tx->Put(column_family, key, value, assume_tracked);
  }

  // Tracked delete; same row-lock accounting as put().
  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key,
                             const bool assume_tracked) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }
    return m_rocksdb_tx->Delete(column_family, key, assume_tracked);
  }

  // Tracked SingleDelete; same row-lock accounting as put().
  rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }
    return m_rocksdb_tx->SingleDelete(column_family, key, assume_tracked);
  }

  bool has_modifications() const override {
    return m_rocksdb_tx->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
  }

  // For two-phase commit the commit-time batch is used; otherwise the
  // transaction's own underlying write batch.
  rocksdb::WriteBatchBase *get_write_batch() override {
    if (is_two_phase()) {
      return m_rocksdb_tx->GetCommitTimeWriteBatch();
    }
    return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
  }

  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes WILL be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_rocksdb_tx->GetWriteBatch();
  }

  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    // Clean the PinnableSlice right before Get() to support multiple gets
    // per statement; the resources after the last Get in a statement are
    // cleared in the handler::reset call.
    value->Reset();
    global_stats.queries[QUERIES_POINT].inc();
    return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
  }

  // Locking read; also enforces the max_row_locks limit.
  rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool exclusive, const bool do_validate) override {
    if (++m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }

    if (value != nullptr) {
      value->Reset();
    }
    rocksdb::Status s;
    // If snapshot is null, pass it to GetForUpdate and snapshot is
    // initialized there. Snapshot validation is skipped in that case.
    if (m_read_opts.snapshot == nullptr || do_validate) {
      s = m_rocksdb_tx->GetForUpdate(
          m_read_opts, column_family, key, value, exclusive,
          m_read_opts.snapshot ? do_validate : false);
    } else {
      // If snapshot is set, and if skipping validation,
      // call GetForUpdate without validation and set back old snapshot
      auto saved_snapshot = m_read_opts.snapshot;
      m_read_opts.snapshot = nullptr;
      s = m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                     exclusive, false);
      m_read_opts.snapshot = saved_snapshot;
    }
    return s;
  }

  rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *const column_family) override {
    global_stats.queries[QUERIES_RANGE].inc();
    return m_rocksdb_tx->GetIterator(options, column_family);
  }

  const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }

  bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }

  // Begin a new RocksDB transaction with options derived from the session
  // variables, reusing a previously released transaction object if one is
  // available.
  void start_tx() override {
    rocksdb::TransactionOptions tx_opts;
    rocksdb::WriteOptions write_opts;
    tx_opts.set_snapshot = false;
    tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
    tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
    tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
    // If this variable is set, this will write commit time write batch
    // information on recovery or memtable flush.
    tx_opts.use_only_the_last_commit_time_batch_for_recovery =
        THDVAR(m_thd, commit_time_batch_for_recovery);
    tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);

    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);
    m_is_two_phase = rocksdb_enable_2pc;

    commit_ordered_done= false;

    /*
      If m_rocksdb_reuse_tx is null this will create a new transaction object.
      Otherwise it will reuse the existing one.
    */
    m_rocksdb_tx =
        rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
    m_rocksdb_reuse_tx = nullptr;

    m_read_opts = rocksdb::ReadOptions();

    set_initial_savepoint();

    m_ddl_transaction = false;
  }

  /* Implementations of do_*savepoint based on RocksDB::Transaction
     savepoints */
  void do_set_savepoint() override { m_rocksdb_tx->SetSavePoint(); }

  void do_rollback_to_savepoint() override {
    m_rocksdb_tx->RollbackToSavePoint();
  }

  /*
    Start a statement inside a multi-statement transaction.

    @todo: are we sure this is called once (and not several times) per
    statement start?

    For hooking to start of statement that is its own transaction, see
    ha_rocksdb::external_lock().
  */
  void start_stmt() override {
    // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
    acquire_snapshot(false);
  }

  /*
    This must be called when last statement is rolled back, but the transaction
    continues
  */
  void rollback_stmt() override {
    /* TODO: here we must release the locks taken since the start_stmt() call */
    if (m_rocksdb_tx) {
      const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
      rollback_to_stmt_savepoint();

      // Rolling back may have changed the snapshot; keep m_read_opts and
      // the snapshot timestamp in sync with what the transaction now holds.
      const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
      if (org_snapshot != cur_snapshot) {
        if (org_snapshot != nullptr) m_snapshot_timestamp = 0;

        m_read_opts.snapshot = cur_snapshot;
        if (cur_snapshot != nullptr) {
          rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
        } else {
          m_is_delayed_snapshot = true;
        }
      }
    }
  }

  explicit Rdb_transaction_impl(THD *const thd)
      : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
    // Create a notifier that can be called when a snapshot gets generated.
    m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
  }

  virtual ~Rdb_transaction_impl() override {
    rollback();

    // Theoretically the notifier could outlive the Rdb_transaction_impl
    // (because of the shared_ptr), so let it know it can't reference
    // the transaction anymore.
    m_notifier->detach();

    // Free any transaction memory that is still hanging around.
    delete m_rocksdb_reuse_tx;
    DBUG_ASSERT(m_rocksdb_tx == nullptr);
  }
};
3519 
3520 /* This is a rocksdb write batch. This class doesn't hold or wait on any
3521    transaction locks (skips rocksdb transaction API) thus giving better
3522    performance.
3523 
   Currently this is only used for replication threads, which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through thoroughly.
3527 */
3528 class Rdb_writebatch_impl : public Rdb_transaction {
3529   rocksdb::WriteBatchWithIndex *m_batch;
3530   rocksdb::WriteOptions write_opts;
3531   // Called after commit/rollback.
  void reset() {
    // Drop all buffered writes and restore default read options for the
    // next transaction; DDL tracking starts over as well.
    m_batch->Clear();
    m_read_opts = rocksdb::ReadOptions();
    m_ddl_transaction = false;
  }
3537 
3538  private:
  // Write batches skip the 2PC prepare step entirely.
  bool prepare(const rocksdb::TransactionName &name) override { return true; }
3540 
  // Commit the buffered batch in a single DB::Write, bypassing concurrency
  // control (skip_concurrency_control). Returns true on error. The code
  // after the error: label runs on both the success and the failure path;
  // it only resets per-transaction state.
  // NOTE(review): unlike Rdb_transaction_impl::commit_no_binlog(),
  // m_lock_count is not reset here -- write batches take no row locks, so
  // presumably it stays 0; TODO confirm.
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;
    rocksdb::TransactionDBWriteOptimizations optimize;
    optimize.skip_concurrency_control = true;

    s = merge_auto_incr_map(m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();

    s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }
    on_commit();
  error:
    on_rollback();
    reset();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }
3575 
3576   /* Implementations of do_*savepoint based on rocksdB::WriteBatch savepoints */
do_set_savepoint()3577   void do_set_savepoint() override { m_batch->SetSavePoint(); }
3578 
do_rollback_to_savepoint()3579   void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); }
3580 
3581 
3582  public:
is_writebatch_trx() const3583   bool is_writebatch_trx() const override { return true; }
3584 
set_lock_timeout(int timeout_sec_arg)3585   void set_lock_timeout(int timeout_sec_arg) override {
3586     // Nothing to do here.
3587   }
3588 
set_sync(bool sync)3589   void set_sync(bool sync) override { write_opts.sync = sync; }
3590 
release_lock(rocksdb::ColumnFamilyHandle * const column_family,const std::string & rowkey)3591   void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
3592                     const std::string &rowkey) override {
3593     // Nothing to do here since we don't hold any row locks.
3594   }
3595 
rollback()3596   void rollback() override {
3597     on_rollback();
3598     m_write_count = 0;
3599     m_insert_count = 0;
3600     m_update_count = 0;
3601     m_delete_count = 0;
3602     m_lock_count = 0;
3603     release_snapshot();
3604 
3605     reset();
3606     set_tx_read_only(false);
3607     m_rollback_only = false;
3608   }
3609 
acquire_snapshot(bool acquire_now)3610   void acquire_snapshot(bool acquire_now) override {
3611     if (m_read_opts.snapshot == nullptr) snapshot_created(rdb->GetSnapshot());
3612   }
3613 
release_snapshot()3614   void release_snapshot() override {
3615     if (m_read_opts.snapshot != nullptr) {
3616       rdb->ReleaseSnapshot(m_read_opts.snapshot);
3617       m_read_opts.snapshot = nullptr;
3618     }
3619   }
3620 
put(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const rocksdb::Slice & value,const bool assume_tracked)3621   rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
3622                       const rocksdb::Slice &key, const rocksdb::Slice &value,
3623                       const bool assume_tracked) override {
3624     ++m_write_count;
3625     m_batch->Put(column_family, key, value);
3626     // Note Put/Delete in write batch doesn't return any error code. We simply
3627     // return OK here.
3628     return rocksdb::Status::OK();
3629   }
3630 
delete_key(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const bool assume_tracked)3631   rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
3632                              const rocksdb::Slice &key,
3633                              const bool assume_tracked) override {
3634     ++m_write_count;
3635     m_batch->Delete(column_family, key);
3636     return rocksdb::Status::OK();
3637   }
3638 
single_delete(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const bool)3639   rocksdb::Status single_delete(
3640       rocksdb::ColumnFamilyHandle *const column_family,
3641       const rocksdb::Slice &key, const bool /* assume_tracked */) override {
3642     ++m_write_count;
3643     m_batch->SingleDelete(column_family, key);
3644     return rocksdb::Status::OK();
3645   }
3646 
has_modifications() const3647   bool has_modifications() const override {
3648     return m_batch->GetWriteBatch()->Count() > 0;
3649   }
3650 
get_write_batch()3651   rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }
3652 
get_indexed_write_batch()3653   rocksdb::WriteBatchBase *get_indexed_write_batch() override {
3654     ++m_write_count;
3655     return m_batch;
3656   }
3657 
get(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,rocksdb::PinnableSlice * const value) const3658   rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
3659                       const rocksdb::Slice &key,
3660                       rocksdb::PinnableSlice *const value) const override {
3661     value->Reset();
3662     return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
3663                                       value);
3664   }
3665 
get_for_update(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,rocksdb::PinnableSlice * const value,bool,const bool)3666   rocksdb::Status get_for_update(
3667       rocksdb::ColumnFamilyHandle *const column_family,
3668       const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
3669       bool /* exclusive */, const bool /* do_validate */) override {
3670     if (value == nullptr) {
3671       rocksdb::PinnableSlice pin_val;
3672       rocksdb::Status s = get(column_family, key, &pin_val);
3673       pin_val.Reset();
3674       return s;
3675     }
3676 
3677     return get(column_family, key, value);
3678   }
3679 
get_iterator(const rocksdb::ReadOptions & options,rocksdb::ColumnFamilyHandle * const)3680   rocksdb::Iterator *get_iterator(
3681       const rocksdb::ReadOptions &options,
3682       rocksdb::ColumnFamilyHandle *const /* column_family */) override {
3683     const auto it = rdb->NewIterator(options);
3684     return m_batch->NewIteratorWithBase(it);
3685   }
3686 
is_tx_started() const3687   bool is_tx_started() const override { return (m_batch != nullptr); }
3688 
start_tx()3689   void start_tx() override {
3690     commit_ordered_done= false; // Do we need this here?
3691     reset();
3692     write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3693     write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
3694     write_opts.ignore_missing_column_families =
3695         THDVAR(m_thd, write_ignore_missing_column_families);
3696 
3697     set_initial_savepoint();
3698   }
3699 
start_stmt()3700   void start_stmt() override {}
3701 
rollback_stmt()3702   void rollback_stmt() override {
3703     if (m_batch) rollback_to_stmt_savepoint();
3704   }
3705 
Rdb_writebatch_impl(THD * const thd)3706   explicit Rdb_writebatch_impl(THD *const thd)
3707       : Rdb_transaction(thd), m_batch(nullptr) {
3708     m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
3709                                                true);
3710   }
3711 
~Rdb_writebatch_impl()3712   virtual ~Rdb_writebatch_impl() override {
3713     rollback();
3714     delete m_batch;
3715   }
3716 };
3717 
SnapshotCreated(const rocksdb::Snapshot * const snapshot)3718 void Rdb_snapshot_notifier::SnapshotCreated(
3719     const rocksdb::Snapshot *const snapshot) {
3720   if (m_owning_tx != nullptr) {
3721     m_owning_tx->snapshot_created(snapshot);
3722   }
3723 }
3724 
// Definitions of Rdb_transaction's static members: the global registry of
// all live transactions and the mutex that protects it.
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3727 
get_tx_from_thd(THD * const thd)3728 static Rdb_transaction *get_tx_from_thd(THD *const thd) {
3729   return reinterpret_cast<Rdb_transaction *>(
3730       my_core::thd_get_ha_data(thd, rocksdb_hton));
3731 }
3732 
3733 namespace {
3734 
3735 class Rdb_perf_context_guard {
3736   Rdb_io_perf m_io_perf;
3737   Rdb_io_perf *m_io_perf_ptr;
3738   Rdb_transaction *m_tx;
3739   uint m_level;
3740 
3741  public:
3742   Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
3743   Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;
3744 
Rdb_perf_context_guard(Rdb_io_perf * io_perf,uint level)3745   explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
3746       : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
3747     m_io_perf_ptr->start(m_level);
3748   }
3749 
Rdb_perf_context_guard(Rdb_transaction * tx,uint level)3750   explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
3751       : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
3752     /*
3753       if perf_context information is already being recorded, this becomes a
3754       no-op
3755     */
3756     if (tx != nullptr) {
3757       tx->io_perf_start(&m_io_perf);
3758     }
3759   }
3760 
~Rdb_perf_context_guard()3761   ~Rdb_perf_context_guard() {
3762     if (m_tx != nullptr) {
3763       m_tx->io_perf_end_and_record();
3764     } else if (m_io_perf_ptr != nullptr) {
3765       m_io_perf_ptr->end_and_record(m_level);
3766     }
3767   }
3768 };
3769 
3770 }  // anonymous namespace
3771 
3772 /*
3773   TODO: maybe, call this in external_lock() and store in ha_rocksdb..
3774 */
3775 
get_or_create_tx(THD * const thd)3776 static Rdb_transaction *get_or_create_tx(THD *const thd) {
3777   Rdb_transaction *tx = get_tx_from_thd(thd);
3778   // TODO: this is called too many times.. O(#rows)
3779   if (tx == nullptr) {
3780     bool rpl_skip_tx_api= false; // MARIAROCKS_NOT_YET.
3781     if ((rpl_skip_tx_api && thd->rgi_slave) ||
3782         (THDVAR(thd, master_skip_tx_api) && !thd->rgi_slave))
3783     {
3784       tx = new Rdb_writebatch_impl(thd);
3785     } else {
3786       tx = new Rdb_transaction_impl(thd);
3787     }
3788     tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3789     tx->start_tx();
3790     my_core::thd_set_ha_data(thd, rocksdb_hton, tx);
3791   } else {
3792     tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3793     if (!tx->is_tx_started()) {
3794       tx->start_tx();
3795     }
3796   }
3797 
3798   return tx;
3799 }
3800 
rocksdb_close_connection(handlerton * const hton,THD * const thd)3801 static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
3802   Rdb_transaction *tx = get_tx_from_thd(thd);
3803   if (tx != nullptr) {
3804     bool is_critical_error;
3805     int rc = tx->finish_bulk_load(&is_critical_error, false);
3806     if (rc != 0 && is_critical_error) {
3807       // NO_LINT_DEBUG
3808       sql_print_error(
3809           "RocksDB: Error %d finalizing last SST file while "
3810           "disconnecting",
3811           rc);
3812     }
3813 
3814     delete tx;
3815   }
3816   return HA_EXIT_SUCCESS;
3817 }
3818 
3819 /*
3820  * Serializes an xid to a string so that it can
3821  * be used as a rocksdb transaction name
3822  */
rdb_xid_to_string(const XID & src)3823 static std::string rdb_xid_to_string(const XID &src) {
3824   DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
3825   DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);
3826 
3827   std::string buf;
3828   buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);
3829 
3830   /*
3831    * expand formatID to fill 8 bytes if it doesn't already
3832    * then reinterpret bit pattern as unsigned and store in network order
3833    */
3834   uchar fidbuf[RDB_FORMATID_SZ];
3835   int64 signed_fid8 = src.formatID;
3836   const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
3837   rdb_netbuf_store_uint64(fidbuf, raw_fid8);
3838   buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);
3839 
3840   buf.push_back(src.gtrid_length);
3841   buf.push_back(src.bqual_length);
3842   buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
3843   return buf;
3844 }
3845 
3846 #if 0
3847 // MARIAROCKS: MariaDB doesn't have flush_wal method
3848 /**
3849   Called by hton->flush_logs after MySQL group commit prepares a set of
3850   transactions.
3851 */
3852 static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__)))
3853   DBUG_ASSERT(rdb != nullptr);
3854 
3855   rocksdb::Status s;
3856   /*
3857     target_lsn is set to 0 when MySQL wants to sync the wal files
3858   */
3859   if ((target_lsn == 0 && !rocksdb_db_options->allow_mmap_writes) ||
3860       rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
3861     rocksdb_wal_group_syncs++;
3862     s = rdb->FlushWAL(target_lsn == 0 ||
3863                       rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3864   }
3865 
3866   if (!s.ok()) {
3867     rdb_log_status_error(s);
3868     return HA_EXIT_FAILURE;
3869   }
3870   return HA_EXIT_SUCCESS;
3871 }
3872 #endif
3873 
3874 /**
3875   For a slave, prepare() updates the slave_gtid_info table which tracks the
3876   replication progress.
3877 */
// Handlerton prepare() hook: prepares the transaction in RocksDB (2PC) when
// a whole transaction ends, or makes the statement savepoint permanent when
// only a statement ends inside a transaction.
static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
{
  bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql

  // NOTE(review): tx is assumed non-null here (prepare is only invoked for
  // engines participating in the transaction) -- confirm with the caller.
  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (!tx->can_prepare()) {
    return HA_EXIT_FAILURE;
  }
  if (prepare_tx ||
      (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
    /* We were instructed to prepare the whole transaction, or
    this is an SQL statement end and autocommit is on */

#ifdef MARIAROCKS_NOT_YET
    /*
      Storing binlog position inside MyRocks is needed only for restoring
      MyRocks from backups. This feature is not supported yet.
    */
    std::vector<st_slave_gtid_info> slave_gtid_info;
    my_core::thd_slave_gtid_info(thd, &slave_gtid_info);
    for (const auto &it : slave_gtid_info) {
      rocksdb::WriteBatchBase *const write_batch = tx->get_blind_write_batch();
      binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch);
    }
#endif

    if (tx->is_two_phase()) {

      /*
        MariaDB: the following branch is never taken.
        We always flush at Prepare and rely on RocksDB's internal Group Commit
        to do some grouping.
      */
      if (thd->durability_property == HA_IGNORE_DURABILITY || async) {
        tx->set_sync(false);
      }

      /*
        MariaDB: do not flush logs if we are running in a non-crash-safe mode.
      */
      if (!rocksdb_flush_log_at_trx_commit)
        tx->set_sync(false);

      // Prepare under the serialized XID so the transaction can be found
      // again during XA recovery (see rocksdb_recover).
      XID xid;
      thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
      if (!tx->prepare(rdb_xid_to_string(xid))) {
        return HA_EXIT_FAILURE;
      }

      /*
        MariaDB: our Group Commit implementation does not use the
        hton->flush_logs call (at least currently) so the following is not
        needed (TODO: will we need this for binlog rotation?)
      */
#ifdef MARIAROCKS_NOT_YET
      if (thd->durability_property == HA_IGNORE_DURABILITY )
          (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER))
          &&
          THDVAR(thd, flush_log_at_trx_commit))
#endif
#ifdef MARIAROCKS_NOT_YET
      {
        // MariaRocks: disable the
        //   "write/sync redo log before flushing binlog cache to file"
        //  feature. See a869c56d361bb44f46c0efeb11a8f03561676247
        /**
          we set the log sequence as '1' just to trigger hton->flush_logs
        */
        thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB);
      }
#endif
    }

    DEBUG_SYNC(thd, "rocksdb.prepared");
  } else {
    // Statement end inside a multi-statement transaction: keep the
    // statement's changes by making its savepoint permanent.
    tx->make_stmt_savepoint_permanent();
  }
  return HA_EXIT_SUCCESS;
}
3957 
3958 /**
3959  do nothing for prepare/commit by xid
3960  this is needed to avoid crashes in XA scenarios
3961 */
rocksdb_commit_by_xid(handlerton * const hton,XID * const xid)3962 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
3963   DBUG_ENTER_FUNC();
3964 
3965   DBUG_ASSERT(hton != nullptr);
3966   DBUG_ASSERT(xid != nullptr);
3967   DBUG_ASSERT(commit_latency_stats != nullptr);
3968 
3969   rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
3970 
3971   const auto name = rdb_xid_to_string(*xid);
3972   DBUG_ASSERT(!name.empty());
3973 
3974   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
3975 
3976   if (trx == nullptr) {
3977     DBUG_RETURN(HA_EXIT_FAILURE);
3978   }
3979 
3980   const rocksdb::Status s = trx->Commit();
3981 
3982   if (!s.ok()) {
3983     rdb_log_status_error(s);
3984     DBUG_RETURN(HA_EXIT_FAILURE);
3985   }
3986 
3987   delete trx;
3988 
3989   // `Add()` is implemented in a thread-safe manner.
3990   commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
3991 
3992   DBUG_RETURN(HA_EXIT_SUCCESS);
3993 }
3994 
rocksdb_rollback_by_xid(handlerton * const hton MY_ATTRIBUTE ((__unused__)),XID * const xid)3995 static int rocksdb_rollback_by_xid(
3996     handlerton *const hton MY_ATTRIBUTE((__unused__)), XID *const xid) {
3997   DBUG_ENTER_FUNC();
3998 
3999   DBUG_ASSERT(hton != nullptr);
4000   DBUG_ASSERT(xid != nullptr);
4001   DBUG_ASSERT(rdb != nullptr);
4002 
4003   const auto name = rdb_xid_to_string(*xid);
4004 
4005   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4006 
4007   if (trx == nullptr) {
4008     DBUG_RETURN(HA_EXIT_FAILURE);
4009   }
4010 
4011   const rocksdb::Status s = trx->Rollback();
4012 
4013   if (!s.ok()) {
4014     rdb_log_status_error(s);
4015     DBUG_RETURN(HA_EXIT_FAILURE);
4016   }
4017 
4018   delete trx;
4019 
4020   DBUG_RETURN(HA_EXIT_SUCCESS);
4021 }
4022 
4023 /**
4024   Rebuilds an XID from a serialized version stored in a string.
4025 */
rdb_xid_from_string(const std::string & src,XID * const dst)4026 static void rdb_xid_from_string(const std::string &src, XID *const dst) {
4027   DBUG_ASSERT(dst != nullptr);
4028   uint offset = 0;
4029   uint64 raw_fid8 =
4030       rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
4031   const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
4032   dst->formatID = signed_fid8;
4033   offset += RDB_FORMATID_SZ;
4034   dst->gtrid_length = src.at(offset);
4035   offset += RDB_GTRID_SZ;
4036   dst->bqual_length = src.at(offset);
4037   offset += RDB_BQUAL_SZ;
4038 
4039   DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
4040   DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);
4041 
4042   memset(dst->data, 0, XIDDATASIZE);
4043   src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length),
4044            RDB_XIDHDR_LEN);
4045 }
4046 
4047 /**
4048   Reading last committed binary log info from RocksDB system row.
4049   The info is needed for crash safe slave/master to work.
4050 */
// XA recovery hook: fill xid_list (up to len entries) with the XIDs of all
// transactions left in PREPARED state inside RocksDB, returning how many
// were written. Returns 0 when there is nothing to report.
static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len)
#ifdef MARIAROCKS_NOT_YET
                           char* const binlog_file,
                           my_off_t *const binlog_pos,
                           Gtid *const binlog_max_gtid) {
#endif
{
#ifdef MARIAROCKS_NOT_YET
  if (binlog_file && binlog_pos) {
    char file_buf[FN_REFLEN + 1] = {0};
    my_off_t pos;
    char gtid_buf[FN_REFLEN + 1] = {0};
    if (binlog_manager.read(file_buf, &pos, gtid_buf)) {
      if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) {
        memcpy(binlog_file, file_buf, FN_REFLEN + 1);
        *binlog_pos = pos;
        // NO_LINT_DEBUG
        fprintf(stderr,
                "RocksDB: Last binlog file position %llu,"
                " file name %s\n",
                pos, file_buf);
        if (*gtid_buf) {
          global_sid_lock->rdlock();
          binlog_max_gtid->parse(global_sid_map, gtid_buf);
          global_sid_lock->unlock();
          // NO_LINT_DEBUG
          fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n", gtid_buf);
        }
      }
    }
  }
#endif

  // Nothing to report if the server passed no output buffer.
  if (len == 0 || xid_list == nullptr) {
    return HA_EXIT_SUCCESS;
  }

  // Collect every transaction RocksDB still holds in PREPARED state.
  std::vector<rocksdb::Transaction *> trans_list;
  rdb->GetAllPreparedTransactions(&trans_list);

  uint count = 0;
  for (auto &trans : trans_list) {
    if (count >= len) {
      break;
    }
    // The transaction name is the serialized XID (see rdb_xid_to_string).
    auto name = trans->GetName();
    rdb_xid_from_string(name, &xid_list[count]);
    count++;
  }
  return count;
}
4102 
4103 
4104 /*
4105   Handle a commit checkpoint request from server layer.
4106 
4107   InnoDB does this:
4108     We put the request in a queue, so that we can notify upper layer about
4109     checkpoint complete when we have flushed the redo log.
4110     If we have already flushed all relevant redo log, we notify immediately.
4111 
4112   MariaRocks just flushes everything right away ATM
4113 */
4114 
4115 static void rocksdb_checkpoint_request(handlerton *hton,
4116                                        void *cookie)
4117 {
4118   const rocksdb::Status s= rdb->SyncWAL();
4119   //TODO: what to do on error?
4120   if (s.ok())
4121   {
4122     rocksdb_wal_group_syncs++;
4123     commit_checkpoint_notify_ha(hton, cookie);
4124   }
4125 }
4126 
4127 /*
4128   @param all:   TRUE - commit the transaction
4129                 FALSE - SQL statement ended
4130 */
// Group-commit ordering hook: commits the transaction (without syncing) in
// binlog order so that later rocksdb_commit() only needs to report the
// saved result. Only relevant for two-phase (binlog-coordinated) commits.
static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all)
{
  // Same assert as InnoDB has
  DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
                                             OPTION_BEGIN)));
  // NOTE(review): tx is assumed non-null here (commit_ordered should only be
  // called for transactions this engine registered) -- confirm with caller.
  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (!tx->is_two_phase()) {
    /*
      ordered_commit is supposedly slower as it is done sequentially
      in order to preserve commit order.

      if we are not required do 2-phase commit with the binlog, do not do
      anything here.
    */
    return;
  }

  // Do not sync here; durability is handled later (see rocksdb_commit).
  tx->set_sync(false);

  /* This will note the master position also */
  tx->commit_ordered_res= tx->commit();
  tx->commit_ordered_done= true;

}
4155 
4156 
// Handlerton commit() hook. Commits the whole transaction when commit_tx is
// set (or autocommit is on); otherwise just makes the statement's changes
// permanent within the still-open transaction.
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
{
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(hton != nullptr);
  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(commit_latency_stats != nullptr);

  rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);

  /* note: h->external_lock(F_UNLCK) is called after this function is called) */
  Rdb_transaction *tx = get_tx_from_thd(thd);

  /* this will trigger saving of perf_context information */
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (commit_tx || (!my_core::thd_test_options(
                         thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
      /*
        This will not add anything to commit_latency_stats, and this is correct
        right?
      */
      // Fast path: rocksdb_commit_ordered() already committed; just report
      // the saved result.
      if (tx->commit_ordered_done)
      {
        thd_wakeup_subsequent_commits(thd, 0);
        DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
      }

      /*
        We get here
         - For a COMMIT statement that finishes a multi-statement transaction
         - For a statement that has its own transaction
      */
      if (thd->slave_thread)
      {
        // An attempt to make parallel slave performant (not fully successful,
        // see MDEV-15372):

        //  First, commit without syncing. This establishes the commit order
        tx->set_sync(false);
        bool tx_had_writes = tx->get_write_count()? true : false ;
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
        thd_wakeup_subsequent_commits(thd, 0);

        // Then sync the WAL separately, so waiters were not blocked on it.
        if (tx_had_writes && rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC)
        {
          rocksdb::Status s= rdb->FlushWAL(true);
          if (!s.ok())
            DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
        }
      }
      else
      {
        /* Not a slave thread */
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
      }
    } else {
      /*
        We get here when committing a statement within a transaction.
      */
      tx->make_stmt_savepoint_permanent();
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }

  // `Add()` is implemented in a thread-safe manner.
  commit_latency_stats->Add(timer.ElapsedNanos() / 1000);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
4237 
4238 
4239 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
4240                             bool rollback_tx) {
4241   Rdb_transaction *tx = get_tx_from_thd(thd);
4242   Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4243 
4244   if (tx != nullptr) {
4245     if (rollback_tx) {
4246       /*
4247         We get here, when
4248         - ROLLBACK statement is issued.
4249 
4250         Discard the changes made by the transaction
4251       */
4252       tx->rollback();
4253     } else {
4254       /*
4255         We get here when
4256         - a statement with AUTOCOMMIT=1 is being rolled back (because of some
4257           error)
4258         - a statement inside a transaction is rolled back
4259       */
4260 
4261       tx->rollback_stmt();
4262       tx->set_tx_failed(true);
4263     }
4264 
4265     if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4266       // For READ_COMMITTED, we release any existing snapshot so that we will
4267       // see any changes that occurred since the last statement.
4268       tx->release_snapshot();
4269     }
4270   }
4271   return HA_EXIT_SUCCESS;
4272 }
4273 
4274 static bool print_stats(THD *const thd, std::string const &type,
4275                         std::string const &name, std::string const &status,
4276                         stat_print_fn *stat_print) {
4277   return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
4278                     status.c_str(), status.size());
4279 }
4280 
4281 static std::string format_string(const char *const format, ...) {
4282   std::string res;
4283   va_list args;
4284   va_list args_copy;
4285   char static_buff[256];
4286 
4287   DBUG_ASSERT(format != nullptr);
4288 
4289   va_start(args, format);
4290   va_copy(args_copy, args);
4291 
4292   // Calculate how much space we will need
4293   int len = vsnprintf(nullptr, 0, format, args);
4294   va_end(args);
4295 
4296   if (len < 0) {
4297     res = std::string("<format error>");
4298   } else if (len == 0) {
4299     // Shortcut for an empty string
4300     res = std::string("");
4301   } else {
4302     // For short enough output use a static buffer
4303     char *buff = static_buff;
4304     std::unique_ptr<char[]> dynamic_buff = nullptr;
4305 
4306     len++;  // Add one for null terminator
4307 
4308     // for longer output use an allocated buffer
4309     if (static_cast<uint>(len) > sizeof(static_buff)) {
4310       dynamic_buff.reset(new char[len]);
4311       buff = dynamic_buff.get();
4312     }
4313 
4314     // Now re-do the vsnprintf with the buffer which is now large enough
4315     (void)vsnprintf(buff, len, format, args_copy);
4316 
4317     // Convert to a std::string.  Note we could have created a std::string
4318     // large enough and then converted the buffer to a 'char*' and created
4319     // the output in place.  This would probably work but feels like a hack.
4320     // Since this isn't code that needs to be super-performant we are going
4321     // with this 'safer' method.
4322     res = std::string(buff);
4323   }
4324 
4325   va_end(args_copy);
4326 
4327   return res;
4328 }
4329 
4330 class Rdb_snapshot_status : public Rdb_tx_list_walker {
4331  private:
4332   std::string m_data;
4333 
4334   static std::string current_timestamp(void) {
4335     static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
4336     time_t currtime;
4337     struct tm currtm;
4338 
4339     time(&currtime);
4340 
4341     localtime_r(&currtime, &currtm);
4342 
4343     return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
4344                          currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
4345                          currtm.tm_sec);
4346   }
4347 
4348   static std::string get_header(void) {
4349     return "\n============================================================\n" +
4350            current_timestamp() +
4351            " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4352            "============================================================\n"
4353            "---------\n"
4354            "SNAPSHOTS\n"
4355            "---------\n"
4356            "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
4357   }
4358 
4359   static std::string get_footer(void) {
4360     return "-----------------------------------------\n"
4361            "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4362            "=========================================\n";
4363   }
4364 
  // Build the per-transaction record for one participant of a deadlock
  // cycle, resolving table/index/CF names from the engine's managers.
  static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
      const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
    Rdb_deadlock_info::Rdb_dl_trx_info txn_data;

    txn_data.trx_id = txn.m_txn_id;

    // Index may have been dropped since the deadlock was recorded, so fall
    // back to printing the raw index id.
    txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id);
    if (txn_data.table_name.empty()) {
      txn_data.table_name =
          "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
    }

    auto kd = ddl_manager.safe_find(gl_index_id);
    txn_data.index_name =
        (kd) ? kd->get_name()
             : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);

    // NOTE(review): cfh is dereferenced unconditionally -- presumably
    // get_cf() cannot return nullptr for a CF id recorded in a deadlock;
    // confirm against cf_manager.
    rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id);
    txn_data.cf_name = cfh->GetName();

    txn_data.waiting_key =
        rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());

    txn_data.exclusive_lock = txn.m_exclusive;

    return txn_data;
  }
4392 
  // Convert one recorded deadlock path into an Rdb_deadlock_info, decoding
  // each participant's waiting key prefix into a (cf_id, index_id) pair.
  static Rdb_deadlock_info get_dl_path_trx_info(
      const rocksdb::DeadlockPath &path_entry) {
    Rdb_deadlock_info deadlock_info;

    for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
      const auto &txn = *it;
      // The first 4 bytes of the waiting key are the index id.
      const GL_INDEX_ID gl_index_id = {
          txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
                           txn.m_waiting_key.c_str()))};
      deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
    }
    DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
    /* print the first txn in the path to display the full deadlock cycle */
    // NOTE(review): despite the comment above, this takes the LAST entry of
    // the path (end() - 1) as the victim -- presumably the last entry closes
    // the cycle; confirm against RocksDB's DeadlockPath layout.
    if (!path_entry.path.empty() && !path_entry.limit_exceeded) {
      const auto &deadlocking_txn = *(path_entry.path.end() - 1);
      deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id;
      deadlock_info.deadlock_time = path_entry.deadlock_time;
    }
    return deadlock_info;
  }
4413 
 public:
  // Start the report with the standard header banner.
  Rdb_snapshot_status() : m_data(get_header()) {}

  // Full report accumulated so far, terminated with the footer banner.
  std::string getResult() { return m_data + get_footer(); }
4418 
4419   /* Implement Rdb_transaction interface */
4420   /* Create one row in the snapshot status table */
4421   void process_tran(const Rdb_transaction *const tx) override {
4422     DBUG_ASSERT(tx != nullptr);
4423 
4424     /* Calculate the duration the snapshot has existed */
4425     int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
4426     if (snapshot_timestamp != 0) {
4427       int64_t curr_time;
4428       rdb->GetEnv()->GetCurrentTime(&curr_time);
4429 
4430       char buffer[1024];
4431 #ifdef MARIAROCKS_NOT_YET
4432       thd_security_context(tx->get_thd(), buffer, sizeof buffer, 0);
4433 #endif
4434       m_data += format_string(
4435           "---SNAPSHOT, ACTIVE %lld sec\n"
4436           "%s\n"
4437           "lock count %llu, write count %llu\n"
4438           "insert count %llu, update count %llu, delete count %llu\n",
4439           (longlong)(curr_time - snapshot_timestamp), buffer, tx->get_lock_count(),
4440           tx->get_write_count(), tx->get_insert_count(), tx->get_update_count(),
4441           tx->get_delete_count());
4442     }
4443   }
4444 
4445   void populate_deadlock_buffer() {
4446     auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4447     m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
4448 
4449     for (const auto &path_entry : dlock_buffer) {
4450       std::string path_data;
4451       if (path_entry.limit_exceeded) {
4452         path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n";
4453       } else {
4454         path_data +=
4455             "\n*** DEADLOCK PATH\n"
4456             "=========================================\n";
4457         const auto dl_info = get_dl_path_trx_info(path_entry);
4458         const auto deadlock_time = dl_info.deadlock_time;
4459         for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) {
4460           const auto &trx_info = *it;
4461           path_data += format_string(
4462               "TIMESTAMP: %" PRId64
4463               "\n"
4464               "TRANSACTION ID: %u\n"
4465               "COLUMN FAMILY NAME: %s\n"
4466               "WAITING KEY: %s\n"
4467               "LOCK TYPE: %s\n"
4468               "INDEX NAME: %s\n"
4469               "TABLE NAME: %s\n",
4470               deadlock_time, trx_info.trx_id, trx_info.cf_name.c_str(),
4471               trx_info.waiting_key.c_str(),
4472               trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED",
4473               trx_info.index_name.c_str(), trx_info.table_name.c_str());
4474           if (it != dl_info.path.end() - 1) {
4475             path_data += "---------------WAITING FOR---------------\n";
4476           }
4477         }
4478         path_data += format_string(
4479             "\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n",
4480             dl_info.victim_trx_id);
4481       }
4482       m_data += path_data;
4483     }
4484   }
4485 
4486   std::vector<Rdb_deadlock_info> get_deadlock_info() {
4487     std::vector<Rdb_deadlock_info> deadlock_info;
4488     auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4489     for (const auto &path_entry : dlock_buffer) {
4490       if (!path_entry.limit_exceeded) {
4491         deadlock_info.push_back(get_dl_path_trx_info(path_entry));
4492       }
4493     }
4494     return deadlock_info;
4495   }
4496 };
4497 
4498 /**
4499  * @brief
4500  * walks through all non-replication transactions and copies
4501  * out relevant information for information_schema.rocksdb_trx
4502  */
4503 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
4504  private:
4505   std::vector<Rdb_trx_info> *m_trx_info;
4506 
4507  public:
4508   explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
4509       : m_trx_info(trx_info) {}
4510 
4511   void process_tran(const Rdb_transaction *const tx) override {
4512     static const std::map<int, std::string> state_map = {
4513         {rocksdb::Transaction::STARTED, "STARTED"},
4514         {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
4515         {rocksdb::Transaction::PREPARED, "PREPARED"},
4516         {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
4517         {rocksdb::Transaction::COMMITED, "COMMITED"},
4518         {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
4519         {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
4520     };
4521 
4522     DBUG_ASSERT(tx != nullptr);
4523 
4524     THD *const thd = tx->get_thd();
4525     ulong thread_id = thd_get_thread_id(thd);
4526 
4527     if (tx->is_writebatch_trx()) {
4528       const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
4529       DBUG_ASSERT(wb_impl);
4530       m_trx_info->push_back(
4531           {"",                            /* name */
4532            0,                             /* trx_id */
4533            wb_impl->get_write_count(), 0, /* lock_count */
4534            0,                             /* timeout_sec */
4535            "",                            /* state */
4536            "",                            /* waiting_key */
4537            0,                             /* waiting_cf_id */
4538            1,                             /*is_replication */
4539            1,                             /* skip_trx_api */
4540            wb_impl->is_tx_read_only(), 0, /* deadlock detection */
4541            wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
4542     } else {
4543       const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
4544       DBUG_ASSERT(tx_impl);
4545       const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
4546 
4547       if (rdb_trx == nullptr) {
4548         return;
4549       }
4550 
4551       char query_buf[NAME_LEN+1];
4552       thd_query_safe(thd, query_buf, sizeof(query_buf));
4553       std::string query_str(query_buf);
4554 
4555       const auto state_it = state_map.find(rdb_trx->GetState());
4556       DBUG_ASSERT(state_it != state_map.end());
4557       const int is_replication = (thd->rgi_slave != nullptr);
4558       uint32_t waiting_cf_id;
4559       std::string waiting_key;
4560       rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key),
4561 
4562           m_trx_info->push_back(
4563               {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
4564                tx_impl->get_lock_count(), tx_impl->get_timeout_sec(),
4565                state_it->second, waiting_key, waiting_cf_id, is_replication,
4566                0, /* skip_trx_api */
4567                tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
4568                tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
4569     }
4570   }
4571 };
4572 
4573 /*
4574   returns a vector of info for all non-replication threads
4575   for use by information_schema.rocksdb_trx
4576 */
4577 std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
4578   std::vector<Rdb_trx_info> trx_info;
4579   Rdb_trx_info_aggregator trx_info_agg(&trx_info);
4580   Rdb_transaction::walk_tx_list(&trx_info_agg);
4581   return trx_info;
4582 }
4583 
4584 
4585 /*
4586   returns a vector of info of recent deadlocks
4587   for use by information_schema.rocksdb_deadlock
4588 */
4589 std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() {
4590   Rdb_snapshot_status showStatus;
4591   Rdb_transaction::walk_tx_list(&showStatus);
4592   return showStatus.get_deadlock_info();
4593 }
4594 
4595 #ifdef MARIAROCKS_NOT_YET
4596 /* Generate the snapshot status table */
static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd,
                                         stat_print_fn *const stat_print) {
  // Render one row per live snapshot, then append the latest detected
  // deadlock cycles. Returns the value of print_stats() (true on failure,
  // per handlerton show-status convention).
  Rdb_snapshot_status showStatus;

  Rdb_transaction::walk_tx_list(&showStatus);
  showStatus.populate_deadlock_buffer();

  /* Send the result data back to MySQL */
  return print_stats(thd, "rocksdb", "", showStatus.getResult(), stat_print);
}
4607 #endif
4608 
4609 /*
4610   This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc.
4611 
4612   For now, produce info about live files (which gives an imprecise idea about
4613   what column families are there).
4614 */
4615 static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
4616                                 stat_print_fn *const stat_print,
4617                                 enum ha_stat_type stat_type) {
4618   DBUG_ASSERT(hton != nullptr);
4619   DBUG_ASSERT(thd != nullptr);
4620   DBUG_ASSERT(stat_print != nullptr);
4621 
4622   bool res = false;
4623   char buf[100] = {'\0'};
4624 
4625   if (stat_type == HA_ENGINE_STATUS) {
4626     DBUG_ASSERT(rdb != nullptr);
4627 
4628     std::string str;
4629 
4630     /* Global DB Statistics */
4631     if (rocksdb_stats) {
4632       str = rocksdb_stats->ToString();
4633 
4634       // Use the same format as internal RocksDB statistics entries to make
4635       // sure that output will look unified.
4636       DBUG_ASSERT(commit_latency_stats != nullptr);
4637 
4638       snprintf(buf, sizeof(buf),
4639                "rocksdb.commit_latency statistics "
4640                "Percentiles :=> 50 : %.2f 95 : %.2f "
4641                "99 : %.2f 100 : %.2f\n",
4642                commit_latency_stats->Percentile(50),
4643                commit_latency_stats->Percentile(95),
4644                commit_latency_stats->Percentile(99),
4645                commit_latency_stats->Percentile(100));
4646       str.append(buf);
4647 
4648       uint64_t v = 0;
4649 
4650       // Retrieve additional stalling related numbers from RocksDB and append
4651       // them to the buffer meant for displaying detailed statistics. The intent
4652       // here is to avoid adding another row to the query output because of
4653       // just two numbers.
4654       //
4655       // NB! We're replacing hyphens with underscores in output to better match
4656       // the existing naming convention.
4657       if (rdb->GetIntProperty("rocksdb.is-write-stopped", &v)) {
4658         snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %llu\n", (ulonglong)v);
4659         str.append(buf);
4660       }
4661 
4662       if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) {
4663         snprintf(buf, sizeof(buf),
4664                                    "COUNT : %llu\n",
4665                  (ulonglong)v);
4666         str.append(buf);
4667       }
4668 
4669       res |= print_stats(thd, "STATISTICS", "rocksdb", str, stat_print);
4670     }
4671 
4672     /* Per DB stats */
4673     if (rdb->GetProperty("rocksdb.dbstats", &str)) {
4674       res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
4675     }
4676 
4677     /* Per column family stats */
4678     for (const auto &cf_name : cf_manager.get_cf_names()) {
4679       rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
4680       if (cfh == nullptr) {
4681         continue;
4682       }
4683 
4684       if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str)) {
4685         continue;
4686       }
4687 
4688       res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
4689     }
4690 
4691     /* Memory Statistics */
4692     std::vector<rocksdb::DB *> dbs;
4693     std::unordered_set<const rocksdb::Cache *> cache_set;
4694     size_t internal_cache_count = 0;
4695     size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
4696 
4697     dbs.push_back(rdb);
4698     cache_set.insert(rocksdb_tbl_options->block_cache.get());
4699 
4700     for (const auto &cf_handle : cf_manager.get_all_cf()) {
4701       rocksdb::ColumnFamilyDescriptor cf_desc;
4702       cf_handle->GetDescriptor(&cf_desc);
4703       auto *const table_factory = cf_desc.options.table_factory.get();
4704 
4705       if (table_factory != nullptr) {
4706         std::string tf_name = table_factory->Name();
4707 
4708         if (tf_name.find("BlockBasedTable") != std::string::npos) {
4709           const rocksdb::BlockBasedTableOptions *const bbt_opt =
4710               reinterpret_cast<rocksdb::BlockBasedTableOptions *>(
4711                   table_factory->GetOptions());
4712 
4713           if (bbt_opt != nullptr) {
4714             if (bbt_opt->block_cache.get() != nullptr) {
4715               cache_set.insert(bbt_opt->block_cache.get());
4716             } else {
4717               internal_cache_count++;
4718             }
4719             cache_set.insert(bbt_opt->block_cache_compressed.get());
4720           }
4721         }
4722       }
4723     }
4724 
4725     std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
4726     str.clear();
4727     rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
4728                                                          &temp_usage_by_type);
4729     snprintf(buf, sizeof(buf), "\nMemTable Total: %llu",
4730              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
4731     str.append(buf);
4732     snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %llu",
4733              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
4734     str.append(buf);
4735     snprintf(buf, sizeof(buf), "\nTable Readers Total: %llu",
4736              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
4737     str.append(buf);
4738     snprintf(buf, sizeof(buf), "\nCache Total: %llu",
4739              (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
4740     str.append(buf);
4741     snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %llu",
4742              (ulonglong)internal_cache_count * kDefaultInternalCacheSize);
4743     str.append(buf);
4744     res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print);
4745 
4746     /* Show the background thread status */
4747     std::vector<rocksdb::ThreadStatus> thread_list;
4748     rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list);
4749 
4750     if (!s.ok()) {
4751       // NO_LINT_DEBUG
4752       sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n",
4753                       s.ToString().c_str());
4754       res |= true;
4755     } else {
4756       /* For each background thread retrieved, print out its information */
4757       for (auto &it : thread_list) {
4758         /* Only look at background threads. Ignore user threads, if any. */
4759         if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) {
4760           continue;
4761         }
4762 
4763         str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) +
4764               "\ncf_name: " + it.cf_name +
4765               "\noperation_type: " + it.GetOperationName(it.operation_type) +
4766               "\noperation_stage: " +
4767               it.GetOperationStageName(it.operation_stage) +
4768               "\nelapsed_time_ms: " + it.MicrosToString(it.op_elapsed_micros);
4769 
4770         for (auto &it_props : it.InterpretOperationProperties(
4771                  it.operation_type, it.op_properties)) {
4772           str += "\n" + it_props.first + ": " + std::to_string(it_props.second);
4773         }
4774 
4775         str += "\nstate_type: " + it.GetStateName(it.state_type);
4776 
4777         res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), str,
4778                            stat_print);
4779       }
4780     }
4781 
4782 #ifdef MARIAROCKS_NOT_YET
4783     /* Explicit snapshot information */
4784     str = Rdb_explicit_snapshot::dump_snapshots();
4785 #endif
4786 
4787     if (!str.empty()) {
4788       res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print);
4789     }
4790 #ifdef MARIAROCKS_NOT_YET
4791   } else if (stat_type == HA_ENGINE_TRX) {
4792     /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */
4793     res |= rocksdb_show_snapshot_status(hton, thd, stat_print);
4794 #endif
4795   }
4796   return res;
4797 }
4798 
// Register the transaction with the MySQL transaction coordinator so that
// commit/rollback callbacks are delivered to this engine.
// NOTE: the 'hton' argument is unused; the global rocksdb_hton is passed to
// trans_register_ha() instead.
static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
                                       Rdb_transaction *const tx) {
  DBUG_ASSERT(tx != nullptr);

  // Register for the statement scope (second arg FALSE).
  trans_register_ha(thd, FALSE, rocksdb_hton);
  if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
    // Inside an explicit or multi-statement transaction: open a statement
    // scope in the tx and also register for the whole transaction (TRUE).
    tx->start_stmt();
    trans_register_ha(thd, TRUE, rocksdb_hton);
  }
}
4809 
// Table file extensions reported to the SQL layer: none (NullS terminates
// the list) — RocksDB manages its own files rather than per-table files.
static const char *ha_rocksdb_exts[] = {NullS};
4811 
4812 #ifdef MARIAROCKS_NOT_YET
// Create/attach/release an explicit (shareable) snapshot for this session.
// Returns true on failure, false on success.
// (Currently compiled out: MARIAROCKS_NOT_YET.)
static bool rocksdb_explicit_snapshot(
    handlerton *const /* hton */, /*!< in: RocksDB handlerton */
    THD *const thd,               /*!< in: MySQL thread handle */
    snapshot_info_st *ss_info)    /*!< out: Snapshot information */
{
  switch (ss_info->op) {
    case snapshot_operation::SNAPSHOT_CREATE: {
      // Hold binlog commits while the snapshot is taken so that the binlog
      // position recorded in ss_info is consistent with its contents.
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_lock_commits(ss_info);
      }
      auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot());
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_unlock_commits(ss_info);
      }

      thd->set_explicit_snapshot(s);
      return s == nullptr;  // nullptr means creation failed
    }
    case snapshot_operation::SNAPSHOT_ATTACH: {
      // Attach this session to an already-created snapshot by id.
      auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
      if (!s) {
        return true;
      }
      *ss_info = s->ss_info;
      thd->set_explicit_snapshot(s);
      return false;
    }
    case snapshot_operation::SNAPSHOT_RELEASE: {
      // Detach the session's current explicit snapshot, reporting its info.
      if (!thd->get_explicit_snapshot()) {
        return true;
      }
      *ss_info = thd->get_explicit_snapshot()->ss_info;
      thd->set_explicit_snapshot(nullptr);
      return false;
    }
    default:
      DBUG_ASSERT(false);
      return true;
  }
  return true;  // not reached: every switch case returns above
}
4854 #endif
4855 
4856 /*
4857     Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT
4858 
4859     Features:
4860     1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
4861     2. Getting current binlog position in addition to #1.
4862 
4863     The second feature is done by START TRANSACTION WITH
4864     CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and
4865     it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT.
4866 
4867     - When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT
4868     takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
4869     participate in transaction. When executing COMMIT, both InnoDB and
4870     RocksDB modifications are committed. Remember that XA is not supported yet,
4871     so mixing engines is not recommended anyway.
4872 
4873     - When setting engine, START TRANSACTION WITH CONSISTENT.. takes
4874     snapshot for the specified engine only. But it starts both
4875     InnoDB and RocksDB transactions.
4876 */
static int rocksdb_start_tx_and_assign_read_view(
    handlerton *const hton,    /*!< in: RocksDB handlerton */
    THD *const thd             /*!< in: MySQL thread handle of the
                               user for whom the transaction should
                               be committed */
)
#ifdef MARIAROCKS_NOT_YET
    snapshot_info_st *ss_info) /*!< in/out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  /* Consistent snapshot is only accepted under REPEATABLE READ */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);

  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

#ifdef MARIADB_NOT_YET
  if (ss_info) {
    if (mysql_bin_log_is_open()) {
      mysql_bin_log_lock_commits(ss_info);
    } else {
      return HA_EXIT_FAILURE;
    }
#endif

  /*
    MariaDB: there is no need to call mysql_bin_log_lock_commits and then
    unlock back.
    SQL layer calls start_consistent_snapshot() for all engines, including the
    binlog under LOCK_commit_ordered mutex.

    The mutex prevents binlog commits from happening (right?) while the storage
    engine(s) allocate read snapshots. That way, each storage engine is
    synchronized with current binlog position.
  */
  mysql_mutex_assert_owner(&LOCK_commit_ordered);

  Rdb_transaction *const tx = get_or_create_tx(thd);
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  DBUG_ASSERT(!tx->has_snapshot());
  tx->set_tx_read_only(true);
  rocksdb_register_tx(hton, thd, tx);
  // Acquire the read snapshot eagerly (argument true), not lazily on the
  // first read, so it is pinned while LOCK_commit_ordered is held.
  tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
  if (ss_info) {
    mysql_bin_log_unlock_commits(ss_info);
  }
#endif
  return HA_EXIT_SUCCESS;
}
4931 
4932 #ifdef MARIADB_NOT_YET
/*
  Start a read-only transaction whose read view can be shared through an
  explicit snapshot id. The explicit-snapshot plumbing is compiled out
  (MARIADB_NOT_YET); what remains acquires a plain snapshot.
  Returns HA_EXIT_SUCCESS or HA_EXIT_FAILURE.
*/
static int rocksdb_start_tx_with_shared_read_view(
    handlerton *const hton,    /*!< in: RocksDB handlerton */
    THD *const thd)            /*!< in: MySQL thread handle of the
                               user for whom the transaction should
                               be committed */
#ifdef MARIADB_NOT_YET
    snapshot_info_st *ss_info) /*!< out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  DBUG_ASSERT(thd != nullptr);

  int error = HA_EXIT_SUCCESS;

  /* Shared read view is only accepted under REPEATABLE READ */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);
  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

  Rdb_transaction *tx = nullptr;
#ifdef MARIADB_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> explicit_snapshot;
  const auto op = ss_info->op;

  DBUG_ASSERT(op == snapshot_operation::SNAPSHOT_CREATE ||
              op == snapshot_operation::SNAPSHOT_ATTACH);

  // case: if binlogs are available get binlog file/pos and gtid info
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_lock_commits(ss_info);
  }

  if (op == snapshot_operation::SNAPSHOT_ATTACH) {
    explicit_snapshot = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
    if (!explicit_snapshot) {
      my_printf_error(ER_UNKNOWN_ERROR, "Snapshot %llu does not exist", MYF(0),
                      ss_info->snapshot_id);
      error = HA_EXIT_FAILURE;
    }
  }
#endif

  // case: all good till now
  if (error == HA_EXIT_SUCCESS) {
    tx = get_or_create_tx(thd);
    Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

#ifdef MARIADB_NOT_YET
    if (explicit_snapshot) {
      tx->m_explicit_snapshot = explicit_snapshot;
    }
#endif

    DBUG_ASSERT(!tx->has_snapshot());
    tx->set_tx_read_only(true);
    rocksdb_register_tx(hton, thd, tx);
    // Acquire the snapshot eagerly (argument true).
    tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
    // case: an explicit snapshot was not assigned to this transaction
    if (!tx->m_explicit_snapshot) {
      tx->m_explicit_snapshot =
          Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot);
      if (!tx->m_explicit_snapshot) {
        my_printf_error(ER_UNKNOWN_ERROR, "Could not create snapshot", MYF(0));
        error = HA_EXIT_FAILURE;
      }
    }
#endif
  }

#ifdef MARIADB_NOT_YET
  // case: unlock the binlog
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_unlock_commits(ss_info);
  }

  DBUG_ASSERT(error == HA_EXIT_FAILURE || tx->m_explicit_snapshot);

  // copy over the snapshot details to pass to the upper layers
  if (tx->m_explicit_snapshot) {
    *ss_info = tx->m_explicit_snapshot->ss_info;
    ss_info->op = op;
  }
#endif

  return error;
}
5022 #endif
5023 
5024 /* Dummy SAVEPOINT support. This is needed for long running transactions
5025  * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
5026  * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
5027  * errors. This needs to be addressed in future versions (Issue#96).
5028  */
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
                             void *const savepoint) {
  // Intentionally a no-op — see the "Dummy SAVEPOINT support" comment above.
  return HA_EXIT_SUCCESS;
}
5033 
5034 static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
5035                                          void *const savepoint) {
5036   Rdb_transaction *tx = get_tx_from_thd(thd);
5037   return tx->rollback_to_savepoint(savepoint);
5038 }
5039 
static bool rocksdb_rollback_to_savepoint_can_release_mdl(
    handlerton *const /* hton */, THD *const /* thd */) {
  // Always tell the server it may release metadata locks after a
  // rollback-to-savepoint (savepoints are dummies here anyway).
  return true;
}
5044 
5045 #ifdef MARIAROCKS_NOT_YET
5046 /*
5047   This is called for INFORMATION_SCHEMA
5048 */
// Push per-table I/O and lock statistics to the server via 'cb', one call
// per open RocksDB table. (Currently compiled out: MARIAROCKS_NOT_YET.)
static void rocksdb_update_table_stats(
    /* per-table stats callback */
    void (*cb)(const char *db, const char *tbl, bool is_partition,
               my_io_perf_t *r, my_io_perf_t *w, my_io_perf_t *r_blob,
               my_io_perf_t *r_primary, my_io_perf_t *r_secondary,
               page_stats_t *page_stats, comp_stats_t *comp_stats,
               int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock,
               const char *engine)) {
  my_io_perf_t io_perf_read;
  my_io_perf_t io_perf_write;
  my_io_perf_t io_perf;
  page_stats_t page_stats;
  comp_stats_t comp_stats;
  uint lock_wait_timeout_stats;
  uint deadlock_stats;
  uint lock_wait_stats;
  std::vector<std::string> tablenames;

  /*
    Most of these are for innodb, so setting them to 0.
    TODO: possibly separate out primary vs. secondary index reads
   */
  memset(&io_perf, 0, sizeof(io_perf));
  memset(&page_stats, 0, sizeof(page_stats));
  memset(&comp_stats, 0, sizeof(comp_stats));
  memset(&io_perf_write, 0, sizeof(io_perf_write));
  // NOTE(review): io_perf_read is NOT zeroed here; only the fields assigned
  // in the loop below are defined when it reaches the callback — confirm the
  // callback reads no other my_io_perf_t fields.

  tablenames = rdb_open_tables.get_table_names();

  for (const auto &it : tablenames) {
    Rdb_table_handler *table_handler;
    std::string str, dbname, tablename, partname;
    char dbname_sys[NAME_LEN + 1];
    char tablename_sys[NAME_LEN + 1];
    bool is_partition;

    if (rdb_normalize_tablename(it, &str) != HA_EXIT_SUCCESS) {
      /* Function needs to return void because of the interface and we've
       * detected an error which shouldn't happen. There's no way to let
       * caller know that something failed.
       */
      SHIP_ASSERT(false);
      return;
    }

    if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname)) {
      continue;
    }

    is_partition = (partname.size() != 0);

    table_handler = rdb_open_tables.get_table_handler(it.c_str());
    if (table_handler == nullptr) {
      continue;
    }

    io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load();
    io_perf_read.requests = table_handler->m_io_perf_read.requests.load();
    io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load();
    io_perf_write.requests = table_handler->m_io_perf_write.requests.load();
    lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load();
    deadlock_stats = table_handler->m_deadlock_counter.load();
    lock_wait_stats =
        table_handler->m_table_perf_context.m_value[PC_KEY_LOCK_WAIT_COUNT]
            .load();

    /*
      Convert from rocksdb timer to mysql timer. RocksDB values are
      in nanoseconds, but table statistics expect the value to be
      in my_timer format.
     */
    io_perf_read.svc_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time.load() / 1000);
    io_perf_read.svc_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time_max.load() / 1000);
    io_perf_read.wait_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time.load() / 1000);
    io_perf_read.wait_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time_max.load() / 1000);
    io_perf_read.slow_ios = table_handler->m_io_perf_read.slow_ios.load();
    // Release the handler reference taken by get_table_handler() above.
    rdb_open_tables.release_table_handler(table_handler);

    /*
      Table stats expects our database and table name to be in system encoding,
      not filename format. Convert before calling callback.
     */
    my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
                                   sizeof(dbname_sys));
    my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
                                   sizeof(tablename_sys));
    (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read,
          &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats,
          &comp_stats, lock_wait_stats, lock_wait_timeout_stats, deadlock_stats,
          rocksdb_hton_name);
  }
}
5145 #endif
5146 static rocksdb::Status check_rocksdb_options_compatibility(
5147     const char *const dbpath, const rocksdb::Options &main_opts,
5148     const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
5149   DBUG_ASSERT(rocksdb_datadir != nullptr);
5150 
5151   rocksdb::DBOptions loaded_db_opt;
5152   std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
5153   rocksdb::Status status =
5154       LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt,
5155                         &loaded_cf_descs, rocksdb_ignore_unknown_options);
5156 
5157   // If we're starting from scratch and there are no options saved yet then this
5158   // is a valid case. Therefore we can't compare the current set of options to
5159   // anything.
5160   if (status.IsNotFound()) {
5161     return rocksdb::Status::OK();
5162   }
5163 
5164   if (!status.ok()) {
5165     return status;
5166   }
5167 
5168   if (loaded_cf_descs.size() != cf_descr.size()) {
5169     return rocksdb::Status::NotSupported(
5170         "Mismatched size of column family "
5171         "descriptors.");
5172   }
5173 
5174   // Please see RocksDB documentation for more context about why we need to set
5175   // user-defined functions and pointer-typed options manually.
5176   for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
5177     loaded_cf_descs[i].options.compaction_filter =
5178         cf_descr[i].options.compaction_filter;
5179     loaded_cf_descs[i].options.compaction_filter_factory =
5180         cf_descr[i].options.compaction_filter_factory;
5181     loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
5182     loaded_cf_descs[i].options.memtable_factory =
5183         cf_descr[i].options.memtable_factory;
5184     loaded_cf_descs[i].options.merge_operator =
5185         cf_descr[i].options.merge_operator;
5186     loaded_cf_descs[i].options.prefix_extractor =
5187         cf_descr[i].options.prefix_extractor;
5188     loaded_cf_descs[i].options.table_factory =
5189         cf_descr[i].options.table_factory;
5190   }
5191 
5192   // This is the essence of the function - determine if it's safe to open the
5193   // database or not.
5194   status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
5195                                      loaded_cf_descs,
5196                                      rocksdb_ignore_unknown_options);
5197 
5198   return status;
5199 }
5200 
// Set once the plugin has been unloaded; rocksdb_init_func() checks it and
// refuses a subsequent load (re-loading MyRocks requires a server restart).
bool prevent_myrocks_loading= false;
5202 
5203 
5204 /*
5205   Storage Engine initialization function, invoked when plugin is loaded.
5206 */
5207 
5208 static int rocksdb_init_func(void *const p) {
5209 
5210   DBUG_ENTER_FUNC();
5211 
5212   if (prevent_myrocks_loading)
5213   {
5214     my_error(ER_INTERNAL_ERROR, MYF(0),
5215              "Loading MyRocks plugin after it has been unloaded is not "
5216              "supported. Please restart mysqld");
5217     DBUG_RETURN(1);
5218   }
5219 
5220   if (rocksdb_ignore_datadic_errors)
5221   {
5222     sql_print_information(
5223         "CAUTION: Running with rocksdb_ignore_datadic_errors=1. "
5224         " This should only be used to perform repairs");
5225   }
5226 
5227   if (rdb_check_rocksdb_corruption()) {
5228     // NO_LINT_DEBUG
5229     sql_print_error(
5230         "RocksDB: There was a corruption detected in RockDB files. "
5231         "Check error log emitted earlier for more details.");
5232     if (rocksdb_allow_to_start_after_corruption) {
5233       // NO_LINT_DEBUG
5234       sql_print_information(
5235           "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent "
5236           "server operating if RocksDB corruption is detected.");
5237     } else {
5238       // NO_LINT_DEBUG
5239       sql_print_error(
5240           "RocksDB: The server will exit normally and stop restart "
5241           "attempts. Remove %s file from data directory and "
5242           "start mysqld manually.",
5243           rdb_corruption_marker_file_name().c_str());
5244       exit(0);
5245     }
5246   }
5247 
5248   // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
5249   static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");
5250 
5251   init_rocksdb_psi_keys();
5252 
5253   rocksdb_hton = (handlerton *)p;
5254 
5255   rdb_open_tables.init();
5256   Ensure_cleanup rdb_open_tables_cleanup([]() { rdb_open_tables.free(); });
5257 
5258 #ifdef HAVE_PSI_INTERFACE
5259   rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
5260   rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
5261                            rdb_signal_drop_idx_psi_cond_key);
5262   rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key);
5263 #else
5264   rdb_bg_thread.init();
5265   rdb_drop_idx_thread.init();
5266   rdb_mc_thread.init();
5267 #endif
5268   mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
5269                    MY_MUTEX_INIT_FAST);
5270   mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
5271                    MY_MUTEX_INIT_FAST);
5272 
5273   const char* initial_rocksdb_datadir_for_ignore_dirs= rocksdb_datadir;
5274   if (!strncmp(rocksdb_datadir, "./", 2))
5275     initial_rocksdb_datadir_for_ignore_dirs += 2;
5276   ignore_db_dirs_append(initial_rocksdb_datadir_for_ignore_dirs);
5277 
5278 #if defined(HAVE_PSI_INTERFACE)
5279   rdb_collation_exceptions =
5280       new Regex_list_handler(key_rwlock_collation_exception_list);
5281 #else
5282   rdb_collation_exceptions = new Regex_list_handler();
5283 #endif
5284 
5285   mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
5286                    MY_MUTEX_INIT_FAST);
5287   mysql_mutex_init(rdb_block_cache_resize_mutex_key,
5288                    &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST);
5289   Rdb_transaction::init_mutex();
5290 
5291   rocksdb_hton->state = SHOW_OPTION_YES;
5292   rocksdb_hton->create = rocksdb_create_handler;
5293   rocksdb_hton->close_connection = rocksdb_close_connection;
5294 
5295   rocksdb_hton->prepare = rocksdb_prepare;
5296   rocksdb_hton->prepare_ordered = NULL; // Do not need it
5297 
5298   rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
5299   rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
5300   rocksdb_hton->recover = rocksdb_recover;
5301 
5302   rocksdb_hton->commit_ordered= rocksdb_commit_ordered;
5303   rocksdb_hton->commit = rocksdb_commit;
5304 
5305   rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request;
5306 
5307   rocksdb_hton->rollback = rocksdb_rollback;
5308   rocksdb_hton->show_status = rocksdb_show_status;
5309 #ifdef MARIADB_NOT_YET
5310   rocksdb_hton->explicit_snapshot = rocksdb_explicit_snapshot;
5311 #endif
5312   rocksdb_hton->start_consistent_snapshot =
5313       rocksdb_start_tx_and_assign_read_view;
5314 #ifdef MARIADB_NOT_YET
5315   rocksdb_hton->start_shared_snapshot = rocksdb_start_tx_with_shared_read_view;
5316 #endif
5317   rocksdb_hton->savepoint_set = rocksdb_savepoint;
5318   rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
5319   rocksdb_hton->savepoint_rollback_can_release_mdl =
5320       rocksdb_rollback_to_savepoint_can_release_mdl;
5321 #ifdef MARIAROCKS_NOT_YET
5322   rocksdb_hton->update_table_stats = rocksdb_update_table_stats;
5323 #endif // MARIAROCKS_NOT_YET
5324 
5325   /*
5326   Not needed in MariaDB:
5327   rocksdb_hton->flush_logs = rocksdb_flush_wal;
5328   rocksdb_hton->handle_single_table_select = rocksdb_handle_single_table_select;
5329 
5330   */
5331 
5332   rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
5333                         HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;
5334 
5335   rocksdb_hton->tablefile_extensions= ha_rocksdb_exts;
5336   DBUG_ASSERT(!mysqld_embedded);
5337 
5338   if (rocksdb_db_options->max_open_files > (long)open_files_limit) {
5339     // NO_LINT_DEBUG
5340     sql_print_information(
5341         "RocksDB: rocksdb_max_open_files should not be "
5342         "greater than the open_files_limit, effective value "
5343         "of rocksdb_max_open_files is being set to "
5344         "open_files_limit / 2.");
5345     rocksdb_db_options->max_open_files = open_files_limit / 2;
5346   } else if (rocksdb_db_options->max_open_files == -2) {
5347     rocksdb_db_options->max_open_files = open_files_limit / 2;
5348   }
5349 
5350 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
5351   rdb_read_free_regex_handler.set_patterns(DEFAULT_READ_FREE_RPL_TABLES);
5352 #endif
5353 
5354   rocksdb_stats = rocksdb::CreateDBStatistics();
5355   rocksdb_stats->set_stats_level(
5356       static_cast<rocksdb::StatsLevel>(rocksdb_stats_level));
5357   rocksdb_stats_level = rocksdb_stats->get_stats_level();
5358   rocksdb_db_options->statistics = rocksdb_stats;
5359 
5360   if (rocksdb_rate_limiter_bytes_per_sec != 0) {
5361     rocksdb_rate_limiter.reset(
5362         rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
5363     rocksdb_db_options->rate_limiter = rocksdb_rate_limiter;
5364   }
5365 
5366   rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate;
5367 
5368   std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
5369   rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
5370       rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log);
5371   if (s.ok()) {
5372     myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log);
5373   }
5374 
5375   rocksdb_db_options->info_log = myrocks_logger;
5376   myrocks_logger->SetInfoLogLevel(
5377       static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
5378   rocksdb_db_options->wal_dir = rocksdb_wal_dir;
5379 
5380   rocksdb_db_options->wal_recovery_mode =
5381       static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);
5382 
5383   rocksdb_db_options->access_hint_on_compaction_start =
5384       static_cast<rocksdb::Options::AccessHint>(
5385           rocksdb_access_hint_on_compaction_start);
5386 
5387   if (rocksdb_db_options->allow_mmap_reads &&
5388       rocksdb_db_options->use_direct_reads) {
5389     // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
5390     // mmap_reads and direct_reads are both on.   (NO_LINT_DEBUG)
5391     sql_print_error(
5392         "RocksDB: Can't enable both use_direct_reads "
5393         "and allow_mmap_reads\n");
5394     DBUG_RETURN(HA_EXIT_FAILURE);
5395   }
5396 
5397   // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT
5398   if (rocksdb_db_options->use_direct_reads ||
5399       rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5400     rocksdb::EnvOptions soptions;
5401     rocksdb::Status check_status;
5402     rocksdb::Env *const env = rocksdb_db_options->env;
5403 
5404     std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir);
5405     if (env->FileExists(fname).ok()) {
5406       std::unique_ptr<rocksdb::SequentialFile> file;
5407       soptions.use_direct_reads = true;
5408       check_status = env->NewSequentialFile(fname, &file, soptions);
5409     } else {
5410       std::unique_ptr<rocksdb::WritableFile> file;
5411       soptions.use_direct_writes = true;
5412       check_status = env->ReopenWritableFile(fname, &file, soptions);
5413       if (file != nullptr) {
5414         file->Close();
5415       }
5416       env->DeleteFile(fname);
5417     }
5418 
5419     if (!check_status.ok()) {
5420       // NO_LINT_DEBUG
5421       sql_print_error(
5422           "RocksDB: Unable to use direct io in rocksdb-datadir:"
5423           "(%s)",
5424           check_status.getState());
5425       DBUG_RETURN(HA_EXIT_FAILURE);
5426     }
5427   }
5428 
5429   if (rocksdb_db_options->allow_mmap_writes &&
5430       rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5431     // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
5432     sql_print_error(
5433         "RocksDB: Can't enable both "
5434         "use_direct_io_for_flush_and_compaction and "
5435         "allow_mmap_writes\n");
5436     DBUG_RETURN(HA_EXIT_FAILURE);
5437   }
5438 
5439   if (rocksdb_db_options->allow_mmap_writes &&
5440       rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
5441     // NO_LINT_DEBUG
5442     sql_print_error(
5443         "RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 "
5444         "to use allow_mmap_writes");
5445     DBUG_RETURN(HA_EXIT_FAILURE);
5446   }
5447 
5448   // sst_file_manager will move deleted rocksdb sst files to trash_dir
5449   // to be deleted in a background thread.
5450   std::string trash_dir = std::string(rocksdb_datadir) + "/trash";
5451   rocksdb_db_options->sst_file_manager.reset(NewSstFileManager(
5452       rocksdb_db_options->env, myrocks_logger, trash_dir,
5453       rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */));
5454 
5455   std::vector<std::string> cf_names;
5456   rocksdb::Status status;
5457   status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir,
5458                                            &cf_names);
5459   if (!status.ok()) {
5460     /*
5461       When we start on an empty datadir, ListColumnFamilies returns IOError,
5462       and RocksDB doesn't provide any way to check what kind of error it was.
5463       Checking system errno happens to work right now.
5464     */
5465     if (status.IsIOError()
5466 #ifndef _WIN32
5467       && errno == ENOENT
5468 #endif
5469       ) {
5470       sql_print_information("RocksDB: Got ENOENT when listing column families");
5471 
5472       // NO_LINT_DEBUG
5473       sql_print_information(
5474           "RocksDB:   assuming that we're creating a new database");
5475     } else {
5476       rdb_log_status_error(status, "Error listing column families");
5477       DBUG_RETURN(HA_EXIT_FAILURE);
5478     }
5479   } else {
5480     // NO_LINT_DEBUG
5481     sql_print_information("RocksDB: %ld column families found",
5482                           cf_names.size());
5483   }
5484 
5485   std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
5486   std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;
5487 
5488   rocksdb_tbl_options->index_type =
5489       (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;
5490 
5491   if (!rocksdb_tbl_options->no_block_cache) {
5492     std::shared_ptr<rocksdb::MemoryAllocator> memory_allocator;
5493     if (!rocksdb_cache_dump) {
5494       size_t block_size = rocksdb_tbl_options->block_size;
5495       rocksdb::JemallocAllocatorOptions alloc_opt;
5496       // Limit jemalloc tcache memory usage. The range
5497       // [block_size/4, block_size] should be enough to cover most of
5498       // block cache allocation sizes.
5499       alloc_opt.limit_tcache_size = true;
5500       alloc_opt.tcache_size_lower_bound = block_size / 4;
5501       alloc_opt.tcache_size_upper_bound = block_size;
5502       rocksdb::Status new_alloc_status =
5503           rocksdb::NewJemallocNodumpAllocator(alloc_opt, &memory_allocator);
5504       if (!new_alloc_status.ok()) {
5505         // Fallback to use default malloc/free.
5506         rdb_log_status_error(new_alloc_status,
5507                              "Error excluding block cache from core dump");
5508         memory_allocator = nullptr;
5509         DBUG_RETURN(HA_EXIT_FAILURE);
5510       }
5511     }
5512     std::shared_ptr<rocksdb::Cache> block_cache =
5513         rocksdb_use_clock_cache
5514             ? rocksdb::NewClockCache(rocksdb_block_cache_size)
5515             : rocksdb::NewLRUCache(
5516                   rocksdb_block_cache_size, -1 /*num_shard_bits*/,
5517                   false /*strict_capcity_limit*/,
5518                   rocksdb_cache_high_pri_pool_ratio, memory_allocator);
5519     if (rocksdb_sim_cache_size > 0) {
5520       // Simulated cache enabled
5521       // Wrap block cache inside a simulated cache and pass it to RocksDB
5522       rocksdb_tbl_options->block_cache =
5523           rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6);
5524     } else {
5525       // Pass block cache to RocksDB
5526       rocksdb_tbl_options->block_cache = block_cache;
5527     }
5528   }
5529   // Using newer BlockBasedTable format version for better compression
5530   // and better memory allocation.
5531   // See:
5532   // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
5533   rocksdb_tbl_options->format_version = 2;
5534 
5535   if (rocksdb_collect_sst_properties) {
5536     properties_collector_factory =
5537         std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);
5538 
5539     rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);
5540 
5541     RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
5542 
5543     DBUG_ASSERT(rocksdb_table_stats_sampling_pct <=
5544                 RDB_TBL_STATS_SAMPLE_PCT_MAX);
5545     properties_collector_factory->SetTableStatsSamplingPct(
5546         rocksdb_table_stats_sampling_pct);
5547 
5548     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
5549   }
5550 
5551   if (rocksdb_persistent_cache_size_mb > 0) {
5552     std::shared_ptr<rocksdb::PersistentCache> pcache;
5553     uint64_t cache_size_bytes = rocksdb_persistent_cache_size_mb * 1024 * 1024;
5554     status = rocksdb::NewPersistentCache(
5555         rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
5556         cache_size_bytes, myrocks_logger, true, &pcache);
5557     if (!status.ok()) {
5558       // NO_LINT_DEBUG
5559       sql_print_error("RocksDB: Persistent cache returned error: (%s)",
5560                       status.getState());
5561       DBUG_RETURN(HA_EXIT_FAILURE);
5562     }
5563     rocksdb_tbl_options->persistent_cache = pcache;
5564   } else if (strlen(rocksdb_persistent_cache_path)) {
5565     // NO_LINT_DEBUG
5566     sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb");
5567     DBUG_RETURN(HA_EXIT_FAILURE);
5568   }
5569 
5570   std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options());
5571   if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory,
5572                             rocksdb_default_cf_options,
5573                             rocksdb_override_cf_options)) {
5574     // NO_LINT_DEBUG
5575     sql_print_error("RocksDB: Failed to initialize CF options map.");
5576     DBUG_RETURN(HA_EXIT_FAILURE);
5577   }
5578 
5579   /*
5580     If there are no column families, we're creating the new database.
5581     Create one column family named "default".
5582   */
5583   if (cf_names.size() == 0) cf_names.push_back(DEFAULT_CF_NAME);
5584 
5585   std::vector<int> compaction_enabled_cf_indices;
5586 
5587   // NO_LINT_DEBUG
5588   sql_print_information("RocksDB: Column Families at start:");
5589   for (size_t i = 0; i < cf_names.size(); ++i) {
5590     rocksdb::ColumnFamilyOptions opts;
5591     cf_options_map->get_cf_options(cf_names[i], &opts);
5592 
5593     // NO_LINT_DEBUG
5594     sql_print_information("  cf=%s", cf_names[i].c_str());
5595 
5596     // NO_LINT_DEBUG
5597     sql_print_information("    write_buffer_size=%ld", opts.write_buffer_size);
5598 
5599     // NO_LINT_DEBUG
5600     sql_print_information("    target_file_size_base=%" PRIu64,
5601                           opts.target_file_size_base);
5602 
5603     /*
5604       Temporarily disable compactions to prevent a race condition where
5605       compaction starts before compaction filter is ready.
5606     */
5607     if (!opts.disable_auto_compactions) {
5608       compaction_enabled_cf_indices.push_back(i);
5609       opts.disable_auto_compactions = true;
5610     }
5611     cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
5612   }
5613 
5614   rocksdb::Options main_opts(*rocksdb_db_options,
5615                              cf_options_map->get_defaults());
5616 
5617   rocksdb::TransactionDBOptions tx_db_options;
5618   tx_db_options.transaction_lock_timeout = 2000;  // 2 seconds
5619   tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
5620   tx_db_options.write_policy =
5621       static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
5622 
5623   status =
5624       check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
5625 
5626   // We won't start if we'll determine that there's a chance of data corruption
5627   // because of incompatible options.
5628   if (!status.ok()) {
5629     rdb_log_status_error(
5630         status, "Compatibility check against existing database options failed");
5631     DBUG_RETURN(HA_EXIT_FAILURE);
5632   }
5633 
5634   status = rocksdb::TransactionDB::Open(
5635       main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);
5636 
5637   if (!status.ok()) {
5638     rdb_log_status_error(status, "Error opening instance");
5639     DBUG_RETURN(HA_EXIT_FAILURE);
5640   }
5641   cf_manager.init(std::move(cf_options_map), &cf_handles);
5642 
5643   if (dict_manager.init(rdb, &cf_manager)) {
5644     // NO_LINT_DEBUG
5645     sql_print_error("RocksDB: Failed to initialize data dictionary.");
5646     DBUG_RETURN(HA_EXIT_FAILURE);
5647   }
5648 
5649   if (binlog_manager.init(&dict_manager)) {
5650     // NO_LINT_DEBUG
5651     sql_print_error("RocksDB: Failed to initialize binlog manager.");
5652     DBUG_RETURN(HA_EXIT_FAILURE);
5653   }
5654 
5655   if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) {
5656     // NO_LINT_DEBUG
5657     sql_print_error("RocksDB: Failed to initialize DDL manager.");
5658 
5659     if (rocksdb_ignore_datadic_errors)
5660     {
5661       sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
5662                       "trying to continue");
5663     }
5664     else
5665       DBUG_RETURN(HA_EXIT_FAILURE);
5666   }
5667 
5668   Rdb_sst_info::init(rdb);
5669 
5670   /*
5671     Enable auto compaction, things needed for compaction filter are finished
5672     initializing
5673   */
5674   std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
5675   compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
5676   for (const auto &index : compaction_enabled_cf_indices) {
5677     compaction_enabled_cf_handles.push_back(cf_handles[index]);
5678   }
5679 
5680   status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);
5681 
5682   if (!status.ok()) {
5683     rdb_log_status_error(status, "Error enabling compaction");
5684     DBUG_RETURN(HA_EXIT_FAILURE);
5685   }
5686 
5687 #ifndef HAVE_PSI_INTERFACE
5688   auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME);
5689 #else
5690   auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME,
5691                                          rdb_background_psi_thread_key);
5692 #endif
5693   if (err != 0) {
5694     // NO_LINT_DEBUG
5695     sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
5696                     err);
5697     DBUG_RETURN(HA_EXIT_FAILURE);
5698   }
5699 
5700 #ifndef HAVE_PSI_INTERFACE
5701   err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME);
5702 #else
5703   err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME,
5704                                           rdb_drop_idx_psi_thread_key);
5705 #endif
5706   if (err != 0) {
5707     // NO_LINT_DEBUG
5708     sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
5709                     err);
5710     DBUG_RETURN(HA_EXIT_FAILURE);
5711   }
5712 
5713   err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME
5714 #ifdef HAVE_PSI_INTERFACE
5715                                     ,
5716                                     rdb_mc_psi_thread_key
5717 #endif
5718   );
5719   if (err != 0) {
5720     // NO_LINT_DEBUG
5721     sql_print_error(
5722         "RocksDB: Couldn't start the manual compaction thread: (errno=%d)",
5723         err);
5724     DBUG_RETURN(HA_EXIT_FAILURE);
5725   }
5726 
5727   rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);
5728 
5729   if (rocksdb_pause_background_work) {
5730     rdb->PauseBackgroundWork();
5731   }
5732 
5733   // NO_LINT_DEBUG
5734   sql_print_information("RocksDB: global statistics using %s indexer",
5735                         STRINGIFY_ARG(RDB_INDEXER));
5736 #if defined(HAVE_SCHED_GETCPU)
5737   if (sched_getcpu() == -1) {
5738     // NO_LINT_DEBUG
5739     sql_print_information(
5740         "RocksDB: sched_getcpu() failed - "
5741         "global statistics will use thread_id_indexer_t instead");
5742   }
5743 #endif
5744 
5745   err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST,
5746                           HA_ERR_ROCKSDB_LAST);
5747   if (err != 0) {
5748     // NO_LINT_DEBUG
5749     sql_print_error("RocksDB: Couldn't initialize error messages");
5750     DBUG_RETURN(HA_EXIT_FAILURE);
5751   }
5752 
5753 
5754 
5755   // Creating an instance of HistogramImpl should only happen after RocksDB
5756   // has been successfully initialized.
5757   commit_latency_stats = new rocksdb::HistogramImpl();
5758 
5759   // Construct a list of directories which will be monitored by I/O watchdog
5760   // to make sure that we won't lose write access to them.
5761   std::vector<std::string> directories;
5762 
5763   // 1. Data directory.
5764   directories.push_back(mysql_real_data_home);
5765 
5766   // 2. Transaction logs.
5767   if (myrocks::rocksdb_wal_dir && *myrocks::rocksdb_wal_dir) {
5768     directories.push_back(myrocks::rocksdb_wal_dir);
5769   }
5770 
5771 #if !defined(_WIN32) && !defined(__APPLE__)
5772   io_watchdog = new Rdb_io_watchdog(std::move(directories));
5773   io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
5774 #endif
5775 
5776   // NO_LINT_DEBUG
5777   sql_print_information(
5778       "MyRocks storage engine plugin has been successfully "
5779       "initialized.");
5780 
5781   // Skip cleaning up rdb_open_tables as we've succeeded
5782   rdb_open_tables_cleanup.skip();
5783 
5784   DBUG_RETURN(HA_EXIT_SUCCESS);
5785 }
5786 
5787 /*
5788   Storage Engine deinitialization function, invoked when plugin is unloaded.
5789 */
5790 
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  // Returned to the plugin framework: 0 on clean shutdown, non-zero if
  // tables are still open when we are unloaded.
  int error = 0;

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
  }

  // signal the manual compaction thread to stop
  rdb_mc_thread.signal(true);
  // Wait for the manual compaction thread to finish.
  err = rdb_mc_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err);
  }

  if (rdb_open_tables.count()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind.
    error = 1;
  }

  // Release the open-tables map (even in the error case above, since the
  // plugin memory is going away).
  rdb_open_tables.free();
  /*
    destructors for static objects can be called at _exit(),
    but we want to free the memory at dlclose()
  */
  // MARIADB_MERGE_2019:  rdb_open_tables.m_hash.~Rdb_table_set();
  mysql_mutex_destroy(&rdb_sysvars_mutex);
  mysql_mutex_destroy(&rdb_block_cache_resize_mutex);


  delete rdb_collation_exceptions;

  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  // Free per-collation lookup data allocated lazily during runtime.
  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  // Tear down the managers in reverse dependency order before closing the
  // underlying database handle.
  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

#if !defined(_WIN32) && !defined(__APPLE__)
  delete io_watchdog;
  io_watchdog = nullptr;
#endif

// Disown the cache data since we're shutting down.
// This results in memory leaks but it improved the shutdown time.
// Don't disown when running under valgrind
#ifndef HAVE_valgrind
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif /* HAVE_valgrind */

  /*
    MariaDB: don't clear rocksdb_db_options and rocksdb_tbl_options.
    MyRocks' plugin variables refer to them.

    The plugin cannot be loaded again (see prevent_myrocks_loading) but plugin
    variables are processed before myrocks::rocksdb_init_func is invoked, so
    they must point to valid memory.
  */
  //rocksdb_db_options = nullptr;
  rocksdb_db_options->statistics = nullptr;
  //rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_free(rocksdb_update_cf_options);
  rocksdb_update_cf_options = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  /*
    Prevent loading the plugin after it has been loaded and then unloaded. This
    doesn't work currently.
  */
  prevent_myrocks_loading= true;

  DBUG_RETURN(error);
}
5918 
5919 static inline void rocksdb_smart_seek(bool seek_backward,
5920                                       rocksdb::Iterator *const iter,
5921                                       const rocksdb::Slice &key_slice) {
5922   if (seek_backward) {
5923     iter->SeekForPrev(key_slice);
5924   } else {
5925     iter->Seek(key_slice);
5926   }
5927 }
5928 
5929 static inline void rocksdb_smart_next(bool seek_backward,
5930                                       rocksdb::Iterator *const iter) {
5931   if (seek_backward) {
5932     iter->Prev();
5933   } else {
5934     iter->Next();
5935   }
5936 }
5937 
5938 #ifndef DBUG_OFF
5939 // simulate that RocksDB has reported corrupted data
5940 static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
5941   *status = rocksdb::Status::Corruption();
5942 }
5943 #endif
5944 
5945 // If the iterator is not valid it might be because of EOF but might be due
5946 // to IOError or corruption. The good practice is always check it.
5947 // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
5948 static inline bool is_valid(rocksdb::Iterator *scan_it) {
5949   if (scan_it->Valid()) {
5950     return true;
5951   } else {
5952     rocksdb::Status s = scan_it->status();
5953     DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
5954                     dbug_change_status_to_corrupted(&s););
5955     if (s.IsIOError() || s.IsCorruption()) {
5956       if (s.IsCorruption()) {
5957         rdb_persist_corruption_marker();
5958       }
5959       rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
5960     }
5961     return false;
5962   }
5963 }
5964 
5965 /**
5966   @brief
5967   Example of simple lock controls. The "table_handler" it creates is a
5968   structure we will pass to each ha_rocksdb handler. Do you have to have
5969   one of these? Well, you have pieces that are used for locking, and
5970   they are needed to function.
5971 */
5972 
Rdb_table_handler *Rdb_open_tables_map::get_table_handler(
    const char *const table_name) {
  DBUG_ASSERT(table_name != nullptr);

  Rdb_table_handler *table_handler;

  std::string table_name_str(table_name);

  // First, look up the table in the hash map.
  RDB_MUTEX_LOCK_CHECK(m_mutex);
  const auto it = m_table_map.find(table_name_str);
  if (it != m_table_map.end()) {
    // Found it
    table_handler = it->second;
  } else {
    char *tmp_name;

    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    // my_multi_malloc carves the handler struct and the name buffer out of a
    // single zero-filled allocation: it writes the handler pointer into
    // table_handler and the name-buffer pointer into tmp_name.  Freeing the
    // handler therefore frees the name as well.
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
              MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
              &tmp_name, table_name_str.length() + 1, NullS)))) {
      // Allocating a new Rdb_table_handler and a new table name failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      return nullptr;
    }

    table_handler->m_ref_count = 0;
    table_handler->m_table_name_length = table_name_str.length();
    // Copy the name into the co-allocated buffer (sized above to fit it
    // plus the terminating NUL).
    table_handler->m_table_name = tmp_name;
    strmov(table_handler->m_table_name, table_name);

    m_table_map.emplace(table_name_str, table_handler);

    thr_lock_init(&table_handler->m_thr_lock);
#ifdef MARIAROCKS_NOT_YET
    table_handler->m_io_perf_read.init();
    table_handler->m_io_perf_write.init();
#endif
  }
  DBUG_ASSERT(table_handler->m_ref_count >= 0);
  // Bump the reference count while still holding m_mutex, so a concurrent
  // release cannot free the handler between lookup and increment.
  table_handler->m_ref_count++;

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);

  return table_handler;
}
6020 
6021 std::vector<std::string> rdb_get_open_table_names(void) {
6022   return rdb_open_tables.get_table_names();
6023 }
6024 
6025 std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
6026   const Rdb_table_handler *table_handler;
6027   std::vector<std::string> names;
6028 
6029   RDB_MUTEX_LOCK_CHECK(m_mutex);
6030   for (const auto &kv : m_table_map) {
6031     table_handler = kv.second;
6032     DBUG_ASSERT(table_handler != nullptr);
6033     names.push_back(table_handler->m_table_name);
6034   }
6035   RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6036 
6037   return names;
6038 }
6039 
6040 /*
6041   Inspired by innobase_get_int_col_max_value from InnoDB. This returns the
6042   maximum value a type can take on.
6043 */
6044 static ulonglong rdb_get_int_col_max_value(const Field *field) {
6045   ulonglong max_value = 0;
6046   switch (field->key_type()) {
6047     case HA_KEYTYPE_BINARY:
6048       max_value = 0xFFULL;
6049       break;
6050     case HA_KEYTYPE_INT8:
6051       max_value = 0x7FULL;
6052       break;
6053     case HA_KEYTYPE_USHORT_INT:
6054       max_value = 0xFFFFULL;
6055       break;
6056     case HA_KEYTYPE_SHORT_INT:
6057       max_value = 0x7FFFULL;
6058       break;
6059     case HA_KEYTYPE_UINT24:
6060       max_value = 0xFFFFFFULL;
6061       break;
6062     case HA_KEYTYPE_INT24:
6063       max_value = 0x7FFFFFULL;
6064       break;
6065     case HA_KEYTYPE_ULONG_INT:
6066       max_value = 0xFFFFFFFFULL;
6067       break;
6068     case HA_KEYTYPE_LONG_INT:
6069       max_value = 0x7FFFFFFFULL;
6070       break;
6071     case HA_KEYTYPE_ULONGLONG:
6072       max_value = 0xFFFFFFFFFFFFFFFFULL;
6073       break;
6074     case HA_KEYTYPE_LONGLONG:
6075       max_value = 0x7FFFFFFFFFFFFFFFULL;
6076       break;
6077     case HA_KEYTYPE_FLOAT:
6078       max_value = 0x1000000ULL;
6079       break;
6080     case HA_KEYTYPE_DOUBLE:
6081       max_value = 0x20000000000000ULL;
6082       break;
6083     default:
6084       abort();
6085   }
6086 
6087   return max_value;
6088 }
6089 
6090 void ha_rocksdb::load_auto_incr_value() {
6091   ulonglong auto_incr = 0;
6092   bool validate_last = false, use_datadic = true;
6093 #ifndef DBUG_OFF
6094   DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;);
6095   validate_last = true;
6096 #endif
6097 
6098   if (use_datadic && dict_manager.get_auto_incr_val(
6099                          m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
6100     update_auto_incr_val(auto_incr);
6101   }
6102 
6103   // If we find nothing in the data dictionary, or if we are in debug mode,
6104   // then call index_last to get the last value.
6105   //
6106   // This is needed when upgrading from a server that did not support
6107   // persistent auto_increment, of if the table is empty.
6108   //
6109   // For debug mode, we are just verifying that the data dictionary value is
6110   // greater than or equal to the maximum value in the table.
6111   if (auto_incr == 0 || validate_last) {
6112     auto_incr = load_auto_incr_value_from_index();
6113     update_auto_incr_val(auto_incr);
6114   }
6115 
6116   // If we failed to find anything from the data dictionary and index, then
6117   // initialize auto_increment to 1.
6118   if (m_tbl_def->m_auto_incr_val == 0) {
6119     update_auto_incr_val(1);
6120   }
6121 }
6122 
/*
  Scan the auto-increment index backwards to find the largest stored value.

  Returns that value + 1 (clamped at the column type's maximum), or 0 when
  the table is empty.  Runs an index-only read under a possibly-temporary
  snapshot; all handler state it perturbs is saved and restored before
  returning.
*/
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_converter->set_is_key_requested(true);

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    // Temporarily mark all columns readable so val_int() passes the
    // debug-build column access checks.
    MY_BITMAP *const old_map = dbug_tmp_use_all_columns(table, &table->read_set);
    last_val = field->val_int();
    if (last_val != max_val) {
      last_val++;
    }
#ifndef DBUG_OFF
    // Debug-only sanity check: the persisted data-dictionary value must be at
    // least as large as the maximum found in the table, unless this
    // transaction has its own pending auto-increment update.
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        DBUG_ASSERT(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(&table->read_set, old_map);
  }

  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
    */
  release_scan_iterator();

  return last_val;
}
6181 
6182 void ha_rocksdb::update_auto_incr_val(ulonglong val) {
6183   ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
6184   while (
6185       auto_incr_val < val &&
6186       !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
6187     // Do nothing - just loop until auto_incr_val is >= val or we successfully
6188     // set it
6189   }
6190 }
6191 
6192 void ha_rocksdb::update_auto_incr_val_from_field() {
6193   Field *field;
6194   ulonglong new_val, max_val;
6195   field = table->key_info[table->s->next_number_index].key_part[0].field;
6196   max_val = rdb_get_int_col_max_value(field);
6197 
6198   MY_BITMAP *const old_map =
6199       dbug_tmp_use_all_columns(table, &table->read_set);
6200   new_val = field->val_int();
6201   // don't increment if we would wrap around
6202   if (new_val != max_val) {
6203     new_val++;
6204   }
6205 
6206   dbug_tmp_restore_column_map(&table->read_set, old_map);
6207 
6208   // Only update if positive value was set for auto_incr column.
6209   if (new_val <= max_val) {
6210     Rdb_transaction *const tx = get_or_create_tx(table->in_use);
6211     tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val);
6212 
6213     // Update the in memory auto_incr value in m_tbl_def.
6214     update_auto_incr_val(new_val);
6215   }
6216 }
6217 
/*
  Initialize m_tbl_def->m_hidden_pk_val from the largest hidden-PK id
  currently stored in the table (+1), or 1 for an empty table.

  @return HA_EXIT_SUCCESS or an error from decoding the row key.

  NOTE(review): unlike load_auto_incr_value_from_index(), this does not call
  tx->acquire_snapshot() before index_last(); presumably index_last() creates
  a snapshot as needed (it is released below when we created it) -- confirm.
*/
int ha_rocksdb::load_hidden_pk_value() {
  const int save_active_index = active_index;
  // The hidden PK is always the last key in m_key_descr_arr.
  active_index = m_tbl_def->m_key_count - 1;
  const uint8 save_table_status = table->status;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();

  longlong hidden_pk_id = 1;
  // Do a lookup.
  if (!index_last(table->record[0])) {
    /*
      Decode PK field from the key
    */
    auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      if (is_new_snapshot) {
        tx->release_snapshot();
      }
      return err;
    }

    hidden_pk_id++;
  }

  // Lock-free monotonic max: only ever raise the shared counter.
  longlong old = m_tbl_def->m_hidden_pk_val;
  while (old < hidden_pk_id &&
         !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
  }

  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  release_scan_iterator();

  return HA_EXIT_SUCCESS;
}
6259 
6260 /* Get PK value from m_tbl_def->m_hidden_pk_info. */
6261 longlong ha_rocksdb::update_hidden_pk_val() {
6262   DBUG_ASSERT(has_hidden_pk(table));
6263   const longlong new_val = m_tbl_def->m_hidden_pk_val++;
6264   return new_val;
6265 }
6266 
6267 /* Get the id of the hidden pk id from m_last_rowkey */
6268 int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
6269   DBUG_ASSERT(table != nullptr);
6270   DBUG_ASSERT(has_hidden_pk(table));
6271 
6272   rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
6273 
6274   // Get hidden primary key from old key slice
6275   Rdb_string_reader reader(&rowkey_slice);
6276   if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) {
6277     return HA_ERR_ROCKSDB_CORRUPT_DATA;
6278   }
6279 
6280   const int length= 8; /* was Field_longlong::PACK_LENGTH in FB MySQL tree */
6281   const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
6282   if (from == nullptr) {
6283     /* Mem-comparable image doesn't have enough bytes */
6284     return HA_ERR_ROCKSDB_CORRUPT_DATA;
6285   }
6286 
6287   *hidden_pk_id = rdb_netbuf_read_uint64(&from);
6288   return HA_EXIT_SUCCESS;
6289 }
6290 
6291 /**
6292   @brief
6293   Free lock controls. We call this whenever we close a table. If the table had
6294   the last reference to the table_handler, then we free the memory associated
6295   with it.
6296 */
6297 
6298 void Rdb_open_tables_map::release_table_handler(
6299     Rdb_table_handler *const table_handler) {
6300   RDB_MUTEX_LOCK_CHECK(m_mutex);
6301 
6302   DBUG_ASSERT(table_handler != nullptr);
6303   DBUG_ASSERT(table_handler->m_ref_count > 0);
6304   if (!--table_handler->m_ref_count) {
6305     // Last reference was released. Tear down the hash entry.
6306     const auto ret MY_ATTRIBUTE((__unused__)) =
6307         m_table_map.erase(std::string(table_handler->m_table_name));
6308     DBUG_ASSERT(ret == 1);  // the hash entry must actually be found and deleted
6309     my_core::thr_lock_delete(&table_handler->m_thr_lock);
6310     my_free(table_handler);
6311   }
6312 
6313   RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6314 }
6315 
/* Handlerton callback: construct a ha_rocksdb handler on the given MEM_ROOT. */
static handler *rocksdb_create_handler(my_core::handlerton *const hton,
                                       my_core::TABLE_SHARE *const table_arg,
                                       my_core::MEM_ROOT *const mem_root) {
  // Placement new: the handler's storage is owned by mem_root, so the server
  // frees it with the root rather than via delete.
  return new (mem_root) ha_rocksdb(hton, table_arg);
}
6321 
/*
  Handler constructor.  All buffer/descriptor pointers start as nullptr and
  all state flags as false; real initialization happens in open() and
  alloc_key_buffers().
*/
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg),
      m_table_handler(nullptr),
      m_scan_it(nullptr),
      m_scan_it_skips_bloom(false),
      m_scan_it_snapshot(nullptr),
      m_scan_it_lower_bound(nullptr),
      m_scan_it_upper_bound(nullptr),
      m_tbl_def(nullptr),
      m_pk_descr(nullptr),
      m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false),
      m_pk_tuple(nullptr),
      m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr),
      m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr),
      m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr),
      m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr),
      m_pack_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE),
      m_keyread_only(false),
      m_insert_with_update(false),
      m_dup_pk_found(false),
      m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false),
      m_force_skip_unique_check(false) {}
6352 
6353 
/* Return the table name (without the database part) from the MyRocks DDL
   entry.  Only valid once open() has set m_tbl_def. */
const std::string &ha_rocksdb::get_table_basename() const {
  return m_tbl_def->base_tablename();
}
6357 
6358 /**
6359   @return
6360     false  OK
    other  Error unpacking the data
6362 */
bool ha_rocksdb::init_with_fields() {
  DBUG_ENTER_FUNC();

  const uint pk = table_share->primary_key;
  if (pk != MAX_KEY) {
    const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
    // Called for its side effect: presumably updates m_pk_can_be_decoded
    // based on the PK's last key part -- confirm in check_keyread_allowed().
    check_keyread_allowed(pk /*PK*/, key_parts - 1, true);
  } else {
    m_pk_can_be_decoded = false;
  }
  // Cache the table flags now that PK decodability is known.
  cached_table_flags = table_flags();

  DBUG_RETURN(false); /* Ok */
}
6377 
6378 /*
6379   If the key is a TTL key, we may need to filter it out.
6380 
6381   The purpose of read filtering for tables with TTL is to ensure that
6382   during a transaction a key which has expired already but not removed by
6383   compaction yet is not returned to the user.
6384 
6385   Without this the user might be hit with problems such as disappearing
6386   rows within a transaction, etc, because the compaction filter ignores
6387   snapshots when filtering keys.
6388 */
/*
  Decide whether a TTL record must be hidden from the current read.

  @param kd           key definition (must have TTL enabled)
  @param ttl_rec_val  the record's value slice, which embeds the 8-byte
                      write timestamp at kd.m_ttl_rec_offset
  @param curr_ts      snapshot (or current) timestamp to compare against;
                      0 means "no snapshot", in which case nothing is hidden
  @return true when the record has expired and must be skipped
*/
bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
                                     const rocksdb::Slice &ttl_rec_val,
                                     const int64_t curr_ts) {
  DBUG_ASSERT(kd.has_ttl());
  DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);

  /*
    Curr_ts can only be 0 if there are no snapshots open.
    should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
    we are filtering on the write path (single INSERT/UPDATE) in which case
    we are passed in the current time as curr_ts.

    In the event curr_ts is 0, we always decide not to filter the record. We
    also log a warning and increment a diagnostic counter.
  */
  if (curr_ts == 0) {
    update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
    return false;
  }

  if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
    return false;
  }

  Rdb_string_reader reader(&ttl_rec_val);

  /*
    Find where the 8-byte ttl is for each record in this index.
  */
  uint64 ts;
  if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
    /*
      This condition should never be reached since all TTL records have an
      8 byte ttl field in front. Don't filter the record out, and log an error.
    */
    std::string buf;
    buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
                      RDB_MAX_HEXDUMP_LEN);
    const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
    // NO_LINT_DEBUG
    sql_print_error(
        "Decoding ttl from PK value failed, "
        "for index (%u,%u), val: %s",
        gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
    DBUG_ASSERT(0);
    return false;
  }

  /* Hide record if it has expired before the current snapshot time. */
  uint64 read_filter_ts = 0;
#ifndef DBUG_OFF
  // Debug-only offset to simulate clock advancement in tests.
  read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
#endif
  // Expired when write-timestamp + ttl duration is at or before the
  // snapshot timestamp.
  bool is_hide_ttl =
      ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
  if (is_hide_ttl) {
    update_row_stats(ROWS_FILTERED);

    /* increment examined row count when rows are skipped */
    THD *thd = ha_thd();
    thd->inc_examined_row_count(1);
    DEBUG_SYNC(thd, "rocksdb.ttl_rows_examined");
  }
  return is_hide_ttl;
}
6454 
6455 int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
6456                                              rocksdb::Iterator *const iter,
6457                                              bool seek_backward) {
6458   if (kd.has_ttl()) {
6459     THD *thd = ha_thd();
6460     while (iter->Valid() &&
6461            should_hide_ttl_rec(
6462                kd, iter->value(),
6463                get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
6464       DEBUG_SYNC(thd, "rocksdb.check_flags_ser");
6465       if (thd && thd->killed) {
6466         return HA_ERR_QUERY_INTERRUPTED;
6467       }
6468       rocksdb_smart_next(seek_backward, iter);
6469     }
6470   }
6471   return HA_EXIT_SUCCESS;
6472 }
6473 
6474 #ifndef DBUG_OFF
6475 void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
6476   std::string str(on_disk_rec->data(), on_disk_rec->size());
6477   on_disk_rec->Reset();
6478   str.append("abc");
6479   on_disk_rec->PinSelf(rocksdb::Slice(str));
6480 }
6481 
/* Test hook: truncate the record to zero length. */
void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) {
  on_disk_rec->remove_suffix(on_disk_rec->size());
}
6485 
/* Test hook: replace the record with a NULL byte + a 12-byte VARCHAR value,
   to exercise reading a row whose varchar is longer than the schema's
   VARCHAR(10). */
void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
  std::string res;
  // The record is NULL-byte followed by VARCHAR(10).
  // Put the NULL-byte
  res.append("\0", 1);
  // Then, add a valid VARCHAR(12) value.
  res.append("\xC", 1);
  // NOTE: appends 12 bytes from an 11-character literal, so the 12th byte is
  // the literal's terminating '\0'.  The length byte (0xC = 12) still
  // matches the byte count, which is what the test needs.
  res.append("123456789ab", 12);

  on_disk_rec->Reset();
  on_disk_rec->PinSelf(rocksdb::Slice(res));
}
6498 
/* Test hook: report an intentional inplace-alter failure to the client. */
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred.", MYF(0));
}
6503 #endif
6504 
/*
  Decode m_retrieved_record (the current row image) into buf, after
  optionally corrupting it under the "myrocks_simulate_bad_row_read*" debug
  test points.  Delegates to the three-argument overload below.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
6516 
6517 /*
6518   @brief
6519   Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
6520   storage format into buf (which can be table->record[0] or table->record[1]).
6521 
6522   @param  key   Table record's key in mem-comparable form.
6523   @param  buf   Store record in table->record[0] format here
6524 
6525   @detail
6526     If the table has blobs, the unpacked data in buf may keep pointers to the
6527     data in this->m_retrieved_record.
6528 
6529     The key is only needed to check its checksum value (the checksum is in
6530     m_retrieved_record).
6531 
6532   @seealso
6533     rdb_converter::setup_read_decoders()  Sets up data structures which tell
6534   which columns to decode.
6535 
6536   @return
6537     0      OK
    other  Error unpacking the data
6539 */
6540 
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, const rocksdb::Slice *const value,
    uchar *const buf) {
  // All decoding logic lives in Rdb_converter; this is a thin forwarder.
  return m_converter->decode(m_pk_descr, buf, key, value);
}
6546 
/*
  Allocate the key packing/unpacking buffers used by this handler.

  @param table_arg            table definition to size the buffers for
  @param tbl_def_arg          MyRocks DDL entry holding the key descriptors
  @param alloc_alter_buffers  also allocate duplicate-check buffers used by
                              inplace ALTER unique-index builds

  @return HA_EXIT_SUCCESS, or HA_ERR_OUT_OF_MEM (partial allocations are
          freed before returning).
*/
int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
                                  const Rdb_tbl_def *const tbl_def_arg,
                                  bool alloc_alter_buffers) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(m_pk_tuple == nullptr);

  std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;

  uint key_len = 0;
  uint max_packed_sk_len = 0;
  uint pack_key_len = 0;

  m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
  if (has_hidden_pk(table_arg)) {
    m_pk_key_parts = 1;
  } else {
    // NOTE(review): these two lines read the member `table` rather than the
    // `table_arg` parameter used everywhere else in this function.  If a
    // caller ever passes a table_arg different from this->table, the PK
    // buffers would be sized from the wrong definition -- confirm intent
    // before changing.
    m_pk_key_parts =
        table->key_info[table->s->primary_key].user_defined_key_parts;
    key_len = table->key_info[table->s->primary_key].key_length;
  }

  // move this into get_table_handler() ??
  m_pk_descr->setup(table_arg, tbl_def_arg);

  m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0)));

  pack_key_len = m_pk_descr->max_storage_fmt_length();
  m_pk_packed_tuple =
      reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0)));

  /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
  max_packed_sk_len = pack_key_len;
  for (uint i = 0; i < table_arg->s->keys; i++) {
    /* Primary key was processed above */
    if (i == table_arg->s->primary_key) continue;

    // TODO: move this into get_table_handler() ??
    kd_arr[i]->setup(table_arg, tbl_def_arg);

    const uint packed_len = kd_arr[i]->max_storage_fmt_length();
    if (packed_len > max_packed_sk_len) {
      max_packed_sk_len = packed_len;
    }
  }

  // All secondary-key scratch buffers share the same worst-case size.
  m_sk_packed_tuple =
      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
  m_sk_match_prefix_buf =
      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
  m_sk_packed_tuple_old =
      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
  m_end_key_packed_tuple =
      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
  m_pack_buffer =
      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));

  m_scan_it_lower_bound =
      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
  m_scan_it_upper_bound =
      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));

  /*
    If inplace alter is happening, allocate special buffers for unique
    secondary index duplicate checking.
  */
  if (alloc_alter_buffers) {
    m_dup_sk_packed_tuple =
        reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
    m_dup_sk_packed_tuple_old =
        reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
  }

  // Allocation failures are detected in one batch here rather than after
  // each my_malloc() call.
  if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr ||
      m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr ||
      m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr ||
      m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr ||
      (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr ||
                               m_dup_sk_packed_tuple_old == nullptr))) {
    // One or more of the above allocations failed.  Clean up and exit
    free_key_buffers();

    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6634 
6635 void ha_rocksdb::free_key_buffers() {
6636   my_free(m_pk_tuple);
6637   m_pk_tuple = nullptr;
6638 
6639   my_free(m_pk_packed_tuple);
6640   m_pk_packed_tuple = nullptr;
6641 
6642   my_free(m_sk_packed_tuple);
6643   m_sk_packed_tuple = nullptr;
6644 
6645   my_free(m_sk_match_prefix_buf);
6646   m_sk_match_prefix_buf = nullptr;
6647 
6648   my_free(m_sk_packed_tuple_old);
6649   m_sk_packed_tuple_old = nullptr;
6650 
6651   my_free(m_end_key_packed_tuple);
6652   m_end_key_packed_tuple = nullptr;
6653 
6654   my_free(m_pack_buffer);
6655   m_pack_buffer = nullptr;
6656 
6657   my_free(m_dup_sk_packed_tuple);
6658   m_dup_sk_packed_tuple = nullptr;
6659 
6660   my_free(m_dup_sk_packed_tuple_old);
6661   m_dup_sk_packed_tuple_old = nullptr;
6662 
6663   my_free(m_scan_it_lower_bound);
6664   m_scan_it_lower_bound = nullptr;
6665 
6666   my_free(m_scan_it_upper_bound);
6667   m_scan_it_upper_bound = nullptr;
6668 }
6669 
6670 #ifdef MARIAROCKS_NOT_YET
6671 #endif
/*
  Compute m_skip_unique_check: true when this table's base name matches the
  skip_unique_check_tables regex whitelist (session variable; falls back to
  the compiled-in default when the variable is NULL).
*/
void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) {
  const char *const wl =
      whitelist ? whitelist : DEFAULT_SKIP_UNIQUE_CHECK_TABLES;

#if defined(HAVE_PSI_INTERFACE)
  Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables);
#else
  Regex_list_handler regex_handler;
#endif

  // Invalid patterns only produce a warning; matching proceeds with
  // whatever compiled.
  if (!regex_handler.set_patterns(wl)) {
    warn_about_bad_patterns(&regex_handler, "skip_unique_check_tables");
  }

  m_skip_unique_check = regex_handler.matches(m_tbl_def->base_tablename());
}
6688 
6689 /**
6690   @return
6691     HA_EXIT_SUCCESS  OK
6692     other            HA_ERR error code (can be SE-specific)
6693 */
6694 int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
6695   DBUG_ENTER_FUNC();
6696 
6697   int err = close();
6698   if (err) {
6699     DBUG_RETURN(err);
6700   }
6701 
6702   m_table_handler = rdb_open_tables.get_table_handler(name);
6703 
6704   if (m_table_handler == nullptr) {
6705     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6706   }
6707 
6708   my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
6709                               nullptr);
6710   m_io_perf.init(&m_table_handler->m_table_perf_context,
6711                  &m_table_handler->m_io_perf_read,
6712                  &m_table_handler->m_io_perf_write, &stats);
6713   Rdb_perf_context_guard guard(&m_io_perf,
6714                                rocksdb_perf_context_level(ha_thd()));
6715 
6716   std::string fullname;
6717   err = rdb_normalize_tablename(name, &fullname);
6718   if (err != HA_EXIT_SUCCESS) {
6719     DBUG_RETURN(err);
6720   }
6721 
6722   m_tbl_def = ddl_manager.find(fullname);
6723   if (m_tbl_def == nullptr) {
6724     my_error(ER_INTERNAL_ERROR, MYF(0),
6725              "Attempt to open a table that is not present in RocksDB-SE data "
6726              "dictionary");
6727     DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6728   }
6729   if (m_tbl_def->m_key_count != table->s->keys + has_hidden_pk(table)? 1:0)
6730   {
6731     sql_print_error("MyRocks: DDL mismatch: .frm file has %u indexes, "
6732                     "MyRocks has %u (%s hidden pk)",
6733                     table->s->keys, m_tbl_def->m_key_count,
6734                     has_hidden_pk(table)? "1" : "no");
6735 
6736     if (rocksdb_ignore_datadic_errors)
6737     {
6738       sql_print_error("MyRocks: rocksdb_ignore_datadic_errors=1, "
6739                       "trying to continue");
6740     }
6741     else
6742     {
6743       my_error(ER_INTERNAL_ERROR, MYF(0),
6744                "MyRocks: DDL mismatch. Check the error log for details");
6745       DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6746     }
6747   }
6748 
6749 
6750   m_lock_rows = RDB_LOCK_NONE;
6751   m_key_descr_arr = m_tbl_def->m_key_descr_arr;
6752 
6753   /*
6754     Full table scan actually uses primary key
6755     (UPDATE needs to know this, otherwise it will go into infinite loop on
6756     queries like "UPDATE tbl SET pk=pk+100")
6757   */
6758   key_used_on_scan = table->s->primary_key;
6759 
6760   // close() above has already called free_key_buffers(). No need to do it here.
6761   err = alloc_key_buffers(table, m_tbl_def);
6762 
6763   if (err) {
6764     DBUG_RETURN(err);
6765   }
6766 
6767   /*
6768     init_with_fields() is used to initialize table flags based on the field
6769     definitions in table->field[].
6770     It is called by open_binary_frm(), but that function calls the method for
6771     a temporary ha_rocksdb object which is later destroyed.
6772 
6773     If we are here in ::open(), then init_with_fields() has not been called
6774     for this object. Call it ourselves, we want all member variables to be
6775     properly initialized.
6776   */
6777   init_with_fields();
6778 
6779   /* Initialize decoder */
6780   m_converter = std::make_shared<Rdb_converter>(ha_thd(), m_tbl_def, table);
6781 
6782   /*
6783      Update m_ttl_bytes address to same as Rdb_converter's m_ttl_bytes.
6784      Remove this code after moving convert_record_to_storage_format() into
6785      Rdb_converter class.
6786   */
6787   m_ttl_bytes = m_converter->get_ttl_bytes_buffer();
6788 
6789   /*
6790     MariaDB: adjust field->part_of_key for PK columns. We can only do it here
6791     because SE API is just relying on the HA_PRIMARY_KEY_IN_READ_INDEX which
6792     does not allow to distinguish between unpack'able and non-unpack'able
6793     columns.
6794     Upstream uses handler->init_with_fields() but we don't have that call.
6795   */
6796   {
6797     if (!has_hidden_pk(table)) {
6798       KEY *const pk_info = &table->key_info[table->s->primary_key];
6799       for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
6800         if (!m_pk_descr->can_unpack(kp)) {
6801           //
6802           uint field_index= pk_info->key_part[kp].field->field_index;
6803           table->field[field_index]->part_of_key.clear_all();
6804           table->field[field_index]->part_of_key.set_bit(table->s->primary_key);
6805         }
6806       }
6807     }
6808 
6809     for (uint key= 0; key < table->s->keys; key++) {
6810       KEY *const key_info = &table->key_info[key];
6811       if (key ==  table->s->primary_key)
6812         continue;
6813       for (uint kp = 0; kp < key_info->usable_key_parts; kp++) {
6814         uint field_index= key_info->key_part[kp].field->field_index;
6815         if (m_key_descr_arr[key]->can_unpack(kp)) {
6816           table->field[field_index]->part_of_key.set_bit(key);
6817         } else {
6818           table->field[field_index]->part_of_key.clear_bit(key);
6819         }
6820       }
6821     }
6822   }
6823 
6824   info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
6825 
6826   /*
6827     The following load_XXX code calls row decode functions, and they do
6828     that without having done ::external_lock() or index_init()/rnd_init().
6829     (Note: this also means we're doing a read when there was no
6830     rdb_converter::setup_field_encoders() call)
6831 
6832     Initialize the necessary variables for them:
6833   */
6834 
6835   /* Load auto_increment value only once on first use. */
6836   if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
6837     load_auto_incr_value();
6838   }
6839 
6840   /* Load hidden pk only once on first use. */
6841   if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
6842       (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
6843     free_key_buffers();
6844     DBUG_RETURN(err);
6845   }
6846 
6847   /* Index block size in MyRocks: used by MySQL in query optimization */
6848   stats.block_size = rocksdb_tbl_options->block_size;
6849 
6850 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
6851 #endif
6852   /* Determine at open whether we should skip unique checks for this table */
6853   set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));
6854 
6855   DBUG_RETURN(HA_EXIT_SUCCESS);
6856 }
6857 
/* Close the table: drop shared references, free buffers, and release the
   reference-counted table handler.  Safe to call on a never-opened handler. */
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  // Drop references to shared objects; they are owned elsewhere
  // (ddl_manager for the descriptors, shared_ptr for the converter).
  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;
  m_converter = nullptr;
  free_key_buffers();

  if (m_table_handler != nullptr) {
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_last_rowkey.free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6879 
/*
  Message for each HA_ERR_ROCKSDB_* error code.  Order matters: entry i
  corresponds to code HA_ERR_ROCKSDB_FIRST + i (enforced by the
  static_assert below; consumed by get_error_message() and
  rdb_get_error_messages()).
*/
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY.",
    "Specifying DATA DIRECTORY for an individual table is not supported.",
    "Specifying INDEX DIRECTORY for an individual table is not supported.",
    "RocksDB commit failed.",
    "Failure during bulk load operation.",
    "Found data corruption.",
    "CRC checksum mismatch.",
    "Invalid table.",
    "Could not access RocksDB properties.",
    "File I/O error during merge/sort operation.",
    "RocksDB status: not found.",
    "RocksDB status: corruption.",
    "RocksDB status: invalid argument.",
    "RocksDB status: io error.",
    "RocksDB status: no space.",
    "RocksDB status: merge in progress.",
    "RocksDB status: incomplete.",
    "RocksDB status: shutdown in progress.",
    "RocksDB status: timed out.",
    "RocksDB status: aborted.",
    "RocksDB status: lock limit reached.",
    "RocksDB status: busy.",
    "RocksDB status: deadlock.",
    "RocksDB status: expired.",
    "RocksDB status: try again.",
};

static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes");
6911 
// psergey-merge: Is this still needed in MariaDB? We already have
// rdb_get_error_messages() below.
6914 #if 0
6915 static const char *rdb_get_error_message(int nr) {
6916   return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST];
6917 }
6918 #endif
6919 
6920 static const char **rdb_get_error_messages(int nr) { return rdb_error_messages; }
6921 
/*
  Produce a textual message for handler error code 'error' into 'buf'.

  @param error  handler error code
  @param buf    [out] receives the message text

  @return true   message came from the current transaction's detailed
                 error (lock wait timeout / deadlock / rocksdb busy)
          false  otherwise; a static HA_ERR_ROCKSDB_* message may still
                 have been appended to 'buf'
*/
bool ha_rocksdb::get_error_message(const int error, String *const buf) {
  DBUG_ENTER_FUNC();

  /* The HA_ERR_ROCKSDB_* range must lie entirely above the generic
     handler error codes for the range check below to be meaningful. */
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");

  if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK ||
      error == HA_ERR_ROCKSDB_STATUS_BUSY) {
    /* These errors carry per-transaction details (stored on the current
       transaction object), so report those instead of a static message. */
    Rdb_transaction *const tx = get_tx_from_thd(ha_thd());
    DBUG_ASSERT(tx != nullptr);
    buf->append(tx->m_detailed_error);
    DBUG_RETURN(true);
  }

  if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
    buf->append(rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST]);
  }

  // We can be called with the values which are < HA_ERR_FIRST because most
  // MySQL internal functions will just return HA_EXIT_FAILURE in case of
  // an error.

  DBUG_RETURN(false);
}
6948 
6949 /*
6950   Generalized way to convert RocksDB status errors into MySQL error code, and
6951   print error message.
6952 
6953   Each error code below maps to a RocksDB status code found in:
6954   rocksdb/include/rocksdb/status.h
6955 */
6956 int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s,
6957                                    const char *opt_msg) {
6958   DBUG_ASSERT(!s.ok());
6959 
6960   int err;
6961   switch (s.code()) {
6962     case rocksdb::Status::Code::kOk:
6963       err = HA_EXIT_SUCCESS;
6964       break;
6965     case rocksdb::Status::Code::kNotFound:
6966       err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND;
6967       break;
6968     case rocksdb::Status::Code::kCorruption:
6969       err = HA_ERR_ROCKSDB_STATUS_CORRUPTION;
6970       break;
6971     case rocksdb::Status::Code::kNotSupported:
6972       err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED;
6973       break;
6974     case rocksdb::Status::Code::kInvalidArgument:
6975       err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT;
6976       break;
6977     case rocksdb::Status::Code::kIOError:
6978       err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE
6979                             : HA_ERR_ROCKSDB_STATUS_IO_ERROR;
6980       break;
6981     case rocksdb::Status::Code::kMergeInProgress:
6982       err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS;
6983       break;
6984     case rocksdb::Status::Code::kIncomplete:
6985       err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE;
6986       break;
6987     case rocksdb::Status::Code::kShutdownInProgress:
6988       err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS;
6989       break;
6990     case rocksdb::Status::Code::kTimedOut:
6991       err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT;
6992       break;
6993     case rocksdb::Status::Code::kAborted:
6994       err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT
6995                               : HA_ERR_ROCKSDB_STATUS_ABORTED;
6996       break;
6997     case rocksdb::Status::Code::kBusy:
6998       err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK
6999                              : HA_ERR_ROCKSDB_STATUS_BUSY;
7000       break;
7001     case rocksdb::Status::Code::kExpired:
7002       err = HA_ERR_ROCKSDB_STATUS_EXPIRED;
7003       break;
7004     case rocksdb::Status::Code::kTryAgain:
7005       err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN;
7006       break;
7007     default:
7008       DBUG_ASSERT(0);
7009       return -1;
7010   }
7011 
7012   std::string errMsg;
7013   if (s.IsLockLimit()) {
7014     errMsg =
7015         "Operation aborted: Failed to acquire lock due to "
7016         "rocksdb_max_row_locks limit";
7017   } else {
7018     errMsg = s.ToString();
7019   }
7020 
7021   if (opt_msg) {
7022     std::string concatenated_error = errMsg + " (" + std::string(opt_msg) + ")";
7023     my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(),
7024              rocksdb_hton_name);
7025   } else {
7026     my_error(ER_GET_ERRMSG, MYF(0), s.code(), errMsg.c_str(),
7027              rocksdb_hton_name);
7028   }
7029 
7030   return err;
7031 }
7032 
/*
  MyRocks supports only the following collations for indexed columns.
  The set is consulted by rdb_is_index_collation_supported() and by
  rdb_field_uses_nopad_collation() below.
*/
static const std::set<uint> RDB_INDEX_COLLATIONS = {
    COLLATION_BINARY, COLLATION_UTF8_BIN, COLLATION_LATIN1_BIN};
7036 
7037 static bool rdb_is_index_collation_supported(
7038     const my_core::Field *const field) {
7039   const my_core::enum_field_types type = field->real_type();
7040   /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7041   if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7042       type == MYSQL_TYPE_BLOB)  {
7043 
7044     return (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7045             RDB_INDEX_COLLATIONS.end()) ||
7046             rdb_is_collation_supported(field->charset());
7047   }
7048   return true;
7049 }
7050 
7051 
7052 static bool
7053 rdb_field_uses_nopad_collation(const my_core::Field *const field) {
7054   const my_core::enum_field_types type = field->real_type();
7055   /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7056   if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7057       type == MYSQL_TYPE_BLOB) {
7058 
7059     /*
7060       This is technically a NOPAD collation but it's a binary collation
7061       that we can handle.
7062     */
7063     if (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7064            RDB_INDEX_COLLATIONS.end())
7065       return false;
7066 
7067     return (field->charset()->state & MY_CS_NOPAD);
7068   }
7069   return false;
7070 }
7071 
7072 
7073 /*
7074   Create structures needed for storing data in rocksdb. This is called when the
7075   table is created. The structures will be shared by all TABLE* objects.
7076 
7077   @param
7078     table_arg        Table with definition
7079     db_table         "dbname.tablename"
7080     len              strlen of the above
7081     tbl_def_arg      tbl_def whose key_descr is being created/populated
7082     old_tbl_def_arg  tbl_def from which keys are being copied over from
7083                      (for use during inplace alter)
7084 
7085   @return
7086     0      - Ok
7087     other  - error, either given table ddl is not supported by rocksdb or OOM.
7088 */
int ha_rocksdb::create_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg /* = nullptr */,
    const Rdb_tbl_def *const old_tbl_def_arg
    /* = nullptr */) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg->s != nullptr);

  /*
    These need to be one greater than MAX_INDEXES since the user can create
    MAX_INDEXES secondary keys and no primary key which would cause us
    to generate a hidden one.
  */
  std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;

  /*
    NOTE: All new column families must be created before new index numbers are
    allocated to each key definition. See below for more details.
    http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
  */
  if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  uint64 ttl_duration = 0;
  std::string ttl_column;
  uint ttl_field_offset;

  /* Extract the table's TTL settings; a non-zero return is an error code. */
  uint err;
  if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg,
                                               &ttl_duration))) {
    DBUG_RETURN(err);
  }

  if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column,
                                          &ttl_field_offset))) {
    DBUG_RETURN(err);
  }

  /* We don't currently support TTL on tables with hidden primary keys. */
  if (ttl_duration > 0 && has_hidden_pk(table_arg)) {
    my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0));
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  /*
    If TTL duration is not specified but TTL column was specified, throw an
    error because TTL column requires duration.
  */
  if (ttl_duration == 0 && !ttl_column.empty()) {
    my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str());
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (!old_tbl_def_arg) {
    /*
      old_tbl_def doesn't exist. this means we are in the process of creating
      a new table.

      Get the index numbers (this will update the next_index_number)
      and create Rdb_key_def structures.
    */
    for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
      if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], cfs[i],
                         ttl_duration, ttl_column)) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }
    }
  } else {
    /*
      old_tbl_def exists.  This means we are creating a new tbl_def as part of
      in-place alter table.  Copy over existing keys from the old_tbl_def and
      generate the necessary new key definitions if any.
    */
    if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
                                old_tbl_def_arg, cfs, ttl_duration,
                                ttl_column)) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7173 
7174 /*
7175   Checks index parameters and creates column families needed for storing data
7176   in rocksdb if necessary.
7177 
7178   @param in
7179     table_arg     Table with definition
7180     db_table      Table name
7181     tbl_def_arg   Table def structure being populated
7182 
7183   @param out
7184     cfs           CF info for each key definition in 'key_info' order
7185 
7186   @return
7187     0      - Ok
7188     other  - error
7189 */
int ha_rocksdb::create_cfs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg->s != nullptr);

  /* SQL-level table name; filled in lazily only if a collation check
     actually needs it. */
  char tablename_sys[NAME_LEN + 1];
  bool tsys_set= false;

  /*
    The first loop checks the index parameters and creates
    column families if necessary.
  */
  for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
    rocksdb::ColumnFamilyHandle *cf_handle;

    /*
      Collation checks are skipped for the hidden PK (no user columns) and
      for tables whose name starts with tmp_file_prefix (internal
      temporary tables).
    */
    if (!is_hidden_pk(i, table_arg, tbl_def_arg) &&
        tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
      if (!tsys_set)
      {
        tsys_set= true;
        my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
                                   tablename_sys, sizeof(tablename_sys));
      }

      for (uint part = 0; part < table_arg->key_info[i].ext_key_parts;
           part++)
      {
        /* MariaDB: disallow NOPAD collations */
        if (rdb_field_uses_nopad_collation(
              table_arg->key_info[i].key_part[part].field))
        {
          my_error(ER_MYROCKS_CANT_NOPAD_COLLATION, MYF(0));
          DBUG_RETURN(HA_EXIT_FAILURE);
        }

        /* An unsupported (non-binary) collation only produces a warning,
           and only if the table is not in the exception list. */
        if (rocksdb_strict_collation_check &&
            !rdb_is_index_collation_supported(
                table_arg->key_info[i].key_part[part].field) &&
            !rdb_collation_exceptions->matches(tablename_sys)) {

          char buf[1024];
          my_snprintf(buf, sizeof(buf),
                      "Indexed column %s.%s uses a collation that does not "
                      "allow index-only access in secondary key and has "
                      "reduced disk space efficiency in primary key.",
                       tbl_def_arg->full_tablename().c_str(),
                       table_arg->key_info[i].key_part[part].field->field_name.str);

          my_error(ER_INTERNAL_ERROR, MYF(ME_WARNING), buf);
        }
      }
    }

    // Internal consistency check to make sure that data in TABLE and
    // Rdb_tbl_def structures matches. Either both are missing or both are
    // specified. Yes, this is critical enough to make it into SHIP_ASSERT.
    SHIP_ASSERT(IF_PARTITIONING(!table_arg->part_info,true) == tbl_def_arg->base_partition().empty());

    // Generate the name for the column family to use.
    bool per_part_match_found = false;
    std::string cf_name =
        generate_cf_name(i, table_arg, tbl_def_arg, &per_part_match_found);

    // Prevent create from using the system column family.
    if (cf_name == DEFAULT_SYSTEM_CF_NAME) {
      my_error(ER_WRONG_ARGUMENTS, MYF(0),
               "column family not valid for storing index data.");
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    // Here's how `get_or_create_cf` will use the input parameters:
    //
    // `cf_name` - will be used as a CF name.
    cf_handle = cf_manager.get_or_create_cf(rdb, cf_name);

    if (!cf_handle) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    auto &cf = (*cfs)[i];

    cf.cf_handle = cf_handle;
    cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str());
    cf.is_per_partition_cf = per_part_match_found;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7280 
7281 /*
7282   Create key definition needed for storing data in rocksdb during ADD index
7283   inplace operations.
7284 
7285   @param in
7286     table_arg         Table with definition
7287     tbl_def_arg       New table def structure being populated
7288     old_tbl_def_arg   Old(current) table def structure
7289     cfs               Struct array which contains column family information
7290 
7291   @return
7292     0      - Ok
7293     other  - error, either given table ddl is not supported by rocksdb or OOM.
7294 */
int ha_rocksdb::create_inplace_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
    const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs,
    uint64 ttl_duration, const std::string &ttl_column) const {
  DBUG_ENTER_FUNC();

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::shared_ptr<Rdb_key_def> *const new_key_descr =
      tbl_def_arg->m_key_descr_arr;
  /* Keys that survive the alter unchanged, mapped by name to their
     position in the old definition (see get_old_key_positions()). */
  const std::unordered_map<std::string, uint> old_key_pos =
      get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
                            old_tbl_def_arg);

  uint i;
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));

    if (it != old_key_pos.end()) {
      /*
        Found matching index in old table definition, so copy it over to the
        new one created.
      */
      const Rdb_key_def &okd = *old_key_descr[it->second];

      const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
      struct Rdb_index_info index_info;
      if (!dict_manager.get_index_info(gl_index_id, &index_info)) {
        // NO_LINT_DEBUG
        sql_print_error(
            "RocksDB: Could not get index information "
            "for Index Number (%u,%u), table %s",
            gl_index_id.cf_id, gl_index_id.index_id,
            old_tbl_def_arg->full_tablename().c_str());
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      /* TTL record offset only applies when the TTL flag is set on the
         existing index. */
      uint32 ttl_rec_offset =
          Rdb_key_def::has_index_flag(index_info.m_index_flags,
                                      Rdb_key_def::TTL_FLAG)
              ? Rdb_key_def::calculate_index_flag_offset(
                    index_info.m_index_flags, Rdb_key_def::TTL_FLAG)
              : UINT_MAX;

      /*
        We can't use the copy constructor because we need to update the
        keynr within the pack_info for each field and the keyno of the keydef
        itself.
      */
      new_key_descr[i] = std::make_shared<Rdb_key_def>(
          okd.get_index_number(), i, okd.get_cf(),
          index_info.m_index_dict_version, index_info.m_index_type,
          index_info.m_kv_version, okd.m_is_reverse_cf,
          okd.m_is_per_partition_cf, okd.m_name.c_str(),
          dict_manager.get_stats(gl_index_id), index_info.m_index_flags,
          ttl_rec_offset, index_info.m_ttl_duration);
    } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
                              cfs[i], ttl_duration, ttl_column)) {
      /* Key not present in the old definition: create it from scratch. */
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    DBUG_ASSERT(new_key_descr[i] != nullptr);
    new_key_descr[i]->setup(table_arg, tbl_def_arg);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7363 
/*
  Build a map of key name -> position in the old table definition, limited
  to keys that also exist with identical definition in the new table (or
  only changed from unique to non-unique).  Used by create_inplace_key_defs()
  to decide which keys can be copied over instead of rebuilt.
*/
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  /* Index the new keys by name for the lookups below. */
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    /* The hidden PK has no user-visible definition to compare; always
       keep it. */
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name.str);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    old_key_pos[old_key->name.str] = i;
  }

  DBUG_RETURN(old_key_pos);
}
7429 
7430 /* Check to see if two keys are identical. */
7431 int ha_rocksdb::compare_keys(const KEY *const old_key,
7432                              const KEY *const new_key) const {
7433   DBUG_ENTER_FUNC();
7434 
7435   /* Check index name. */
7436   if (strcmp(old_key->name.str, new_key->name.str) != 0) {
7437     DBUG_RETURN(HA_EXIT_FAILURE);
7438   }
7439 
7440   /* If index algorithms are different then keys are different. */
7441   if (old_key->algorithm != new_key->algorithm) {
7442     DBUG_RETURN(HA_EXIT_FAILURE);
7443   }
7444 
7445   /* Check that the key is identical between old and new tables.  */
7446   if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) {
7447     DBUG_RETURN(HA_EXIT_FAILURE);
7448   }
7449 
7450   /* Check index comment. (for column family changes) */
7451   std::string old_comment(old_key->comment.str, old_key->comment.length);
7452   std::string new_comment(new_key->comment.str, new_key->comment.length);
7453   if (old_comment.compare(new_comment) != 0) {
7454     DBUG_RETURN(HA_EXIT_FAILURE);
7455   }
7456 
7457   DBUG_RETURN(HA_EXIT_SUCCESS);
7458 }
7459 
7460 /* Check two keys to ensure that key parts within keys match */
7461 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
7462                                   const KEY *const new_key) const {
7463   DBUG_ENTER_FUNC();
7464 
7465   /* Skip if key parts do not match, as it is a different key */
7466   if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
7467     DBUG_RETURN(HA_EXIT_FAILURE);
7468   }
7469 
7470   /* Check to see that key parts themselves match */
7471   for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
7472     if (strcmp(old_key->key_part[i].field->field_name.str,
7473                new_key->key_part[i].field->field_name.str) != 0) {
7474       DBUG_RETURN(HA_EXIT_FAILURE);
7475     }
7476 
7477     /* Check if prefix index key part length has changed */
7478     if (old_key->key_part[i].length != new_key->key_part[i].length) {
7479       DBUG_RETURN(HA_EXIT_FAILURE);
7480     }
7481   }
7482 
7483   DBUG_RETURN(HA_EXIT_SUCCESS);
7484 }
7485 
7486 /*
7487   Create key definition needed for storing data in rocksdb.
7488   This can be called either during CREATE table or doing ADD index operations.
7489 
7490   @param in
7491     table_arg     Table with definition
7492     i             Position of index being created inside table_arg->key_info
7493     tbl_def_arg   Table def structure being populated
7494     cf_info       Struct which contains column family information
7495 
7496   @param out
7497     new_key_def  Newly created index definition.
7498 
7499   @return
7500     0      - Ok
7501     other  - error, either given table ddl is not supported by rocksdb or OOM.
7502 */
int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint i,
                               const Rdb_tbl_def *const tbl_def_arg,
                               std::shared_ptr<Rdb_key_def> *const new_key_def,
                               const struct key_def_cf_info &cf_info,
                               uint64 ttl_duration,
                               const std::string &ttl_column) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(*new_key_def == nullptr);

  /* Allocate a fresh global index number for this key. */
  const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
  const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
  uchar index_type;
  uint16_t kv_version;

  /* Pick the index type and key/value format version: hidden PK,
     user-defined PK, or secondary key. */
  if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
    index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
    kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
  } else if (i == table_arg->s->primary_key) {
    index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
    uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
    kv_version = pk_latest_version;
  } else {
    index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
    uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
    kv_version = sk_latest_version;
  }

  // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as
  // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a
  // primary key or secondary key.
  DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT", {
    kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1;
  });

  DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", {
    if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) {
      kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2;
    }
  });

  uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0);

  /* The TTL record offset is only meaningful when the TTL flag is set. */
  uint32 ttl_rec_offset =
      Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG)
          ? Rdb_key_def::calculate_index_flag_offset(index_flags,
                                                     Rdb_key_def::TTL_FLAG)
          : UINT_MAX;

  // NOTE(review): this passes the member m_tbl_def rather than the
  // tbl_def_arg parameter -- presumably equivalent because the key names
  // match, but verify for the inplace-alter path where the two may differ.
  const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
  *new_key_def = std::make_shared<Rdb_key_def>(
      index_id, i, cf_info.cf_handle, index_dict_version, index_type,
      kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name,
      Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration);

  if (!ttl_column.empty()) {
    (*new_key_def)->m_ttl_column = ttl_column;
  }
  // initialize key_def
  (*new_key_def)->setup(table_arg, tbl_def_arg);
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7565 
7566 int rdb_normalize_tablename(const std::string &tablename,
7567                             std::string *const strbuf) {
7568   if (tablename.size() < 2 || tablename[0] != '.' ||
7569       (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) {
7570     DBUG_ASSERT(0);  // We were not passed table name?
7571     return HA_ERR_ROCKSDB_INVALID_TABLE;
7572   }
7573 
7574   size_t pos = tablename.find_first_of(FN_LIBCHAR, 2);
7575   if (pos == std::string::npos) {
7576     pos = tablename.find_first_of(FN_LIBCHAR2, 2);
7577   }
7578 
7579   if (pos == std::string::npos) {
7580     DBUG_ASSERT(0);  // We were not passed table name?
7581     return HA_ERR_ROCKSDB_INVALID_TABLE;
7582   }
7583 
7584   *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
7585 
7586   return HA_EXIT_SUCCESS;
7587 }
7588 
7589 /*
7590   Check to see if the user's original statement includes foreign key
7591   references
7592 */
bool ha_rocksdb::contains_foreign_key(THD *const thd) {
  bool success;
  /* Scan the raw query text rather than the parse tree. */
  const char *str = thd_query_string(thd)->str;

  DBUG_ASSERT(str != nullptr);

  while (*str != '\0') {
    // Scan from our current pos looking for 'FOREIGN'
    str = rdb_find_in_string(str, "FOREIGN", &success);
    if (!success) {
      return false;
    }

    // Skip past the found "FOREIGN"
    str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
    DBUG_ASSERT(success);

    // NOTE(review): if "FOREIGN" is immediately followed by a non-space
    // character (e.g. an identifier like "foreign_id"), scanning stops
    // here entirely instead of continuing to look for a later clause --
    // presumably acceptable; confirm this is intentional.
    if (!my_isspace(&my_charset_bin, *str)) {
      return false;
    }

    // See if the next token is 'KEY'
    str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
    if (!success) {
      continue;
    }

    // See if the next token is '('
    str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    if (!success) {
      // There is an optional index id after 'FOREIGN KEY', skip it
      str = rdb_skip_id(&my_charset_bin, str);

      // Now check for '(' again
      str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    }

    // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
    // a foreign key clause.
    return success;
  }

  // We never found a valid foreign key clause
  return false;
}
7638 
7639 /**
7640   @brief
7641   splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
7642   the <dbname>, <tablename> and <part_no> components.
7643 
7644   @param dbbuf returns database name/table_schema
7645   @param tablebuf returns tablename
7646   @param partitionbuf returns partition suffix if there is one
7647   @return HA_EXIT_SUCCESS on success, non-zero on failure to split
7648 */
7649 int rdb_split_normalized_tablename(const std::string &fullname,
7650                                    std::string *const db,
7651                                    std::string *const table,
7652                                    std::string *const partition) {
7653   DBUG_ASSERT(!fullname.empty());
7654 
7655 #define RDB_PARTITION_STR "#P#"
7656 
7657   /* Normalize returns dbname.tablename. */
7658   size_t dotpos = fullname.find('.');
7659 
7660   /* Invalid table name? */
7661   if (dotpos == std::string::npos) {
7662     return HA_ERR_ROCKSDB_INVALID_TABLE;
7663   }
7664 
7665   // Table must have a database name associated with it.
7666   DBUG_ASSERT(dotpos > 0);
7667 
7668   if (db != nullptr) {
7669     *db = fullname.substr(0, dotpos);
7670   }
7671 
7672   dotpos++;
7673 
7674   const size_t partpos =
7675       fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
7676 
7677   if (partpos != std::string::npos) {
7678     DBUG_ASSERT(partpos >= dotpos);
7679 
7680     if (table != nullptr) {
7681       *table = fullname.substr(dotpos, partpos - dotpos);
7682     }
7683 
7684     if (partition != nullptr) {
7685       *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
7686     }
7687   } else if (table != nullptr) {
7688     *table = fullname.substr(dotpos);
7689   }
7690 
7691   return HA_EXIT_SUCCESS;
7692 }
7693 
7694 /*
7695  Create a table's Rdb_tbl_def and its Rdb_key_defs and store table information
7696  into MyRocks Data Dictionary
7697  The method is called during create table/partition, truncate table/partition
7698 
7699  @param table_name            IN      table's name formated as
7700  'dbname.tablename'
7701  @param table_arg             IN      sql table
7702  @param auto_increment_value  IN      specified table's auto increment value
7703 
7704   @return
7705     HA_EXIT_SUCCESS  OK
7706     other            HA_ERR error code (can be SE-specific)
7707 */
7708 int ha_rocksdb::create_table(const std::string &table_name,
7709                              const TABLE *table_arg,
7710                              ulonglong auto_increment_value) {
7711   DBUG_ENTER_FUNC();
7712 
7713 
7714 
7715   int err;
7716 
7717   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
7718   rocksdb::WriteBatch *const batch = wb.get();
7719 
7720   /* Create table/key descriptions and put them into the data dictionary */
7721   m_tbl_def = new Rdb_tbl_def(table_name);
7722 
7723   uint n_keys = table_arg->s->keys;
7724 
7725   /*
7726     If no primary key found, create a hidden PK and place it inside table
7727     definition
7728   */
7729   if (has_hidden_pk(table_arg)) {
7730     n_keys += 1;
7731     // reset hidden pk id
7732     // the starting valid value for hidden pk is 1
7733     m_tbl_def->m_hidden_pk_val = 1;
7734   }
7735 
7736   m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
7737   m_tbl_def->m_key_count = n_keys;
7738   m_tbl_def->m_key_descr_arr = m_key_descr_arr;
7739 
7740   err = create_key_defs(table_arg, m_tbl_def);
7741   if (err != HA_EXIT_SUCCESS) {
7742     goto error;
7743   }
7744 
7745   m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
7746 
7747   if (auto_increment_value) {
7748     bool autoinc_upgrade_test = false;
7749     m_tbl_def->m_auto_incr_val = auto_increment_value;
7750     DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;);
7751     if (!autoinc_upgrade_test) {
7752       auto s = dict_manager.put_auto_incr_val(
7753           batch, m_tbl_def->get_autoincr_gl_index_id(),
7754           m_tbl_def->m_auto_incr_val);
7755       if (!s.ok()) {
7756         goto error;
7757       }
7758     }
7759   }
7760 
7761   dict_manager.lock();
7762   err = ddl_manager.put_and_write(m_tbl_def, batch);
7763   if (err != HA_EXIT_SUCCESS) {
7764     dict_manager.unlock();
7765     goto error;
7766   }
7767 
7768   err = dict_manager.commit(batch);
7769   if (err != HA_EXIT_SUCCESS) {
7770     dict_manager.unlock();
7771     goto error;
7772   }
7773 
7774   dict_manager.unlock();
7775 
7776   DBUG_RETURN(HA_EXIT_SUCCESS);
7777 
7778 error:
7779   /* Delete what we have allocated so far */
7780   delete m_tbl_def;
7781   m_tbl_def = nullptr;
7782   m_key_descr_arr = nullptr;
7783 
7784   DBUG_RETURN(err);
7785 }
7786 
7787 /**
7788   @brief
7789   create() is called to create a table. The variable name will have the name
7790   of the table.
7791 
7792   @details
7793   When create() is called you do not need to worry about
7794   opening the table. Also, the .frm file will have already been
7795   created so adjusting create_info is not necessary. You can overwrite
7796   the .frm file at this point if you wish to change the table
7797   definition, but there are no methods currently provided for doing
7798   so.
7799 
7800   Called from handle.cc by ha_create_table().
7801 
7802   @return
7803     HA_EXIT_SUCCESS  OK
7804     other            HA_ERR error code (can be SE-specific)
7805 
7806   @see
7807   ha_create_table() in handle.cc
7808 */
7809 
int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
                       HA_CREATE_INFO *const create_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(create_info != nullptr);

  if (create_info->data_file_name) {
    // DATA DIRECTORY is used to create tables under a specific location
    // outside the MySQL data directory. We don't support this for MyRocks.
    // The `rocksdb_datadir` setting should be used to configure RocksDB data
    // directory.
    DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED);
  }

  if (create_info->index_file_name) {
    // Similar check for INDEX DIRECTORY as well.
    DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED);
  }

  int err;
  /*
    Construct dbname.tablename ourselves, because partitioning
    passes strings like "./test/t14#P#p0" for individual partitions,
    while table_arg->s->table_name has none of that.
  */
  std::string str;
  err = rdb_normalize_tablename(name, &str);
  if (err != HA_EXIT_SUCCESS) {
    DBUG_RETURN(err);
  }

  // FOREIGN KEY isn't supported yet
  THD *const thd = my_core::thd_get_current_thd();
  if (contains_foreign_key(thd)) {
    my_error(ER_NOT_SUPPORTED_YET, MYF(0),
             "FOREIGN KEY for the RocksDB storage engine");
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }

  // Check whether Data Dictionary contain information
  Rdb_tbl_def *tbl = ddl_manager.find(str);
  if (tbl != nullptr) {
    if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
      // TRUNCATE re-creates the table, so an existing definition is
      // expected here: drop it first, then fall through to create_table().
      err = delete_table(tbl);
      if (err != HA_EXIT_SUCCESS) {
        DBUG_RETURN(err);
      }
    } else {
      // For any other CREATE, a pre-existing definition means the MyRocks
      // data dictionary disagrees with the server's metadata.
      my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name);
      DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
    }
  }

  // The below adds/clears hooks in RocksDB sync points. There's no reason for
  // this code to be in ::create() but it needs to be somewhere where it is
  // away from any tight loops and where one can invoke it from mtr:
  DBUG_EXECUTE_IF("rocksdb_enable_delay_commits",
    {
      auto syncpoint= rocksdb::SyncPoint::GetInstance();
      syncpoint->SetCallBack("DBImpl::WriteImpl:BeforeLeaderEnters",
                             [&](void* /*arg*/) {my_sleep(500);} );
      syncpoint->EnableProcessing();
      push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
                        "enable_delay_commits_mode ON");

    });
  DBUG_EXECUTE_IF("rocksdb_disable_delay_commits",
    {
      auto syncpoint= rocksdb::SyncPoint::GetInstance();
      syncpoint->ClearCallBack("DBImpl::WriteImpl:BeforeLeaderEnters");
      syncpoint->DisableProcessing();
      push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
                        "enable_delay_commits_mode OFF");
    });

  DBUG_RETURN(create_table(str, table_arg, create_info->auto_increment_value));
}
7888 
7889 /**
7890   @note
7891   This function is used only when the table has not yet been opened, and
7892   keyread_allowed bitmap doesn't have the correct values yet.
7893 
7894   See comment in ha_rocksdb::index_flags() for details.
7895 */
7896 
7897 bool ha_rocksdb::check_keyread_allowed(uint inx, uint part,
7898                                        bool all_parts) const {
7899   bool res = true;
7900   KEY *const key_info = &table_share->key_info[inx];
7901 
7902   Rdb_field_packing dummy1;
7903   res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
7904                      key_info->key_part[part].length);
7905 
7906   if (res && all_parts) {
7907     for (uint i = 0; i < part; i++) {
7908       Field *field;
7909       if ((field = key_info->key_part[i].field)) {
7910         Rdb_field_packing dummy;
7911         if (!dummy.setup(nullptr, field, inx, i,
7912                          key_info->key_part[i].length)) {
7913           /* Cannot do index-only reads for this column */
7914           res = false;
7915           break;
7916         }
7917       }
7918     }
7919   }
7920 
7921   const uint pk = table_share->primary_key;
7922   if (inx == pk && all_parts &&
7923       part + 1 == table_share->key_info[pk].user_defined_key_parts) {
7924     m_pk_can_be_decoded = res;
7925   }
7926 
7927   return res;
7928 }
7929 
7930 int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
7931                                rocksdb::Iterator *const iter,
7932                                const bool /* unused */,
7933                                const rocksdb::Slice &key_slice,
7934                                const int64_t ttl_filter_ts) {
7935   THD *thd = ha_thd();
7936   /*
7937     We are looking for the first record such that
7938       index_tuple= lookup_tuple.
7939     lookup_tuple may be a prefix of the index.
7940   */
7941   rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);
7942 
7943   while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
7944     if (thd && thd->killed) {
7945       return HA_ERR_QUERY_INTERRUPTED;
7946     }
7947     /*
7948       If TTL is enabled we need to check if the given key has already expired
7949       from the POV of the current transaction.  If it has, try going to the next
7950       key.
7951     */
7952     if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
7953       rocksdb_smart_next(kd.m_is_reverse_cf, iter);
7954       continue;
7955     }
7956 
7957     return HA_EXIT_SUCCESS;
7958   }
7959 
7960   /*
7961     Got a record that is not equal to the lookup value, or even a record
7962     from another table.index.
7963   */
7964   return HA_ERR_KEY_NOT_FOUND;
7965 }
7966 
7967 int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
7968                                 const bool full_key_match,
7969                                 const rocksdb::Slice &key_slice,
7970                                 const int64_t ttl_filter_ts) {
7971   THD *thd = ha_thd();
7972   /*
7973     We are looking for record with the biggest t.key such that
7974     t.key < lookup_tuple.
7975   */
7976   rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);
7977 
7978   while (is_valid(m_scan_it)) {
7979     if (thd && thd->killed) {
7980       return HA_ERR_QUERY_INTERRUPTED;
7981     }
7982     /*
7983       We are using full key and we've hit an exact match, or...
7984 
7985       If TTL is enabled we need to check if the given key has already expired
7986       from the POV of the current transaction.  If it has, try going to the next
7987       key.
7988     */
7989     if ((full_key_match &&
7990          kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
7991         (kd.has_ttl() &&
7992          should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
7993       rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
7994       continue;
7995     }
7996 
7997     return HA_EXIT_SUCCESS;
7998   }
7999 
8000   return HA_ERR_KEY_NOT_FOUND;
8001 }
8002 
8003 int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
8004                                const rocksdb::Slice &key_slice,
8005                                const int64_t ttl_filter_ts) {
8006   THD *thd = ha_thd();
8007   /*
8008     We are looking for the first record such that
8009 
8010       index_tuple $GT lookup_tuple
8011 
8012     with HA_READ_AFTER_KEY, $GT = '>',
8013     with HA_READ_KEY_OR_NEXT, $GT = '>='
8014   */
8015   rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
8016 
8017   /*
8018     If TTL is enabled we need to check if the given key has already expired
8019     from the POV of the current transaction.  If it has, try going to the next
8020     key.
8021   */
8022   while (is_valid(m_scan_it) && kd.has_ttl() &&
8023          should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
8024     if (thd && thd->killed) {
8025       return HA_ERR_QUERY_INTERRUPTED;
8026     }
8027     rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
8028   }
8029 
8030   return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
8031 }
8032 
/*
  Adjust m_scan_it to the exact record the SQL layer asked for, according
  to find_flag. The iterator is assumed to have been seeded by the caller
  (see the setup_scan_iterator() call in index_read_map_impl()).

  @param kd             IN   key definition of the index being scanned
  @param find_flag      IN   SQL-layer lookup mode (HA_READ_KEY_EXACT, ...)
  @param full_key_match IN   whether the lookup tuple covers the whole key
  @param key            IN   lookup key, in KeyTupleFormat
  @param keypart_map    IN   bitmap of key parts present in 'key'
  @param key_slice      IN   lookup key packed into storage format
  @param move_forward   OUT  false when the subsequent scan must go backwards
  @param ttl_filter_ts  IN   timestamp used to hide expired TTL records

  @return HA_EXIT_SUCCESS, HA_ERR_KEY_NOT_FOUND, HA_ERR_UNSUPPORTED, or an
          error propagated from the read_*_key() helpers (e.g.
          HA_ERR_QUERY_INTERRUPTED).
*/
int ha_rocksdb::position_to_correct_key(
    const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
    const bool full_key_match, const uchar *const key,
    const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
    bool *const move_forward, const int64_t ttl_filter_ts) {
  int rc = 0;

  *move_forward = true;

  switch (find_flag) {
    case HA_READ_KEY_EXACT:
      rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice,
                          ttl_filter_ts);
      break;
    case HA_READ_BEFORE_KEY:
      *move_forward = false;
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_AFTER_KEY:
    case HA_READ_KEY_OR_NEXT:
      rc = read_after_key(kd, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_KEY_OR_PREV:
    case HA_READ_PREFIX:
      /* This flag is not used by the SQL layer, so we don't support it yet. */
      rc = HA_ERR_UNSUPPORTED;
      break;
    case HA_READ_PREFIX_LAST:
    case HA_READ_PREFIX_LAST_OR_PREV:
      *move_forward = false;
      /*
        Find the last record with the specified index prefix lookup.
        - HA_READ_PREFIX_LAST requires that the record has the
          prefix=lookup (if there are no such records,
          HA_ERR_KEY_NOT_FOUND should be returned).
        - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
          records with prefix=lookup, we should return the last record
          before that.
      */
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0) {
        const rocksdb::Slice &rkey = m_scan_it->key();
        if (!kd.covers_key(rkey)) {
          /* The record we've got is not from this index */
          rc = HA_ERR_KEY_NOT_FOUND;
        } else if (find_flag == HA_READ_PREFIX_LAST) {
          uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_sk_packed_tuple, key, keypart_map);
          rocksdb::Slice lookup_tuple(
              reinterpret_cast<char *>(m_sk_packed_tuple), size);

          // We need to compare the key we've got with the original search
          // prefix.
          if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
            rc = HA_ERR_KEY_NOT_FOUND;
          }
        }
      }
      break;
    default:
      DBUG_ASSERT(0);
      break;
  }

  return rc;
}
8107 
8108 int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
8109                                  const enum ha_rkey_function &find_flag,
8110                                  const rocksdb::Slice &slice,
8111                                  const int bytes_changed_by_succ,
8112                                  const key_range *const end_key,
8113                                  uint *const end_key_packed_size) {
8114   if (find_flag == HA_READ_KEY_EXACT) return slice.size();
8115 
8116   if (find_flag == HA_READ_PREFIX_LAST) {
8117     /*
8118       We have made the kd.successor(m_sk_packed_tuple) call above.
8119 
8120       The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
8121     */
8122     return slice.size() - bytes_changed_by_succ;
8123   }
8124 
8125   if (end_key) {
8126     *end_key_packed_size =
8127         kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
8128                             end_key->key, end_key->keypart_map);
8129 
8130     /*
8131       Calculating length of the equal conditions here. 4 byte index id is
8132       included.
8133       Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8134        WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
8135        WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
8136       Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
8137        WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
8138     */
8139     rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
8140                              *end_key_packed_size);
8141     return slice.difference_offset(end_slice);
8142   }
8143 
8144   /*
8145     On range scan without any end key condition, there is no
8146     eq cond, and eq cond length is the same as index_id size (4 bytes).
8147     Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8148      WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
8149   */
8150   return Rdb_key_def::INDEX_NUMBER_SIZE;
8151 }
8152 
8153 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
8154   int rc;
8155   const rocksdb::Slice &rkey = m_scan_it->key();
8156   const uint pk_size = rkey.size();
8157   const char *pk_data = rkey.data();
8158 
8159   memcpy(m_pk_packed_tuple, pk_data, pk_size);
8160   m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
8161 
8162   if (m_lock_rows != RDB_LOCK_NONE) {
8163     /* We need to put a lock and re-read */
8164     rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
8165   } else {
8166     /* Unpack from the row we've read */
8167     const rocksdb::Slice &value = m_scan_it->value();
8168     rc = convert_record_from_storage_format(&rkey, &value, buf);
8169   }
8170 
8171   return rc;
8172 }
8173 
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  int rc = 0;
  // Set by both branches below before being read (only consumed when rc==0).
  uint pk_size;

  /* Get the key columns and primary key value */
  const rocksdb::Slice &rkey = m_scan_it->key();
  const rocksdb::Slice &value = m_scan_it->value();

#ifndef DBUG_OFF
  bool save_keyread_only = m_keyread_only;
#endif
  // Debug-only hook: force the keyread path, then restore the flag below.
  DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });

  /*
    Index-only ("covered") read is possible when keyread is on and the index
    can cover any lookup, or when this particular stored value is covered.
  */
  bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) ||
                        kd.covers_lookup(&value, &m_lookup_bitmap);

#ifndef DBUG_OFF
  m_keyread_only = save_keyread_only;
#endif

  if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
    // Decode the record straight from the secondary-key tuple.
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    if (pk_size == RDB_INVALID_KEY_LEN) {
      rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
    } else {
      rc = kd.unpack_record(table, buf, &rkey, &value,
                            m_converter->get_verify_row_debug_checksums());
      global_stats.covered_secondary_key_lookups.inc();
    }
  } else {
    // A reverse-ordered column family inverts the physical scan direction.
    if (kd.m_is_reverse_cf) move_forward = !move_forward;

    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      // Extract the PK from the (possibly advanced) iterator position, then
      // fetch the full row by rowid.
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
      } else {
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    // Save the rowkey of the row we just read.
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
8229 
8230 /**
8231   @note
8232     The problem with this function is that SQL layer calls it, when
8233      - the table has not been yet opened (no ::open() call done)
8234      - this->table_share already exists, but it is in the process of being
8235        filled, so some of fields are still NULL.
8236      - In particular, table_share->key_info[inx].key_part[] is filled only up
8237        to part #part. Subsequent key parts are not yet filled.
8238 
8239     To complicate things further, SQL layer will call index_flags() with
8240     all_parts=TRUE. Essentially, we're asked to provide flags for reading
8241     keyparts whose datatype is not yet known.
8242 
8243     We walk around this problem by using check_keyread_allowed(), which uses
    table_share object and is careful not to step on uninitialized data.
8245 
8246     When we get a call with all_parts=TRUE, we try to analyze all parts but
8247     ignore those that have key_part->field==nullptr (these are not initialized
8248     yet).
8249 */
8250 
8251 ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
8252   DBUG_ENTER_FUNC();
8253 
8254   ulong base_flags = HA_READ_NEXT |  // doesn't seem to be used
8255                      HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
8256 
8257   if (check_keyread_allowed(inx, part, all_parts)) {
8258     base_flags |= HA_KEYREAD_ONLY;
8259   }
8260 
8261   if (inx == table_share->primary_key) {
8262     /*
8263       Index-only reads on primary key are the same as table scan for us. Still,
8264       we need to explicitly "allow" them, otherwise SQL layer will miss some
8265       plans.
8266     */
8267     base_flags |= HA_KEYREAD_ONLY | HA_CLUSTERED_INDEX;
8268   } else {
8269     /*
8270       We can Index Condition Pushdown any key except the primary. With primary
8271       key, we get (pk, record) pair immediately, there is no place to put the
8272       ICP check.
8273     */
8274     base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
8275   }
8276 
8277   DBUG_RETURN(base_flags);
8278 }
8279 
8280 /**
8281   @brief
8282   Read next index tuple through the secondary index.
8283 
8284   @details
8285   m_scan_it points at the index key-value pair that we should read the (pk,row)
8286   pair for.
8287 */
int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
  DBUG_ASSERT(table != nullptr);
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  /* Use STATUS_NOT_FOUND when record not found or some error occurred */
  table->status = STATUS_NOT_FOUND;

  if (is_valid(m_scan_it)) {
    rocksdb::Slice key = m_scan_it->key();

    /* Check if we've ran out of records of this index */
    if (m_key_descr_arr[keyno]->covers_key(key)) {
      int rc = 0;

      // TODO: We could here check if we have ran out of range we're scanning
      // Extract the primary key from the secondary-index tuple.
      const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
          table, *m_pk_descr, &key, m_pk_packed_tuple);
      if (size == RDB_INVALID_KEY_LEN) {
        return HA_ERR_ROCKSDB_CORRUPT_DATA;
      }

      // Save the rowkey of the row being returned.
      m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
                         &my_charset_bin);

      /*
        Index-only ("covered") read is possible when keyread is on and the
        index can cover any lookup, or when this stored value is covered.
      */
      rocksdb::Slice value = m_scan_it->value();
      bool covered_lookup =
          (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
          m_key_descr_arr[keyno]->covers_lookup(&value, &m_lookup_bitmap);
      if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
        // Decode the record directly from the index tuple.
        rc = m_key_descr_arr[keyno]->unpack_record(
            table, buf, &key, &value,
            m_converter->get_verify_row_debug_checksums());
        global_stats.covered_secondary_key_lookups.inc();
      } else {
        // Otherwise fetch the full row via the extracted primary key.
        DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
      }

      if (!rc) {
        table->status = 0;
#ifdef MARIAROCKS_NOT_YET
        stats.rows_read++;
        stats.rows_index_next++;
#endif
        update_row_stats(ROWS_READ);
      }
      return rc;
    }
  }
  return HA_ERR_END_OF_FILE;
}
8340 
8341 bool ha_rocksdb::is_using_full_key(key_part_map keypart_map,
8342                                    uint actual_key_parts)
8343 {
8344   return (keypart_map == HA_WHOLE_KEY) ||
8345          (keypart_map == ((key_part_map(1) << actual_key_parts)
8346                         - 1));
8347 }
8348 
8349 /*
8350   ha_rocksdb::read_range_first overrides handler::read_range_first.
8351   The only difference from handler::read_range_first is that
8352   ha_rocksdb::read_range_first passes end_key to
8353   ha_rocksdb::index_read_map_impl function.
8354 
8355   @return
8356     HA_EXIT_SUCCESS  OK
8357     other            HA_ERR error code (can be SE-specific)
8358 */
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  int result;

  eq_range = eq_range_arg;

  /* MariaDB: Pass RANGE_SCAN_ASC when we support ICP on backwards scans */
  set_end_range(end_key);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key) {
    // Read first record
    result = ha_index_first(table->record[0]);
  } else {
#ifdef MARIAROCKS_NOT_YET
    if (is_using_prohibited_gap_locks(
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }
#endif
    increment_statistics(&SSV::ha_read_key_count);

    // Unlike handler::read_range_first(), pass end_key down so the impl can
    // compute the equal-condition length for the prefix bloom filter.
    MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
      result =
          index_read_map_impl(table->record[0], start_key->key,
                              start_key->keypart_map, start_key->flag, end_key);
    })
  }
  if (result) {
    // Translate key-not-found to end-of-range for the range-read protocol.
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
  }

  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
8407 
8408 int ha_rocksdb::prepare_index_scan()
8409 {
8410   range_key_part= table->key_info[active_index].key_part;
8411   set_end_range(NULL);
8412   return 0;
8413 }
8414 
8415 
8416 int ha_rocksdb::prepare_range_scan(const key_range *start_key,
8417                                     const key_range *end_key)
8418 {
8419   range_key_part= table->key_info[active_index].key_part;
8420 
8421   if (start_key)
8422   {
8423     m_save_start_range= *start_key;
8424     m_start_range= &m_save_start_range;
8425   }
8426   else
8427     m_start_range= NULL;
8428 
8429   set_end_range(end_key);
8430   return 0;
8431 }
8432 
8433 
8434 /**
8435    @return
8436     HA_EXIT_SUCCESS  OK
8437     other            HA_ERR error code (can be SE-specific)
8438 */
8439 
8440 int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
8441                                key_part_map keypart_map,
8442                                enum ha_rkey_function find_flag) {
8443   DBUG_ENTER_FUNC();
8444 
8445   DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
8446 }
8447 
8448 /*
8449    See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
8450    index navigation commands are converted into RocksDB lookup commands.
8451 
8452    This function takes end_key as an argument, and it is set on range scan.
8453    MyRocks needs to decide whether prefix bloom filter can be used or not.
8454    To decide to use prefix bloom filter or not, calculating equal condition
8455    length
8456    is needed. On equal lookups (find_flag == HA_READ_KEY_EXACT), equal
8457    condition length is the same as rocksdb::Slice.size() of the start key.
8458    On range scan, equal condition length is MIN(start_key, end_key) of the
8459    rocksdb::Slice expression.
8460 
8461    @return
8462     HA_EXIT_SUCCESS  OK
8463     other            HA_ERR error code (can be SE-specific)
8464 */
8465 int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
8466                                     key_part_map keypart_map,
8467                                     enum ha_rkey_function find_flag,
8468                                     const key_range *end_key) {
8469   DBUG_ENTER_FUNC();
8470 
8471   DBUG_EXECUTE_IF("myrocks_busy_loop_on_row_read", int debug_i = 0;
8472                   while (1) { debug_i++; });
8473 
8474   int rc = 0;
8475 
8476   THD *thd = ha_thd();
8477   DEBUG_SYNC(thd, "rocksdb.check_flags_rmi");
8478   if (thd && thd->killed) {
8479     rc = HA_ERR_QUERY_INTERRUPTED;
8480     DBUG_RETURN(rc);
8481   }
8482 
8483   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
8484   const uint actual_key_parts = kd.get_key_parts();
8485   bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);
8486 
8487   if (!end_key) end_key = end_range;
8488 
8489   /* By default, we don't need the retrieved records to match the prefix */
8490   m_sk_match_prefix = nullptr;
8491 #ifdef MARIAROCKS_NOT_YET
8492   stats.rows_requested++;
8493 #endif
8494   if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
8495       using_full_key) {
8496     /*
8497       Equality lookup over primary key, using full tuple.
8498       This is a special case, use DB::Get.
8499     */
8500     const uint size = kd.pack_index_tuple(table, m_pack_buffer,
8501                                           m_pk_packed_tuple, key, keypart_map);
8502     bool skip_lookup = is_blind_delete_enabled();
8503 
8504     rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, skip_lookup, false);
8505 
8506     if (!rc && !skip_lookup) {
8507 #ifdef MARIAROCKS_NOT_YET
8508       stats.rows_read++;
8509       stats.rows_index_first++;
8510 #endif
8511       update_row_stats(ROWS_READ);
8512     }
8513     DBUG_RETURN(rc);
8514   }
8515 
8516   /*
8517     Unique secondary index performs lookups without the extended key fields
8518   */
8519   uint packed_size;
8520   if (active_index != table->s->primary_key &&
8521       table->key_info[active_index].flags & HA_NOSAME &&
8522       find_flag == HA_READ_KEY_EXACT && using_full_key) {
8523     key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
8524                                                    .user_defined_key_parts) -
8525                            1;
8526     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
8527                                       key, tmp_map);
8528     if (table->key_info[active_index].user_defined_key_parts !=
8529         kd.get_key_parts()) {
8530       using_full_key = false;
8531     }
8532   } else {
8533     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
8534                                       key, keypart_map);
8535   }
8536 
8537   if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
8538       (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
8539     /*
8540       We are doing a point index lookup, and ICP is enabled. It is possible
8541       that this call will be followed by ha_rocksdb->index_next_same() call.
8542 
8543       Do what InnoDB does: save the lookup tuple now. We will need it in
8544       index_next_same/find_icp_matching_index_rec in order to stop scanning
8545       as soon as index record doesn't match the lookup tuple.
8546 
8547       When not using ICP, handler::index_next_same() will make sure that rows
8548       that don't match the lookup prefix are not returned.
8549       row matches the lookup prefix.
8550     */
8551     m_sk_match_prefix = m_sk_match_prefix_buf;
8552     m_sk_match_length = packed_size;
8553     memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
8554   }
8555 
8556   int bytes_changed_by_succ = 0;
8557   if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
8558       find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
8559     /* See below */
8560     bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
8561   }
8562 
8563   rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
8564                        packed_size);
8565 
8566   uint end_key_packed_size = 0;
8567   /*
8568     In MariaDB, the end_key is always the bigger end of the range.
8569     If we are doing a reverse-ordered scan (that is, walking from the bigger
8570     key values to smaller), we should use the smaller end of range as end_key.
8571   */
8572   const key_range *cur_end_key= end_key;
8573   if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
8574       find_flag == HA_READ_BEFORE_KEY)
8575   {
8576     cur_end_key= m_start_range;
8577   }
8578 
8579   const uint eq_cond_len =
8580       calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, cur_end_key,
8581                        &end_key_packed_size);
8582 
8583   bool use_all_keys = false;
8584   if (find_flag == HA_READ_KEY_EXACT &&
8585       my_count_bits(keypart_map) == kd.get_key_parts()) {
8586     use_all_keys = true;
8587   }
8588 
8589   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
8590   const bool is_new_snapshot = !tx->has_snapshot();
8591   // Loop as long as we get a deadlock error AND we end up creating the
8592   // snapshot here (i.e. it did not exist prior to this)
8593   for (;;) {
8594     DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan");
8595     if (thd && thd->killed) {
8596       rc = HA_ERR_QUERY_INTERRUPTED;
8597       break;
8598     }
8599     /*
8600       This will open the iterator and position it at a record that's equal or
8601       greater than the lookup tuple.
8602     */
8603     setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);
8604 
8605     /*
8606       Once we are positioned on from above, move to the position we really
8607       want: See storage/rocksdb/rocksdb-range-access.txt
8608     */
8609     bool move_forward;
8610     rc =
8611         position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
8612                                 slice, &move_forward, tx->m_snapshot_timestamp);
8613 
8614     if (rc) {
8615       break;
8616     }
8617 
8618     m_skip_scan_it_next_call = false;
8619 
8620     /*
8621       Now get the data for the row into 'buf'.  If we were using a primary key
8622       then we have all the rows we need.  For a secondary key we now need to
8623       lookup the primary key.
8624     */
8625     if (active_index == table->s->primary_key) {
8626       rc = read_row_from_primary_key(buf);
8627     } else {
8628       rc = read_row_from_secondary_key(buf, kd, move_forward);
8629     }
8630 
8631     if (!should_recreate_snapshot(rc, is_new_snapshot)) {
8632       break; /* Exit the loop */
8633     }
8634 
8635     // release the snapshot and iterator so they will be regenerated
8636     tx->release_snapshot();
8637     release_scan_iterator();
8638   }
8639 
8640   if (rc) {
8641     /*
8642       This status is returned on any error
8643       the only possible error condition is record-not-found
8644     */
8645     table->status = STATUS_NOT_FOUND;
8646   } else {
8647     table->status = 0;
8648 #ifdef MARIAROCKS_NOT_YET
8649     stats.rows_read++;
8650     stats.rows_index_first++;
8651 #endif
8652     update_row_stats(ROWS_READ);
8653   }
8654 
8655   DBUG_RETURN(rc);
8656 }
8657 
8658 /*
8659   @brief
8660   Scan the secondary index until we find an index record that satisfies ICP
8661 
8662   @param move_forward   TRUE  <=> move m_scan_it forward
8663                         FALSE <=> move m_scan_it backward
8664   @param buf            Record buffer (must be the same buffer that
8665                         pushed index condition points to, in practice
8666                         it is table->record[0])
8667 
8668   @detail
8669   Move the current iterator m_scan_it until we get an index tuple that
8670   satisfies the pushed Index Condition.
8671   (if there is no pushed index condition, return right away)
8672 
8673   @return
8674     0     - Index tuple satisfies ICP, can do index read.
8675     other - error code
8676 */
8677 
int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward,
                                            uchar *const buf) {
  // Nothing to do unless an index condition was pushed down for the index
  // currently being scanned; otherwise return success right away.
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];
    THD *thd = ha_thd();

    while (1) {
      // Skip over TTL-expired records before looking at the current position.
      int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        return rc;
      }

      // Honor KILL QUERY between iterator steps.
      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }

      // Iterator ran off the end of the data: end of scan.
      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      // We have walked into a different index's key space: end of scan.
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      // If an equality-lookup prefix was saved (m_sk_match_prefix is set by
      // the index-read code above), stop as soon as the index record no
      // longer matches it.
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      // Unpack the index tuple into 'buf' (in practice table->record[0]) so
      // the pushed index condition can be evaluated against it.
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_converter->get_verify_row_debug_checksums());
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const check_result_t icp_status= handler_index_cond_check(this);
      if (icp_status == CHECK_NEG) {
        // Condition not satisfied: advance and try the next index tuple.
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      }
      else if (icp_status == CHECK_OUT_OF_RANGE ||
               icp_status == CHECK_ABORTED_BY_USER) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      else /* icp_status == CHECK_POS */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
8742 
8743 /**
8744    @return
8745     HA_EXIT_SUCCESS  OK
8746     other            HA_ERR error code (can be SE-specific)
8747 */
8748 
8749 /**
8750    @return
8751     HA_ADMIN_OK      OK
8752     other            HA_ADMIN error code
8753 */
int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(check_opt != nullptr);

  const uint pk = pk_index(table, m_tbl_def);
  String rowkey_copy;
  String sec_key_copy;
  const char *const table_name = table->s->table_name.str;

  // Only when debugging: don't use snapshot when reading
  // Rdb_transaction *tx= get_or_create_tx(table->in_use);
  // tx->snapshot= nullptr;

  // Force row-checksum verification for the duration of the check; the
  // previous setting is restored on both the success and the error path.
  bool save_verify_row_debug_checksums =
      m_converter->get_verify_row_debug_checksums();
  m_converter->set_verify_row_debug_checksums(true);
  /* For each secondary index, check that we can get a PK value from it */
  // NO_LINT_DEBUG
  sql_print_verbose_info("CHECKTABLE %s: Checking table %s", table_name,
                         table_name);
  ha_rows UNINIT_VAR(row_checksums_at_start);  // set/used iff first_index==true
  ha_rows row_checksums = ha_rows(-1);
  bool first_index = true;

  // Walk every secondary index (the PK itself is skipped).
  for (uint keyno = 0; keyno < table->s->keys; keyno++) {
    if (keyno != pk) {
      extra(HA_EXTRA_KEYREAD);
      ha_index_init(keyno, true);
      ha_rows rows = 0;
      ha_rows checksums = 0;
      if (first_index) {
        // Snapshot the converter's counter so we can report how many rows of
        // the first index scan had their checksums verified.
        row_checksums_at_start = m_converter->get_row_checksums_checked();
      }
      int res;
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s:   Checking index %s", table_name,
                             table->key_info[keyno].name.str);
      // Scan the whole secondary index, verifying each entry against the PK.
      while (1) {
        if (!rows) {
          res = index_first(table->record[0]);
        } else {
          res = index_next(table->record[0]);
        }

        if (res == HA_ERR_END_OF_FILE) break;
        if (res) {
          // error
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s:   .. row %lld: index scan error %d",
                          table_name, rows, res);
          goto error;
        }
        // Save copies of the current SK entry and the PK it references: the
        // underlying buffers get overwritten by the PK lookup below.
        rocksdb::Slice key = m_scan_it->key();
        sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
        rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
                         &my_charset_bin);

        if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
                m_scan_it->value())) {
          checksums++;
        }

        // Check 1: the SK entry must point at an existing PK record.
        if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
                                    rowkey_copy.length()))) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s:   .. row %lld: "
              "failed to fetch row by rowid",
              table_name, rows);
          goto error;
        }

        longlong hidden_pk_id = 0;
        if (has_hidden_pk(table) &&
            read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
          goto error;
        }

        /* Check if we get the same PK value */
        // Check 2: re-packing the fetched row's PK must reproduce the rowid
        // we started from, byte for byte.
        uint packed_size = m_pk_descr->pack_record(
            table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
            false, hidden_pk_id);
        if (packed_size != rowkey_copy.length() ||
            memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s:   .. row %lld: PK value mismatch",
                          table_name, rows);
          goto print_and_error;
        }

        /* Check if we get the same secondary key value */
        // Check 3: re-packing the fetched row's SK must reproduce the index
        // entry we scanned.
        packed_size = m_key_descr_arr[keyno]->pack_record(
            table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
            &m_sk_tails, false, hidden_pk_id);
        if (packed_size != sec_key_copy.length() ||
            memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s:   .. row %lld: "
              "secondary index value mismatch",
              table_name, rows);
          goto print_and_error;
        }
        rows++;
        continue;

        // Dump the offending rowkey/record/index entry in hex, then fail.
      print_and_error : {
        std::string buf;
        buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s:   rowkey: %s", table_name, buf.c_str());

        buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s:   record: %s", table_name, buf.c_str());

        buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s:   index: %s", table_name, buf.c_str());

        goto error;
      }
      }
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s:   ... %lld index entries checked "
                             "(%lld had checksums)",
                             table_name, rows, checksums);

      if (first_index) {
        row_checksums =
            m_converter->get_row_checksums_checked() - row_checksums_at_start;
        first_index = false;
      }
      ha_index_end();
    }
  }
  if (row_checksums != ha_rows(-1)) {
    // NO_LINT_DEBUG
    sql_print_verbose_info("CHECKTABLE %s:   %lld table records had checksums",
                           table_name, row_checksums);
  }
  extra(HA_EXTRA_NO_KEYREAD);

  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  /*
    TODO(sergiy): we should check also for PK records that are missing in
    the secondary indexes.
    For that, need to walk through the PK and check that every PK record has a
    proper counterpart in each secondary index.
  */
  DBUG_RETURN(HA_ADMIN_OK);
error:
  // Restore the checksum-verification setting and tear down scan state
  // before reporting corruption.
  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  ha_index_or_rnd_end();
  extra(HA_EXTRA_NO_KEYREAD);

  DBUG_RETURN(HA_ADMIN_CORRUPT);
}
8917 
/*
  Write 'len' bytes of 'str' to 'out' as a double-quoted string: bytes above
  the ASCII space are written verbatim, anything else as a backslash escape
  with its decimal value (e.g. "\0", "\10").

  Fix: the bytes are classified as 'unsigned char'.  With the previous
  plain-'char' comparison, bytes >= 0x80 compared as negative on
  signed-char platforms and were emitted as ambiguous escapes like "\-61".
*/
static void dbug_dump_str(FILE *const out, const char *const str, int len) {
  fprintf(out, "\"");
  for (int i = 0; i < len; i++) {
    const unsigned char c = static_cast<unsigned char>(str[i]);
    if (c > 32) {
      fprintf(out, "%c", c);
    } else {
      fprintf(out, "\\%d", c);
    }
  }
  fprintf(out, "\"");
}
8929 
8930 /*
8931   Debugging help: dump the whole database into a human-readable file.
8932   Usage:
8933     dbug_dump_database(rdb);
8934 */
8935 
8936 void dbug_dump_database(rocksdb::DB *const db) {
8937   FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
8938   if (!out) return;
8939 
8940   rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
8941   for (it->SeekToFirst(); it->Valid(); it->Next()) {
8942     rocksdb::Slice key = it->key();
8943     rocksdb::Slice val = it->value();
8944     dbug_dump_str(out, key.data(), key.size());
8945     fprintf(out, " -> ");
8946     dbug_dump_str(out, val.data(), val.size());
8947     fprintf(out, "\n");
8948   }
8949 
8950   delete it;
8951   fclose(out);
8952 }
8953 
8954 rocksdb::Status ha_rocksdb::get_for_update(
8955     Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family,
8956     const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
8957   DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
8958   const bool exclusive = m_lock_rows != RDB_LOCK_READ;
8959 
8960   const bool do_validate =
8961       my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED;
8962   rocksdb::Status s =
8963       tx->get_for_update(column_family, key, value, exclusive, do_validate);
8964 
8965 #ifndef DBUG_OFF
8966   ++rocksdb_num_get_for_update_calls;
8967 #endif
8968   return s;
8969 }
8970 
8971 bool ha_rocksdb::is_blind_delete_enabled() {
8972   THD *thd = ha_thd();
8973   /*
8974     Note: in MariaDB, thd->lex->table_count is only set for multi-table DELETE,
8975     not for single-table DELETE.  So we check thd->lex->query_tables instead.
8976   */
8977   return (THDVAR(thd, blind_delete_primary_key) &&
8978           thd->lex->sql_command == SQLCOM_DELETE &&
8979           thd->lex->query_tables && !thd->lex->query_tables->next_global &&
8980           table->s->keys == 1 &&
8981           !has_hidden_pk(table) && !thd->rgi_slave);
8982 }
8983 
8984 /*
8985   Given a rowid (i.e. packed PK) as a parameter, get the record.
8986 
8987   @return
8988     HA_EXIT_SUCCESS  OK
8989     other            HA_ERR error code (can be SE-specific)
8990 */
8991 
int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size, const bool skip_lookup,
                                 const bool skip_ttl_check) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table != nullptr);

  int rc;

  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] =
        "now signal Reached "
        "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);

  bool found;
  rocksdb::Status s;

  /* Pretend row found without looking up */
  // Blind-delete path: the caller asked us not to verify existence
  // (see is_blind_delete_enabled()); report success without reading.
  if (skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_deleted_blind++;
#endif
    update_row_stats(ROWS_DELETED_BLIND);
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    table->status = 0;
    DBUG_RETURN(0);
  }

  if (m_lock_rows == RDB_LOCK_NONE) {
    // Non-locking read: just make sure we read from a snapshot.
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else if (m_insert_with_update && m_dup_pk_found) {
    // INSERT ... ON DUPLICATE KEY UPDATE: the duplicate-PK row was already
    // read (and locked) when the duplicate was detected, so skip the lookup.
    DBUG_ASSERT(m_pk_descr->get_keyno() == m_dupp_errkey);
    DBUG_ASSERT(m_dup_pk_retrieved_record.length() ==
                m_retrieved_record.size());
    DBUG_ASSERT(memcmp(m_dup_pk_retrieved_record.ptr(),
                       m_retrieved_record.data(),
                       m_retrieved_record.size()) == 0);

    // do nothing - we already have the result in m_retrieved_record and
    // already taken the lock
    s = rocksdb::Status::OK();
  } else {
    // Locking read.
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
                  dbug_change_status_to_corrupted(&s););

  // Any status other than OK / NotFound is a real storage-level error.
  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    /* If we found the record, but it's expired, pretend we didn't find it.  */
    if (!skip_ttl_check && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            tx->m_snapshot_timestamp)) {
      DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
    }

    // Remember the key we positioned on and unpack the record into 'buf'.
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);

    if (!rc) {
      table->status = 0;
    }
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
9083 
9084 /**
9085   @return
9086     HA_EXIT_SUCCESS  OK
9087     other            HA_ERR error code (can be SE-specific)
9088 */
9089 int ha_rocksdb::index_next(uchar *const buf) {
9090   DBUG_ENTER_FUNC();
9091 
9092   bool moves_forward = true;
9093   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9094     moves_forward = false;
9095   }
9096 
9097   int rc = index_next_with_direction(buf, moves_forward);
9098   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9099 
9100   DBUG_RETURN(rc);
9101 }
9102 
9103 /**
9104   @return
9105     HA_EXIT_SUCCESS  OK
9106     other            HA_ERR error code (can be SE-specific)
9107 */
9108 int ha_rocksdb::index_prev(uchar *const buf) {
9109   DBUG_ENTER_FUNC();
9110 
9111   bool moves_forward = false;
9112   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9113     moves_forward = true;
9114   }
9115 
9116   int rc = index_next_with_direction(buf, moves_forward);
9117   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9118 
9119   DBUG_RETURN(rc);
9120 }
9121 
int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  if (active_index == pk_index(table, m_tbl_def)) {
    // PK scans share their implementation with full table scans.
    rc = rnd_next_with_direction(buf, move_forward);
  } else {
    THD *thd = ha_thd();
    // Retry loop: keep stepping while the record we land on is reported as
    // invalidated (see should_skip_invalidated_record()).
    for (;;) {
      DEBUG_SYNC(thd, "rocksdb.check_flags_inwd");
      if (thd && thd->killed) {
        rc = HA_ERR_QUERY_INTERRUPTED;
        break;
      }
      if (m_skip_scan_it_next_call) {
        // The iterator is already positioned on the record to return
        // (set by the call that established the scan position).
        m_skip_scan_it_next_call = false;
      } else {
        if (move_forward) {
          m_scan_it->Next(); /* this call cannot fail */
        } else {
          m_scan_it->Prev();
        }
      }
      // Step over TTL-expired entries before checking the pushed condition.
      rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index],
                                        m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        break;
      }
      rc = find_icp_matching_index_rec(move_forward, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
      if (!should_skip_invalidated_record(rc)) {
        break;
      }
    }
  }

  DBUG_RETURN(rc);
}
9161 
9162 /**
9163   @return
9164     HA_EXIT_SUCCESS  OK
9165     other            HA_ERR error code (can be SE-specific)
9166 */
9167 int ha_rocksdb::index_first(uchar *const buf) {
9168   DBUG_ENTER_FUNC();
9169 
9170   m_sk_match_prefix = nullptr;
9171   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9172                ? index_last_intern(buf)
9173                : index_first_intern(buf);
9174   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9175 
9176   DBUG_RETURN(rc);
9177 }
9178 
9179 /**
9180   @return
9181     HA_EXIT_SUCCESS  OK
9182     other            HA_ERR error code (can be SE-specific)
9183 */
9184 int ha_rocksdb::index_last(uchar *const buf) {
9185   DBUG_ENTER_FUNC();
9186 
9187   m_sk_match_prefix = nullptr;
9188   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9189                ? index_first_intern(buf)
9190                : index_last_intern(buf);
9191   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9192 
9193   DBUG_RETURN(rc);
9194 }
9195 
9196 /*
9197   Start scanning from the "first" value.
9198 
9199   The 'first' here means "the first from start of the key space".
9200   For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
9201 
  A picture of a forward-ordered keyspace (remember, the keys have form
9203   'indexnr-keyval'. Suppose the index we are at has number n)
9204 
9205       (n-1) - ...
9206       ( n )          <--- 1. (n) doesn't exist in the db but it would be here.
9207       ( n ) - aaa       <--- 2. Seek("n") will put us here on the first index
9208       ( n ) - bbb               record.
9209       ( n ) - cc
9210 
9211   So, need to do: Seek(n);
9212 
9213   A backward-ordered keyspace:
9214 
9215       (n+1) - bbb
9216       (n+1) - aaa
9217       (n+1)        <--- (n+1) doesn't exist in the db but would be here.
9218       ( n ) - ccc       <--- 1. We need to be here.
9219       ( n ) - bbb
9220       ( n ) - aaa
9221       ( n )
9222 
9223   So, need to: Seek(n+1);
9224 
9225 */
9226 
int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer that matches the kind of index we scan.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // Build the lowest possible key for this index (see the diagram in the
  // comment above this function).
  int key_start_matching_bytes = kd.get_first_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
    m_scan_it->Seek(index_key);
    // Seek() already positioned us on the first record of the index; make
    // index_next_with_direction() consume it instead of stepping past it.
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
     */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9281 
9282 /**
9283   @details
9284   Start scanning from the "last" value
9285 
9286   The 'last' here means "the last from start of the key space".
9287   For reverse-ordered key spaces, we will actually read the smallest value.
9288 
   A picture of a forward-ordered keyspace (remember, the keys have form
9290   'indexnr-keyval'. Suppose the we are at a key that has number n)
9291 
9292      (n-1)-something
9293      ( n )-aaa
9294      ( n )-bbb
9295      ( n )-ccc            <----------- Need to seek to here.
9296      (n+1)      <---- Doesn't exist, but would be here.
9297      (n+1)-smth, or no value at all
9298 
9299    RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's
9300    smaller". We can't seek to "(n)-ccc" directly, because we don't know what
9301    is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
9302    to "(n+1)", which is the least possible value that's greater than any value
9303    in index #n.
9304 
9305    So, need to:  it->SeekForPrev(n+1)
9306 
9307    A backward-ordered keyspace:
9308 
9309       (n+1)-something
9310       ( n ) - ccc
9311       ( n ) - bbb
9312       ( n ) - aaa       <---------------- (*) Need to seek here.
9313       ( n ) <--- Doesn't exist, but would be here.
9314       (n-1)-smth, or no value at all
9315 
9316    So, need to:  it->SeekForPrev(n)
9317 */
9318 
int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer that matches the kind of index we scan.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // Build the key we SeekForPrev() from: the least key greater than any key
  // of this index (see the diagram in the comment above this function).
  int key_end_matching_bytes = kd.get_last_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_end_matching_bytes);
    m_scan_it->SeekForPrev(index_key);
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      // SeekForPrev() already positioned us on the last record; make
      // rnd_next_with_direction() consume it instead of stepping past it.
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      // Secondary key: check the pushed index condition, then fetch the row.
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
    }

    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
     */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9380 
9381 void ha_rocksdb::unlock_row() {
9382   DBUG_ENTER_FUNC();
9383 
9384   if (m_lock_rows != RDB_LOCK_NONE) {
9385     Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9386     tx->release_lock(m_pk_descr->get_cf(),
9387                      std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
9388   }
9389 
9390   DBUG_VOID_RETURN;
9391 }
9392 
9393 /*
9394   Returning true if SingleDelete can be used.
9395   - Secondary Indexes can always use SingleDelete.
9396   - If the index is PRIMARY KEY, and if all of the columns of the table
9397     are covered by the PRIMARY KEY, SingleDelete can be used.
9398 */
9399 bool ha_rocksdb::can_use_single_delete(const uint index) const {
9400   return (index != pk_index(table, m_tbl_def) ||
9401           (!has_hidden_pk(table) &&
9402            table->key_info[index].ext_key_parts == table->s->fields));
9403 }
9404 
bool ha_rocksdb::skip_unique_check() const {
  /*
    We want to skip unique checks if:
      1) bulk_load is on
      2) this table is in the whitelist of tables to skip and the replication
         lag has reached a large enough value (see unique_check_lag_threshold
         and unique_check_lag_reset_threshold)
      3) the user set unique_checks option to 0, and the table does not have
         any indexes. If the table has secondary keys, then those might become
         inconsistent/corrupted
      4) We're using read-free replication
  */
  return THDVAR(table->in_use, bulk_load) ||
         (m_force_skip_unique_check && m_skip_unique_check) ||
         (my_core::thd_test_options(table->in_use,
                                    OPTION_RELAXED_UNIQUE_CHECKS) &&
          m_tbl_def->m_key_count == 1) ||
#ifdef MARIAROCKS_NOT_YET
         use_read_free_rpl();
#else
         FALSE;
#endif
}
9428 
#ifdef MARIAROCKS_NOT_YET // MDEV-10975
// Setter for m_force_skip_unique_check (consulted by skip_unique_check(),
// case 2).  Compiled out in MariaDB; presumably invoked by the replication
// lag logic in upstream MyRocks -- see MDEV-10975.
void ha_rocksdb::set_force_skip_unique_check(bool skip) {
  DBUG_ENTER_FUNC();

  m_force_skip_unique_check = skip;

  DBUG_VOID_RETURN;
}
#endif
9438 
9439 bool ha_rocksdb::commit_in_the_middle() {
9440   return THDVAR(table->in_use, bulk_load) ||
9441          THDVAR(table->in_use, commit_in_the_middle);
9442 }
9443 
9444 /*
9445   Executing bulk commit if it should.
9446   @retval true if bulk commit failed
9447   @retval false if bulk commit was skipped or succeeded
9448 */
9449 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
9450   return commit_in_the_middle() &&
9451          tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
9452          tx->flush_batch();
9453 }
9454 
9455 /*
9456   If table was created without primary key, SQL layer represents the primary
9457   key number as MAX_INDEXES.  Hence, this function returns true if the table
9458   does not contain a primary key. (In which case we generate a hidden
9459   'auto-incremented' pk.)
9460 */
bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
  // Thin wrapper: the actual check lives in Rdb_key_def.
  return Rdb_key_def::table_has_hidden_pk(table);
}
9464 
9465 /*
9466   Returns true if given index number is a hidden_pk.
9467   - This is used when a table is created with no primary key.
9468 */
9469 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
9470                               const Rdb_tbl_def *const tbl_def_arg) {
9471   DBUG_ASSERT(table_arg->s != nullptr);
9472 
9473   return (table_arg->s->primary_key == MAX_INDEXES &&
9474           index == tbl_def_arg->m_key_count - 1);
9475 }
9476 
9477 /* Returns index of primary key */
9478 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
9479                           const Rdb_tbl_def *const tbl_def_arg) {
9480   DBUG_ASSERT(table_arg->s != nullptr);
9481 
9482   return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
9483                                                   : table_arg->s->primary_key;
9484 }
9485 
9486 /* Returns true if given index number is a primary key */
9487 bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
9488                        const Rdb_tbl_def *const tbl_def_arg) {
9489   DBUG_ASSERT(table_arg->s != nullptr);
9490 
9491   return index == table_arg->s->primary_key ||
9492          is_hidden_pk(index, table_arg, tbl_def_arg);
9493 }
9494 
9495 uint ha_rocksdb::max_supported_key_part_length() const {
9496   DBUG_ENTER_FUNC();
9497   DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE
9498                                    : MAX_INDEX_COL_LEN_SMALL);
9499 }
9500 
9501 const char *ha_rocksdb::get_key_name(const uint index,
9502                                      const TABLE *const table_arg,
9503                                      const Rdb_tbl_def *const tbl_def_arg) {
9504   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9505     return HIDDEN_PK_NAME;
9506   }
9507 
9508   DBUG_ASSERT(table_arg->key_info != nullptr);
9509   DBUG_ASSERT(table_arg->key_info[index].name.str != nullptr);
9510 
9511   return table_arg->key_info[index].name.str;
9512 }
9513 
9514 const char *ha_rocksdb::get_key_comment(const uint index,
9515                                         const TABLE *const table_arg,
9516                                         const Rdb_tbl_def *const tbl_def_arg) {
9517   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9518     return nullptr;
9519   }
9520 
9521   DBUG_ASSERT(table_arg->key_info != nullptr);
9522 
9523   return table_arg->key_info[index].comment.str;
9524 }
9525 
9526 const std::string ha_rocksdb::generate_cf_name(
9527     const uint index, const TABLE *const table_arg,
9528     const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found) {
9529   DBUG_ASSERT(table_arg != nullptr);
9530   DBUG_ASSERT(tbl_def_arg != nullptr);
9531   DBUG_ASSERT(per_part_match_found != nullptr);
9532 
9533   // When creating CF-s the caller needs to know if there was a custom CF name
9534   // specified for a given paritition.
9535   *per_part_match_found = false;
9536 
9537   // Index comment is used to define the column family name specification(s).
9538   // If there was no comment, we get an emptry string, and it means "use the
9539   // default column family".
9540   const char *const comment = get_key_comment(index, table_arg, tbl_def_arg);
9541 
9542   // `get_key_comment` can return `nullptr`, that's why this.
9543   std::string key_comment = comment ? comment : "";
9544 
9545   std::string cf_name = Rdb_key_def::parse_comment_for_qualifier(
9546       key_comment, table_arg, tbl_def_arg, per_part_match_found,
9547       RDB_CF_NAME_QUALIFIER);
9548 
9549   if (IF_PARTITIONING(table_arg->part_info,nullptr) != nullptr && !*per_part_match_found) {
9550     // At this point we tried to search for a custom CF name for a partition,
9551     // but none was specified. Therefore default one will be used.
9552     return "";
9553   }
9554 
9555   // If we didn't find any partitioned/non-partitioned qualifiers, return the
9556   // comment itself.  NOTE: this currently handles returning the cf name
9557   // specified in the index comment in the case of no partitions, which doesn't
9558   // use any qualifiers at the moment. (aka its a special case)
9559   if (cf_name.empty() && !key_comment.empty()) {
9560     return key_comment;
9561   }
9562 
9563   return cf_name;
9564 }
9565 
9566 const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) {
9567   DBUG_ASSERT(table_arg->s != nullptr);
9568 
9569   return table_arg->s->comment.str;
9570 }
9571 
9572 /**
9573   Write a new row
9574 
9575   @param[in] buf                new row data to write
9576   @return
9577     HA_EXIT_SUCCESS  OK
9578     other            HA_ERR error code (can be SE-specific)
9579 */
9580 int ha_rocksdb::write_row(const uchar *const buf) {
9581   DBUG_ENTER_FUNC();
9582 
9583   DBUG_ASSERT(buf != nullptr);
9584   DBUG_ASSERT(buf == table->record[0]);
9585   DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
9586 
9587   /*
9588     Note: "buf == table->record[0]" is copied from innodb. I am not aware of
9589     any use cases where this condition is not true.
9590   */
9591   if (table->next_number_field && buf == table->record[0]) {
9592     int err;
9593     if ((err = update_auto_increment())) {
9594       DBUG_RETURN(err);
9595     }
9596   }
9597 
9598   // clear cache at beginning of write for INSERT ON DUPLICATE
9599   // we may get multiple write->fail->read->update if there are multiple
9600   // values from INSERT
9601   m_dup_pk_found = false;
9602 
9603   const int rv = update_write_row(nullptr, buf, skip_unique_check());
9604 
9605   if (rv == 0) {
9606 #ifdef MARIAROCKS_NOT_YET
9607     stats.rows_inserted++;
9608 #endif
9609     update_row_stats(ROWS_INSERTED);
9610   }
9611 
9612   DBUG_RETURN(rv);
9613 }
9614 
9615 /**
9616   Constructing m_last_rowkey (MyRocks key expression) from
9617   before_update|delete image (MySQL row expression).
9618   m_last_rowkey is normally set during lookup phase, such as
9619   rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
9620   these read functions are skipped and update_rows(), delete_rows() are
9621   called without setting m_last_rowkey. This function sets m_last_rowkey
9622   for Read Free Replication.
9623 */
void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
#ifdef MARIAROCKS_NOT_YET
  // Pack the old row's PK columns into m_pk_packed_tuple and copy the
  // result into m_last_rowkey. Only needed for Read Free Replication,
  // where the read path that would normally set m_last_rowkey is skipped.
  if (old_data && use_read_free_rpl()) {
    const int old_pk_size = m_pk_descr->pack_record(
        table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
                       &my_charset_bin);
  }
#endif  // MARIAROCKS_NOT_YET -- body is compiled out in MariaDB builds
}
9634 
9635 /**
9636   Collect update data for primary key
9637 
9638   @param[in, out] row_info            hold all data for update row, such as
9639                                       new row data/old row data
9640   @return
9641     HA_EXIT_SUCCESS  OK
9642     other            HA_ERR error code (can be SE-specific)
9643 */
int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
  int size;

  /*
    Get new row key for any insert, and any update where the pk is not hidden.
    Row key for updates with hidden pk is handled below.
  */
  if (!has_hidden_pk(table)) {
    row_info->hidden_pk_id = 0;

    // Point at the member buffer that pack_record() fills with unpack info;
    // update_write_pk() later reads it when encoding the value slice.
    row_info->new_pk_unpack_info = &m_pk_unpack_info;

    // Pack the new row's PK columns into m_pk_packed_tuple (memcmp form).
    size = m_pk_descr->pack_record(
        table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple,
        row_info->new_pk_unpack_info, false, 0, 0, nullptr);
  } else if (row_info->old_data == nullptr) {
    // INSERT into a table with a hidden PK: allocate the next hidden PK id
    // and pack it into the key buffer.
    row_info->hidden_pk_id = update_hidden_pk_val();
    size =
        m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
  } else {
    /*
      If hidden primary key, rowkey for new record will always be the same as
      before
    */
    size = row_info->old_pk_slice.size();
    memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
    // Also recover the numeric hidden PK id from the existing rowkey.
    int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
    if (err) {
      return err;
    }
  }

  // Expose the packed key (whatever branch produced it) as the new PK slice.
  row_info->new_pk_slice =
      rocksdb::Slice((const char *)m_pk_packed_tuple, size);

  return HA_EXIT_SUCCESS;
}
9681 
9682 /**
9683    Check the specified primary key value is unique and also lock the row
9684 
9685   @param[in] key_id           key index
9686   @param[in] row_info         hold all data for update row, such as old row
9687                               data and new row data
9688   @param[out] found           whether the primary key exists before.
9689   @param[out] pk_changed      whether primary key is changed
9690   @return
9691     HA_EXIT_SUCCESS  OK
9692     other            HA_ERR error code (can be SE-specific)
9693 */
int ha_rocksdb::check_and_lock_unique_pk(const uint key_id,
                                         const struct update_row_info &row_info,
                                         bool *const found) {
  DBUG_ASSERT(found != nullptr);

  // This is only reached for inserts, or for updates where the PK actually
  // changes; hence either there is no old PK or the old/new slices differ.
  DBUG_ASSERT(row_info.old_pk_slice.size() == 0 ||
              row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0);

  /* Ignore PK violations if this is an optimized 'replace into' */
#ifdef MARIAROCKS_NOT_YET
  const bool ignore_pk_unique_check = ha_thd()->lex->blind_replace_into;
#else
  // MariaDB build: the blind 'replace into' optimization is not available.
  const bool ignore_pk_unique_check= false;
#endif

  /*
    Perform a read to determine if a duplicate entry exists. For primary
    keys, a point lookup will be sufficient.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  /*
    To prevent race conditions like below, it is necessary to
    take a lock for a target row. get_for_update() holds a gap lock if
    target key does not exist, so below conditions should never
    happen.

    1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
       -> T2 Put(overwrite) -> T2 commit
    2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
       -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
  */
  const rocksdb::Status s =
      get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
                     ignore_pk_unique_check ? nullptr : &m_retrieved_record);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(
        table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
  }

  // When the unique check is skipped, report "not found" regardless of what
  // the read returned.
  bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound();

  /*
    If the pk key has ttl, we may need to pretend the row wasn't
    found if it is already expired.
  */
  if (key_found && m_pk_descr->has_ttl() &&
      should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                          (row_info.tx->m_snapshot_timestamp
                               ? row_info.tx->m_snapshot_timestamp
                               : static_cast<int64_t>(std::time(nullptr))))) {
    key_found = false;
  }

  if (key_found && row_info.old_data == nullptr && m_insert_with_update) {
    // In INSERT ON DUPLICATE KEY UPDATE ... case, if the insert failed
    // due to a duplicate key, remember the last key and skip the check
    // next time
    m_dup_pk_found = true;

#ifndef DBUG_OFF
    // save it for sanity checking later
    m_dup_pk_retrieved_record.copy(m_retrieved_record.data(),
                                   m_retrieved_record.size(), &my_charset_bin);
#endif
  }

  *found = key_found;

  return HA_EXIT_SUCCESS;
}
9767 
9768 /**
9769    Check the specified secondary key value is unique and also lock the row
9770 
9771   @param[in] key_id           key index
9772   @param[in] row_info         hold all data for update row, such as old row
9773                               data and new row data
9774   @param[out] found           whether specified key value exists before.
9775   @return
9776     HA_EXIT_SUCCESS  OK
9777     other            HA_ERR error code (can be SE-specific)
9778 */
9779 int ha_rocksdb::check_and_lock_sk(const uint key_id,
9780                                   const struct update_row_info &row_info,
9781                                   bool *const found) {
9782   DBUG_ASSERT(found != nullptr);
9783   *found = false;
9784 
9785   /*
9786     Can skip checking this key if none of the key fields have changed.
9787   */
9788   if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
9789     return HA_EXIT_SUCCESS;
9790   }
9791 
9792   KEY *key_info = nullptr;
9793   uint n_null_fields = 0;
9794   uint user_defined_key_parts = 1;
9795 
9796   key_info = &table->key_info[key_id];
9797   user_defined_key_parts = key_info->user_defined_key_parts;
9798   /*
9799     If there are no uniqueness requirements, there's no need to obtain a
9800     lock for this key.
9801   */
9802   if (!(key_info->flags & HA_NOSAME)) {
9803     return HA_EXIT_SUCCESS;
9804   }
9805 
9806   const Rdb_key_def &kd = *m_key_descr_arr[key_id];
9807 
9808   /*
9809     Calculate the new key for obtaining the lock
9810 
9811     For unique secondary indexes, the key used for locking does not
9812     include the extended fields.
9813   */
9814   int size =
9815       kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
9816                      nullptr, false, 0, user_defined_key_parts, &n_null_fields);
9817   if (n_null_fields > 0) {
9818     /*
9819       If any fields are marked as NULL this will never match another row as
9820       to NULL never matches anything else including another NULL.
9821      */
9822     return HA_EXIT_SUCCESS;
9823   }
9824 
9825   const rocksdb::Slice new_slice =
9826       rocksdb::Slice((const char *)m_sk_packed_tuple, size);
9827 
9828   /*
9829      Acquire lock on the old key in case of UPDATE
9830   */
9831   if (row_info.old_data != nullptr) {
9832     size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
9833                           m_sk_packed_tuple_old, nullptr, false, 0,
9834                           user_defined_key_parts);
9835     const rocksdb::Slice old_slice =
9836         rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);
9837 
9838     const rocksdb::Status s =
9839         get_for_update(row_info.tx, kd.get_cf(), old_slice, nullptr);
9840     if (!s.ok()) {
9841       return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
9842                                            m_table_handler);
9843     }
9844 
9845     /*
9846       If the old and new keys are the same we're done since we've already taken
9847       the lock on the old key
9848     */
9849     if (!new_slice.compare(old_slice)) {
9850       return HA_EXIT_SUCCESS;
9851     }
9852   }
9853 
9854   /*
9855     Perform a read to determine if a duplicate entry exists - since this is
9856     a secondary indexes a range scan is needed.
9857 
9858     note: we intentionally don't set options.snapshot here. We want to read
9859     the latest committed data.
9860   */
9861 
9862   const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());
9863 
9864   /*
9865     This iterator seems expensive since we need to allocate and free
9866     memory for each unique index.
9867 
9868     If this needs to be optimized, for keys without NULL fields, the
9869     extended primary key fields can be migrated to the value portion of the
9870     key. This enables using Get() instead of Seek() as in the primary key
9871     case.
9872 
9873     The bloom filter may need to be disabled for this lookup.
9874   */
9875   uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9876   uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9877   rocksdb::Slice lower_bound_slice;
9878   rocksdb::Slice upper_bound_slice;
9879 
9880   const bool total_order_seek = !check_bloom_and_set_bounds(
9881       ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE,
9882       lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
9883   const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
9884 
9885   const rocksdb::Status s =
9886       get_for_update(row_info.tx, kd.get_cf(), new_slice, nullptr);
9887   if (!s.ok() && !s.IsNotFound()) {
9888     return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
9889                                          m_table_handler);
9890   }
9891 
9892   rocksdb::Iterator *const iter = row_info.tx->get_iterator(
9893       kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice,
9894       upper_bound_slice, true /* read current data */,
9895       false /* acquire snapshot */);
9896   /*
9897     Need to scan the transaction to see if there is a duplicate key.
9898     Also need to scan RocksDB and verify the key has not been deleted
9899     in the transaction.
9900   */
9901   iter->Seek(new_slice);
9902   *found = !read_key_exact(kd, iter, all_parts_used, new_slice,
9903                            row_info.tx->m_snapshot_timestamp);
9904   delete iter;
9905 
9906   return HA_EXIT_SUCCESS;
9907 }
9908 
9909 /**
9910    Enumerate all keys to check their uniquess and also lock it
9911 
9912   @param[in] row_info         hold all data for update row, such as old row
9913                               data and new row data
9914   @param[out] pk_changed      whether primary key is changed
9915   @return
9916     HA_EXIT_SUCCESS  OK
9917     other            HA_ERR error code (can be SE-specific)
9918 */
9919 int ha_rocksdb::check_uniqueness_and_lock(
9920     const struct update_row_info &row_info, bool pk_changed) {
9921   /*
9922     Go through each index and determine if the index has uniqueness
9923     requirements. If it does, then try to obtain a row lock on the new values.
9924     Once all locks have been obtained, then perform the changes needed to
9925     update/insert the row.
9926   */
9927   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
9928     bool found;
9929     int rc;
9930 
9931     if (is_pk(key_id, table, m_tbl_def)) {
9932       if (row_info.old_pk_slice.size() > 0 && !pk_changed) {
9933         found = false;
9934         rc = HA_EXIT_SUCCESS;
9935       } else {
9936         rc = check_and_lock_unique_pk(key_id, row_info, &found);
9937       }
9938     } else {
9939       rc = check_and_lock_sk(key_id, row_info, &found);
9940     }
9941 
9942     if (rc != HA_EXIT_SUCCESS) {
9943       return rc;
9944     }
9945 
9946     if (found) {
9947       /* There is a row with this key already, so error out. */
9948       errkey = key_id;
9949       m_dupp_errkey = errkey;
9950 
9951       return HA_ERR_FOUND_DUPP_KEY;
9952     }
9953   }
9954 
9955   return HA_EXIT_SUCCESS;
9956 }
9957 
9958 /**
9959   Check whether secondary key value is duplicate or not
9960 
9961   @param[in] table_arg         the table currently working on
9962   @param[in  key_def           the key_def is being checked
9963   @param[in] key               secondary key storage data
9964   @param[out] sk_info          hold secondary key memcmp datas(new/old)
9965   @return
9966     HA_EXIT_SUCCESS  OK
9967     other            HA_ERR error code (can be SE-specific)
9968 */
9969 
9970 int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
9971                                    const Rdb_key_def &key_def,
9972                                    const rocksdb::Slice *key,
9973                                    struct unique_sk_buf_info *sk_info) {
9974   uint n_null_fields = 0;
9975   const rocksdb::Comparator *index_comp = key_def.get_cf()->GetComparator();
9976 
9977   /* Get proper SK buffer. */
9978   uchar *sk_buf = sk_info->swap_and_get_sk_buf();
9979 
9980   /* Get memcmp form of sk without extended pk tail */
9981   uint sk_memcmp_size =
9982       key_def.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
9983 
9984   sk_info->sk_memcmp_key =
9985       rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
9986 
9987   if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
9988       index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
9989           0) {
9990     return 1;
9991   }
9992 
9993   sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
9994   return 0;
9995 }
9996 
// Add one key/value pair to an ongoing bulk load for index `kd`.
// When `sort` is true the pair is buffered in a sorting merge structure
// (used when input may be unsorted); otherwise it is written directly to
// the SST writer, which requires keys to arrive in order.
int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value, bool sort) {
  DBUG_ENTER_FUNC();
  int res;
  THD *thd = ha_thd();
  // Bail out early if the statement has been killed.
  if (thd && thd->killed) {
    DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
  }

  rocksdb::ColumnFamilyHandle *cf = kd.get_cf();

  // In the case of unsorted inserts, m_sst_info allocated here is not
  // used to store the keys. It is still used to indicate when tables
  // are switched.
  if (m_sst_info == nullptr || m_sst_info->is_done()) {
    m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
                                      kd.get_name(), cf, *rocksdb_db_options,
                                      THDVAR(ha_thd(), trace_sst_api)));
    // Register this handler's bulk load with the transaction so it can be
    // finalized when the transaction wraps up.
    res = tx->start_bulk_load(this, m_sst_info);
    if (res != HA_EXIT_SUCCESS) {
      DBUG_RETURN(res);
    }
  }
  DBUG_ASSERT(m_sst_info);

  if (sort) {
    // Buffer in the per-index merge structure, keyed by the global index id.
    Rdb_index_merge *key_merge;
    DBUG_ASSERT(cf != nullptr);

    res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
    if (res == HA_EXIT_SUCCESS) {
      res = key_merge->add(key, value);
    }
  } else {
    // Already sorted: append straight to the SST file.
    res = m_sst_info->put(key, value);
  }

  DBUG_RETURN(res);
}
10037 
// Finish any bulk load in progress on this handler: close the SST writer
// and ingest the produced files into RocksDB.
int ha_rocksdb::finalize_bulk_load(bool print_client_error) {
  DBUG_ENTER_FUNC();

  int res = HA_EXIT_SUCCESS;

  /* Skip if there are no possible ongoing bulk loads */
  if (m_sst_info) {
    if (m_sst_info->is_done()) {
      // Someone already finished this bulk load; just drop our reference.
      m_sst_info.reset();
      DBUG_RETURN(res);
    }

    Rdb_sst_info::Rdb_sst_commit_info commit_info;

    // Wrap up the current work in m_sst_info and get ready to commit.
    // This transfers the responsibility for the commit over to commit_info.
    res = m_sst_info->finish(&commit_info, print_client_error);
    if (res == 0) {
      // Make sure we have work to do - under race condition we could lose
      // to another thread and end up with no work
      if (commit_info.has_work()) {
        rocksdb::IngestExternalFileOptions opts;
        opts.move_files = true;
        opts.snapshot_consistency = false;
        opts.allow_global_seqno = false;
        opts.allow_blocking_flush = false;

        // Hand the generated SST files to RocksDB for ingestion.
        const rocksdb::Status s = rdb->IngestExternalFile(
            commit_info.get_cf(), commit_info.get_committed_files(), opts);
        if (!s.ok()) {
          if (print_client_error) {
            Rdb_sst_info::report_error_msg(s, nullptr);
          }
          res = HA_ERR_ROCKSDB_BULK_LOAD;
        } else {
          // Mark the list of SST files as committed, otherwise they'll get
          // cleaned up when commit_info destructs
          commit_info.commit();
        }
      }
    }
    m_sst_info.reset();
  }
  DBUG_RETURN(res);
}
10083 
10084 /**
10085   Update an existing primary key record or write a new primary key record
10086 
10087   @param[in] kd                the primary key is being update/write
10088   @param[in] update_row_info   hold all row data, such as old row data and
10089                                new row data
10090   @param[in] pk_changed        whether primary key is changed
10091   @return
10092     HA_EXIT_SUCCESS OK
10093     Other           HA_ERR error code (can be SE-specific)
10094  */
int ha_rocksdb::update_write_pk(const Rdb_key_def &kd,
                                const struct update_row_info &row_info,
                                bool pk_changed) {
  uint key_id = kd.get_keyno();
  bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
  // Bytes of any old-key delete issued below; folded into the transaction's
  // byte accounting at the end.
  ulonglong bytes_written = 0;

  /*
    If the PK has changed, or if this PK uses single deletes and this is an
    update, the old key needs to be deleted. In the single delete case, it
    might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
    resulting in the first PUT(X) showing up.
  */
  if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
                                    can_use_single_delete(key_id)))) {
    const rocksdb::Status s = delete_or_singledelete(
        key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                           m_table_handler);
    } else {
      bytes_written = row_info.old_pk_slice.size();
    }
  }

  // Keep the handler's auto-increment value in sync with the value stored
  // in the row being written.
  if (table->found_next_number_field) {
    update_auto_incr_val_from_field();
  }

  int rc = HA_EXIT_SUCCESS;
  rocksdb::Slice value_slice;
  /* Prepare the new record to be written into RocksDB */
  if ((rc = m_converter->encode_value_slice(
           m_pk_descr, row_info.new_pk_slice, row_info.new_pk_unpack_info,
           !row_info.old_pk_slice.empty(), should_store_row_debug_checksums(),
           m_ttl_bytes, &m_ttl_bytes_updated, &value_slice))) {
    return rc;
  }

  const auto cf = m_pk_descr->get_cf();
  if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
      !hidden_pk) {
    /*
      Write the primary key directly to an SST file using an SstFileWriter
     */
    rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
                       THDVAR(table->in_use, bulk_load_allow_unsorted));
  } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
    /*
      It is responsibility of the user to make sure that the data being
      inserted doesn't violate any unique keys.
    */
    row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
                                                value_slice);
  } else {
    // Regular path: transactional put; a Busy status is treated as a
    // duplicate primary key.
    const bool assume_tracked = can_assume_tracked(ha_thd());
    const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice,
                                    assume_tracked);
    if (!s.ok()) {
      if (s.IsBusy()) {
        errkey = table->s->primary_key;
        m_dupp_errkey = errkey;
        rc = HA_ERR_FOUND_DUPP_KEY;
      } else {
        rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
                                           m_tbl_def, m_table_handler);
      }
    }
  }

  if (rc == HA_EXIT_SUCCESS) {
    // Account for the deleted old key (if any) plus the new key and value.
    row_info.tx->update_bytes_written(
        bytes_written + row_info.new_pk_slice.size() + value_slice.size());
  }
  return rc;
}
10171 
10172 /**
10173   update an existing secondary key record or write a new secondary key record
10174 
10175   @param[in] table_arg    Table we're working on
10176   @param[in] kd           The secondary key being update/write
10177   @param[in] row_info     data structure contains old row data and new row data
10178   @param[in] bulk_load_sk whether support bulk load. Currently it is only
10179                           support for write
10180   @return
10181     HA_EXIT_SUCCESS OK
10182     Other           HA_ERR error code (can be SE-specific)
10183  */
int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
                                const Rdb_key_def &kd,
                                const struct update_row_info &row_info,
                                const bool bulk_load_sk) {
  int new_packed_size;
  int old_packed_size;
  int rc = HA_EXIT_SUCCESS;

  rocksdb::Slice new_key_slice;
  rocksdb::Slice new_value_slice;
  rocksdb::Slice old_key_slice;

  const uint key_id = kd.get_keyno();

  // Size of the old key deleted below (if any); added to the transaction's
  // byte accounting at the end.
  ulonglong bytes_written = 0;

  /*
    Can skip updating this key if none of the key fields have changed and, if
    this table has TTL, the TTL timestamp has not changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
      (!kd.has_ttl() || !m_ttl_bytes_updated)) {
    return HA_EXIT_SUCCESS;
  }

  bool store_row_debug_checksums = should_store_row_debug_checksums();
  // Pack the new key into m_sk_packed_tuple; unpack info (and optional
  // checksums) are accumulated in m_sk_tails.
  new_packed_size =
      kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
                     m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
                     row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);

  if (row_info.old_data != nullptr) {
    // The old value
    old_packed_size = kd.pack_record(
        table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
        &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
        nullptr, m_ttl_bytes);

    /*
      Check if we are going to write the same value. This can happen when
      one does
        UPDATE tbl SET col='foo'
      and we are looking at the row that already has col='foo'.

      We also need to compare the unpack info. Suppose, the collation is
      case-insensitive, and unpack info contains information about whether
      the letters were uppercase and lowercase.  Then, both 'foo' and 'FOO'
      will have the same key value, but different data in unpack_info.

      (note: anyone changing bytewise_compare should take this code into
      account)
    */
    if (old_packed_size == new_packed_size &&
        m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
        !(kd.has_ttl() && m_ttl_bytes_updated) &&
        memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
            0 &&
        memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
               m_sk_tails.get_current_pos()) == 0) {
      return HA_EXIT_SUCCESS;
    }

    /*
      Deleting entries from secondary index should skip locking, but
      be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
       statement)
    */
    old_key_slice = rocksdb::Slice(
        reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);

    // SingleDelete is safe here: each SK entry is written at most once.
    row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                         old_key_slice);

    bytes_written = old_key_slice.size();
  }

  new_key_slice = rocksdb::Slice(
      reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
  new_value_slice =
      rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
                     m_sk_tails.get_current_pos());

  // Bulk load is only taken for pure inserts (no old data to delete).
  if (bulk_load_sk && row_info.old_data == nullptr) {
    rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
  } else {
    row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
                                                new_value_slice);
  }

  row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() +
                                    new_value_slice.size());

  return rc;
}
10279 
10280 /**
10281    Update existing indexes(PK/SKs) or write new indexes(PK/SKs)
10282 
10283    @param[in] row_info    hold all row data, such as old key/new key
10284    @param[in] pk_changed  whether primary key is changed
10285    @return
10286      HA_EXIT_SUCCESS OK
10287      Other           HA_ERR error code (can be SE-specific)
10288  */
10289 int ha_rocksdb::update_write_indexes(const struct update_row_info &row_info,
10290                                      const bool pk_changed) {
10291   int rc;
10292   bool bulk_load_sk;
10293 
10294   // The PK must be updated first to pull out the TTL value.
10295   rc = update_write_pk(*m_pk_descr, row_info, pk_changed);
10296   if (rc != HA_EXIT_SUCCESS) {
10297     return rc;
10298   }
10299 
10300   // Update the remaining indexes. Allow bulk loading only if
10301   // allow_sk is enabled
10302   bulk_load_sk = rocksdb_enable_bulk_load_api &&
10303                  THDVAR(table->in_use, bulk_load) &&
10304                  THDVAR(table->in_use, bulk_load_allow_sk);
10305   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
10306     if (is_pk(key_id, table, m_tbl_def)) {
10307       continue;
10308     }
10309 
10310     rc = update_write_sk(table, *m_key_descr_arr[key_id], row_info,
10311                          bulk_load_sk);
10312     if (rc != HA_EXIT_SUCCESS) {
10313       return rc;
10314     }
10315   }
10316 
10317   return HA_EXIT_SUCCESS;
10318 }
10319 
10320 /**
10321   Update an existing row or write a new row
10322 
10323   @param[in] old_data           nullptr for write, non-null for update
10324   @param[in] new_data           non-null for write/update
  @param[in] skip_unique_check  whether to skip the uniqueness check
10326   @return
10327     HA_EXIT_SUCCESS OK
10328     Other           HA_ERR error code (can be SE-specific)
10329  */
int ha_rocksdb::update_write_row(const uchar *const old_data,
                                 const uchar *const new_data,
                                 const bool skip_unique_check) {
  DBUG_ENTER_FUNC();

  // Bail out early if the statement was killed.
  THD *thd = ha_thd();
  if (thd && thd->killed) {
    DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
  }

  bool pk_changed = false;
  struct update_row_info row_info;

  row_info.old_data = old_data;
  row_info.new_data = new_data;
  row_info.skip_unique_check = skip_unique_check;
  row_info.new_pk_unpack_info = nullptr;
  // Packs the PK of the just-read row into m_last_rowkey (no-op for INSERT,
  // where old_data is nullptr).
  set_last_rowkey(old_data);

  row_info.tx = get_or_create_tx(table->in_use);

  if (old_data != nullptr) {
    // UPDATE path: the old PK is what set_last_rowkey() stored above.
    row_info.old_pk_slice =
        rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());

    /* Determine which indexes need updating. */
    calc_updated_indexes();
  }

  /*
    Get the new row key into row_info.new_pk_slice
   */
  int rc = get_pk_for_update(&row_info);
  if (rc != HA_EXIT_SUCCESS) {
    DBUG_RETURN(rc);
  }

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  if (row_info.old_pk_slice.size() > 0) {
    pk_changed = row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0;
  }

  if (!skip_unique_check) {
    /*
      Check to see if we are going to have failures because of unique
      keys.  Also lock the appropriate key values.
    */
    rc = check_uniqueness_and_lock(row_info, pk_changed);
    if (rc != HA_EXIT_SUCCESS) {
      DBUG_RETURN(rc);
    }
  }

  DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");

  /*
    At this point, all locks have been obtained, and all checks for duplicate
    keys have been performed. No further errors can be allowed to occur from
    here because updates to the transaction will be made and those updates
    cannot be easily removed without rolling back the entire transaction.
  */
  rc = update_write_indexes(row_info, pk_changed);
  if (rc != HA_EXIT_SUCCESS) {
    DBUG_RETURN(rc);
  }

  // Bump the per-transaction counters used for statistics / binlog notes.
  if (old_data != nullptr) {
    row_info.tx->incr_update_count();
  } else {
    row_info.tx->incr_insert_count();
  }

  row_info.tx->log_table_write_op(m_tbl_def);

  // With bulk-load/commit-in-the-middle, commit once enough rows accumulated.
  if (do_bulk_commit(row_info.tx)) {
    DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10413 
10414 /*
10415  Setting iterator upper/lower bounds for Seek/SeekForPrev.
10416  This makes RocksDB to avoid scanning tombstones outside of
10417  the given key ranges, when prefix_same_as_start=true was not passed
10418  (when prefix bloom filter can not be used).
10419  Inversing upper/lower bound is necessary on reverse order CF.
10420  This covers HA_READ_PREFIX_LAST* case as well. For example,
10421  if given query eq condition was 12 bytes and condition was
10422  0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST,
10423  eq_cond_len was 11 (see calc_eq_cond_len() for details).
10424  If the index was reverse order, upper bound would be
10425  0x0000b3eb003f65c5e78857, and lower bound would be
10426  0x0000b3eb003f65c5e78859. These cover given eq condition range.
10427 
10428   @param lower_bound_buf  IN Buffer for lower bound
10429   @param upper_bound_buf  IN Buffer for upper bound
10430 
10431   @param outer_u
10432 */
10433 void ha_rocksdb::setup_iterator_bounds(
10434     const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len,
10435     uchar *const lower_bound, uchar *const upper_bound,
10436     rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
10437   // If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be
10438   // able to get better bounds just by using index id directly.
10439   if (eq_cond.size() <= Rdb_key_def::INDEX_NUMBER_SIZE) {
10440     DBUG_ASSERT(bound_len == Rdb_key_def::INDEX_NUMBER_SIZE);
10441     uint size;
10442     kd.get_infimum_key(lower_bound, &size);
10443     DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10444     kd.get_supremum_key(upper_bound, &size);
10445     DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10446   } else {
10447     DBUG_ASSERT(bound_len <= eq_cond.size());
10448     memcpy(upper_bound, eq_cond.data(), bound_len);
10449     kd.successor(upper_bound, bound_len);
10450     memcpy(lower_bound, eq_cond.data(), bound_len);
10451     kd.predecessor(lower_bound, bound_len);
10452   }
10453 
10454   if (kd.m_is_reverse_cf) {
10455     *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10456     *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10457   } else {
10458     *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10459     *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10460   }
10461 }
10462 
10463 /*
10464   Open a cursor
10465 */
10466 
void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
                                     rocksdb::Slice *const slice,
                                     const bool use_all_keys,
                                     const uint eq_cond_len) {
  DBUG_ASSERT(slice->size() >= eq_cond_len);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  // Assume the bloom filter cannot be used until proven otherwise below.
  bool skip_bloom = true;

  // The equality-condition prefix of the lookup key.
  const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
  // The size of m_scan_it_lower_bound (and upper) is technically
  // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers.  Rather
  // than recalculating that number, we pass in the max of eq_cond_len and
  // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
  // max_packed_sk_len, hence ensuring no buffer overrun.
  //
  // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
  // used.
  if (check_bloom_and_set_bounds(
          ha_thd(), kd, eq_cond, use_all_keys,
          std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_NUMBER_SIZE),
          m_scan_it_lower_bound, m_scan_it_upper_bound,
          &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
    skip_bloom = false;
  }

  /*
    In some cases, setup_scan_iterator() is called multiple times from
    the same query but bloom filter can not always be used.
    Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
    (id1, id2).
     select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
    '100');
    In this case, setup_scan_iterator() is called twice, the first time is for
    (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
    '100').
    If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
    for the
    first condition but not for the second condition.
    If bloom filter condition is changed, currently it is necessary to destroy
    and
    re-create Iterator.
  */
  if (m_scan_it_skips_bloom != skip_bloom) {
    release_scan_iterator();
  }

  /*
    SQL layer can call rnd_init() multiple times in a row.
    In that case, re-use the iterator, but re-position it at the table start.
  */
  if (!m_scan_it) {
    const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
    if (commit_in_the_middle()) {
      // With commit-in-the-middle the transaction's snapshot may be
      // released mid-scan, so the iterator must hold its own explicit
      // snapshot (released in release_scan_iterator()).
      DBUG_ASSERT(m_scan_it_snapshot == nullptr);
      m_scan_it_snapshot = rdb->GetSnapshot();

      auto read_opts = rocksdb::ReadOptions();
      // TODO(mung): set based on WHERE conditions
      read_opts.total_order_seek = true;
      read_opts.snapshot = m_scan_it_snapshot;
      m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
    } else {
      m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
                                   m_scan_it_lower_bound_slice,
                                   m_scan_it_upper_bound_slice);
    }
    // Remember the bloom decision so the check above can detect a change.
    m_scan_it_skips_bloom = skip_bloom;
  }
}
10538 
10539 void ha_rocksdb::release_scan_iterator() {
10540   delete m_scan_it;
10541   m_scan_it = nullptr;
10542 
10543   if (m_scan_it_snapshot) {
10544     rdb->ReleaseSnapshot(m_scan_it_snapshot);
10545     m_scan_it_snapshot = nullptr;
10546   }
10547 }
10548 
10549 void ha_rocksdb::setup_iterator_for_rnd_scan() {
10550   uint key_size;
10551 
10552   int key_start_matching_bytes =
10553       m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
10554 
10555   rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
10556 
10557   setup_scan_iterator(*m_pk_descr, &table_key, false, key_start_matching_bytes);
10558   m_scan_it->Seek(table_key);
10559   m_skip_scan_it_next_call = true;
10560 }
10561 
10562 /**
10563   @return
10564     HA_EXIT_SUCCESS  OK
10565     other            HA_ERR error code (can be SE-specific)
10566 */
int ha_rocksdb::rnd_init(bool scan) {
  DBUG_ENTER_FUNC();

  // Bail out early if the statement was killed.
  THD *thd = ha_thd();
  if (thd && thd->killed) {
    DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
  }

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  // when this table is being updated, decode all fields
  m_converter->setup_field_decoders(table->read_set,
                                    m_lock_rows == RDB_LOCK_WRITE);

  if (scan) {
    // Remember whether this scan created the snapshot; rnd_next() uses this
    // to decide whether a failed read may be retried with a fresh snapshot.
    m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
    setup_iterator_for_rnd_scan();
  } else {
    /* We don't need any preparations for rnd_pos() calls. */
  }

  // If m_lock_rows is on then we will be doing a get_for_update when accessing
  // the index, so don't acquire the snapshot right away.  Otherwise acquire
  // the snapshot immediately.
  tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10595 
10596 /**
10597   @return
10598     HA_EXIT_SUCCESS  OK
10599     other            HA_ERR error code (can be SE-specific)
10600 */
10601 int ha_rocksdb::rnd_next(uchar *const buf) {
10602   DBUG_ENTER_FUNC();
10603 
10604   int rc;
10605   for (;;) {
10606     rc = rnd_next_with_direction(buf, true);
10607     if (!should_recreate_snapshot(rc, m_rnd_scan_is_new_snapshot)) {
10608       break; /* exit the loop */
10609     }
10610     // release the snapshot and iterator and then regenerate them
10611     Rdb_transaction *tx = get_or_create_tx(table->in_use);
10612     tx->release_snapshot();
10613     release_scan_iterator();
10614     setup_iterator_for_rnd_scan();
10615   }
10616 
10617   m_rnd_scan_is_new_snapshot = false;
10618 
10619   if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
10620 
10621   DBUG_RETURN(rc);
10622 }
10623 
10624 /*
10625   See also secondary_index_read().
10626 */
int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;
  THD *thd = ha_thd();

  table->status = STATUS_NOT_FOUND;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (!m_scan_it || !is_valid(m_scan_it)) {
    /*
      We can get here when SQL layer has called

        h->index_init(PRIMARY);
        h->index_read_map(full index tuple, HA_READ_KEY_EXACT);

      In this case, we should return EOF.
    */
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  // Loop until a visible row is produced; TTL-expired rows and rows deleted
  // since the snapshot was taken are skipped via 'continue'.
  for (;;) {
    DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd");
    if (thd && thd->killed) {
      rc = HA_ERR_QUERY_INTERRUPTED;
      break;
    }

    // The first call after a Seek must consume the current position instead
    // of advancing past it (flag set by setup_iterator_for_rnd_scan()).
    if (m_skip_scan_it_next_call) {
      m_skip_scan_it_next_call = false;
    } else {
      if (move_forward) {
        m_scan_it->Next(); /* this call cannot fail */
      } else {
        m_scan_it->Prev(); /* this call cannot fail */
      }
    }

    if (!is_valid(m_scan_it)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    /* check if we're out of this table */
    const rocksdb::Slice key = m_scan_it->key();
    if (!m_pk_descr->covers_key(key)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    if (m_lock_rows != RDB_LOCK_NONE) {
      /*
        Lock the row we've just read.

        Now we call get_for_update which will 1) Take a lock and 2) Will fail
        if the row was deleted since the snapshot was taken.
      */
      Rdb_transaction *const tx = get_or_create_tx(table->in_use);
      DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");

      // Skip rows whose TTL has expired relative to this snapshot.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
                              tx->m_snapshot_timestamp)) {
        continue;
      }

      const rocksdb::Status s =
          get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
      // A row deleted by a concurrent transaction may simply be skipped
      // (depending on isolation level) rather than reported as an error.
      if (s.IsNotFound() &&
          should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) {
        continue;
      }

      if (!s.ok()) {
        DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
                                         m_tbl_def, m_table_handler));
      }

      // If we called get_for_update() use the value from that call not from
      // the iterator as it may be stale since we don't have a snapshot
      // when m_lock_rows is not RDB_LOCK_NONE.
      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, buf);
    } else {
      // Use the value from the iterator
      rocksdb::Slice value = m_scan_it->value();

      // Skip rows whose TTL has expired relative to this snapshot.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(
              *m_pk_descr, value,
              get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
        continue;
      }

      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, &value, buf);
    }

    table->status = 0;
    break;
  }

  if (!rc) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_next++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
10740 
int ha_rocksdb::rnd_end() {
  DBUG_ENTER_FUNC();

  // Scan finished: free the iterator and, if any, its explicit snapshot.
  release_scan_iterator();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10748 
10749 /**
10750   @return
10751     HA_EXIT_SUCCESS  OK
10752     other            HA_ERR error code (can be SE-specific)
10753 */
10754 int ha_rocksdb::index_init(uint idx, bool sorted) {
10755   DBUG_ENTER_FUNC();
10756 
10757   THD *thd = ha_thd();
10758   if (thd && thd->killed) {
10759     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10760   }
10761 
10762   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10763   DBUG_ASSERT(tx != nullptr);
10764 
10765    m_start_range= NULL;
10766   // when this table is being updated, decode all fields
10767   m_converter->setup_field_decoders(table->read_set,
10768                                     m_lock_rows == RDB_LOCK_WRITE);
10769 
10770   if (!m_keyread_only) {
10771     m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap);
10772   }
10773 
10774   // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
10775   // when accessing the index, so don't acquire the snapshot right away.
10776   // Otherwise acquire the snapshot immediately.
10777   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10778 
10779   active_index = idx;
10780 
10781   DBUG_RETURN(HA_EXIT_SUCCESS);
10782 }
10783 
10784 /**
10785   @return
10786     HA_EXIT_SUCCESS      OK
10787 */
int ha_rocksdb::index_end() {
  DBUG_ENTER_FUNC();

  release_scan_iterator();

  // Free the covering-column bitmap built by index_init() for this index.
  bitmap_free(&m_lookup_bitmap);

  // Reset per-index-scan state.
  active_index = MAX_KEY;
  in_range_check_pushed_down = FALSE;
  m_start_range= NULL;

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10801 
10802 /**
10803   @return
10804     HA_EXIT_SUCCESS  OK
10805     other            HA_ERR error code (can be SE-specific)
10806 */
10807 int ha_rocksdb::truncate() {
10808   DBUG_ENTER_FUNC();
10809 
10810   DBUG_ASSERT(m_tbl_def != nullptr);
10811 
10812   // Save table name to use later
10813   std::string table_name = m_tbl_def->full_tablename();
10814 
10815   // Delete current table
10816   int err = delete_table(m_tbl_def);
10817   if (err != HA_EXIT_SUCCESS) {
10818     DBUG_RETURN(err);
10819   }
10820 
10821   // Reset auto_increment_value to 1 if auto-increment feature is enabled
10822   // By default, the starting valid value for auto_increment_value is 1
10823   DBUG_RETURN(create_table(
10824       table_name, table,
10825       table->found_next_number_field ? 1 : 0 /* auto_increment_value */));
10826 }
10827 
10828 /*
10829   Delete the row we've last read. The row is also passed as parameter.
10830 
10831   @detail
10832     The caller guarantees table buf points to the row that was just read.
10833     The row is either table->record[0] or table->record[1].
    (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)".)
10835 
10836   @return
10837     HA_EXIT_SUCCESS  OK
10838     other            HA_ERR error code (can be SE-specific)
10839 */
int ha_rocksdb::delete_row(const uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  // Packs the PK of the row being deleted into m_last_rowkey.
  set_last_rowkey(buf);

  rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  ulonglong bytes_written = 0;

  // Delete the PK record first (this is the one requiring a lock).
  const uint index = pk_index(table, m_tbl_def);
  rocksdb::Status s =
      delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
  if (!s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  } else {
    bytes_written = key_slice.size();
  }

  // The hidden PK value is needed to repack secondary keys, but only if
  // secondary indexes actually exist.
  longlong hidden_pk_id = 0;
  if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) {
    int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      DBUG_RETURN(err);
    }
  }

  // Delete the record for every secondary index
  for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
    if (!is_pk(i, table, m_tbl_def)) {
      int packed_size;
      const Rdb_key_def &kd = *m_key_descr_arr[i];
      packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
                                   nullptr, false, hidden_pk_id);
      rocksdb::Slice secondary_key_slice(
          reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
      /* Deleting on secondary key doesn't need any locks: */
      tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                  secondary_key_slice);
      bytes_written += secondary_key_slice.size();
    }
  }

  tx->incr_delete_count();
  tx->log_table_write_op(m_tbl_def);

  // With bulk-load/commit-in-the-middle, commit once enough rows accumulated.
  if (do_bulk_commit(tx)) {
    DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
  }
#ifdef MARIAROCKS_NOT_YET
  stats.rows_deleted++;
#endif
  update_row_stats(ROWS_DELETED);
  tx->update_bytes_written(bytes_written);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10899 
10900 rocksdb::Status ha_rocksdb::delete_or_singledelete(
10901     uint index, Rdb_transaction *const tx,
10902     rocksdb::ColumnFamilyHandle *const column_family,
10903     const rocksdb::Slice &key) {
10904   const bool assume_tracked = can_assume_tracked(ha_thd());
10905   if (can_use_single_delete(index)) {
10906     return tx->single_delete(column_family, key, assume_tracked);
10907   }
10908   return tx->delete_key(column_family, key, assume_tracked);
10909 }
10910 
10911 void ha_rocksdb::update_stats(void) {
10912   DBUG_ENTER_FUNC();
10913 
10914   stats.records = 0;
10915   stats.index_file_length = 0ul;
10916   stats.data_file_length = 0ul;
10917   stats.mean_rec_length = 0;
10918 
10919   for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10920     if (is_pk(i, table, m_tbl_def)) {
10921       stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
10922       stats.records = m_pk_descr->m_stats.m_rows;
10923     } else {
10924       stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
10925     }
10926   }
10927 
10928   DBUG_VOID_RETURN;
10929 }
10930 
10931 /**
10932   @return
10933     HA_EXIT_SUCCESS  OK
10934     HA_EXIT_FAILURE  Error
10935 */
int ha_rocksdb::info(uint flag) {
  DBUG_ENTER_FUNC();

  if (!table) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (flag & HA_STATUS_VARIABLE) {
    /*
      Test only to simulate corrupted stats
    */
    DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
                    m_pk_descr->m_stats.m_actual_disk_size =
                        -m_pk_descr->m_stats.m_actual_disk_size;);

    update_stats();

    /*
      If any stats are negative due to bad cached stats, re-run analyze table
      and re-retrieve the stats.
    */
    if (static_cast<longlong>(stats.data_file_length) < 0 ||
        static_cast<longlong>(stats.index_file_length) < 0 ||
        static_cast<longlong>(stats.records) < 0) {
      if (calculate_stats_for_table()) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      update_stats();
    }

    // if number of records is hardcoded, we do not want to force computation
    // of memtable cardinalities
    if (stats.records == 0 || (rocksdb_force_compute_memtable_stats &&
                               rocksdb_debug_optimizer_n_rows == 0)) {
      // First, compute SST files stats
      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = get_range(pk_index(table, m_tbl_def), buf);
      uint64_t sz = 0;
      uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
      // recompute SST files stats only if records count is 0
      if (stats.records == 0) {
        rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz,
                                 include_flags);
        // Estimate the row count from the on-disk size of the PK range.
        stats.records += sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
        stats.data_file_length += sz;
      }
      // Second, compute memtable stats. This call is expensive, so cache
      // values computed for some time.
      uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime;
      uint64_t time = (cachetime == 0) ? 0 : my_interval_timer() / 1000;
      if (cachetime == 0 ||
          time > m_table_handler->m_mtcache_last_update + cachetime) {
        uint64_t memtableCount;
        uint64_t memtableSize;

        // the stats below are calculated from skiplist which is a
        // probabilistic data structure, so the results vary between test runs
        // it also can return 0 for quite a large tables which means that
        // cardinality for memtable only indexes will be reported as 0
        rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r,
                                         &memtableCount, &memtableSize);

        // Atomically update all of these fields at the same time
        if (cachetime > 0) {
          // Only the thread that raises the counter from 0 writes the cache;
          // concurrent readers skip the update rather than block.
          if (m_table_handler->m_mtcache_lock.fetch_add(
                  1, std::memory_order_acquire) == 0) {
            m_table_handler->m_mtcache_count = memtableCount;
            m_table_handler->m_mtcache_size = memtableSize;
            m_table_handler->m_mtcache_last_update = time;
          }
          m_table_handler->m_mtcache_lock.fetch_sub(1,
                                                    std::memory_order_release);
        }

        stats.records += memtableCount;
        stats.data_file_length += memtableSize;
      } else {
        // Cached data is still valid, so use it instead
        stats.records += m_table_handler->m_mtcache_count;
        stats.data_file_length += m_table_handler->m_mtcache_size;
      }

      // Do like InnoDB does. stats.records=0 confuses the optimizer
      if (stats.records == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) {
        stats.records++;
      }
    }

    if (rocksdb_debug_optimizer_n_rows > 0)
      stats.records = rocksdb_debug_optimizer_n_rows;

    if (stats.records != 0) {
      stats.mean_rec_length = stats.data_file_length / stats.records;
    }
  }

  if (flag & HA_STATUS_CONST) {
    ref_length = m_pk_descr->max_storage_fmt_length();

    // Compute rec_per_key (average rows sharing a key prefix) for every
    // real (non-hidden-PK) index.
    for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
      if (is_hidden_pk(i, table, m_tbl_def)) {
        continue;
      }
      KEY *const k = &table->key_info[i];
      for (uint j = 0; j < k->ext_key_parts; j++) {
        const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
        uint x;

        if (k_stats.m_distinct_keys_per_prefix.size() > j &&
            k_stats.m_distinct_keys_per_prefix[j] > 0) {
          x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j];
          /*
            If the number of rows is less than the number of prefixes (due to
            sampling), the average number of rows with the same prefix is 1.
           */
          if (x == 0) {
            x = 1;
          }
        } else {
          x = 0;
        }
        if (x > stats.records) x = stats.records;
        if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
            rocksdb_debug_optimizer_n_rows > 0) {
          // Fake cardinality implementation. For example, (idx1, idx2, idx3)
          // index
          /*
            Make MariaRocks behave the same way as MyRocks does:
            1. SQL layer thinks that unique secondary indexes are not extended
               with PK columns (both in MySQL and MariaDB)
            2. MariaDB also thinks that indexes with partially-covered columns
               are not extended with PK columns. Use the same number of
               keyparts that MyRocks would use.
          */
          uint ext_key_parts2;
          if (k->flags & HA_NOSAME)
            ext_key_parts2= k->ext_key_parts;  // This is #1
          else
            ext_key_parts2= m_key_descr_arr[i]->get_key_parts(); // This is #2.

          // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1.
          // rec_per_key for the whole index is 1, and multiplied by 2^n if
          // n suffix columns of the index are not used.
          x = 1 << (ext_key_parts2 - j - 1);
        }
        k->rec_per_key[j] = x;
      }
    }

    stats.create_time = m_tbl_def->get_create_time();
  }

  if (flag & HA_STATUS_TIME) {
    stats.update_time = m_tbl_def->m_update_time;
  }

  if (flag & HA_STATUS_ERRKEY) {
    /*
      Currently we support only primary keys so we know which key had a
      uniqueness violation.
    */
    errkey = m_dupp_errkey;
    dup_ref = m_pk_tuple;  // TODO(?): this should store packed PK.
  }

  if (flag & HA_STATUS_AUTO) {
    stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11108 
void ha_rocksdb::position(const uchar *const record) {
  DBUG_ENTER_FUNC();

  // For hidden-PK tables the id must come from the last-read row key.
  longlong hidden_pk_id = 0;
  if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
    DBUG_ASSERT(false);  // should never reach here
  }

  /*
    Get packed primary key value from the record.

    (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
    handle most cases, but there is an exception to this: when slave applies
    RBR events, it fills the record and then calls position(); rnd_pos())

    Also note that we *can* rely on PK column values being available. This is
    because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
    When that is true, table->prepare_for_position() adds PK columns into the
    read set (this may potentially disable index-only access when PK column
    cannot be restored from its mem-comparable form in the secondary indexes).
  */
  const uint packed_size = m_pk_descr->pack_record(
      table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);

  /*
    It could be that mem-comparable form of PK occupies less than ref_length
    bytes. Fill the remainder with zeros.
  */
  if (ref_length > packed_size) {
    memset(ref + packed_size, 0, ref_length - packed_size);
  }

  DBUG_VOID_RETURN;
}
11143 
11144 /**
11145   @return
11146     HA_EXIT_SUCCESS  OK
11147     other            HA_ERR error code (can be SE-specific)
11148 */
11149 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
11150   DBUG_ENTER_FUNC();
11151 
11152   int rc;
11153   size_t len;
11154 
11155 #ifdef MARIAROCKS_NOT_YET
11156   stats.rows_requested++;
11157 #endif
11158   len = m_pk_descr->key_length(table,
11159                                rocksdb::Slice((const char *)pos, ref_length));
11160   if (len == size_t(-1)) {
11161     DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */
11162   }
11163 
11164   rc = get_row_by_rowid(buf, pos, len);
11165 
11166   if (!rc) {
11167 #ifdef MARIAROCKS_NOT_YET
11168     stats.rows_read++;
11169 #endif
11170     update_row_stats(ROWS_READ);
11171   }
11172 
11173   DBUG_RETURN(rc);
11174 }
11175 
11176 /*
11177   @brief
11178     Calculate (if needed) the bitmap of indexes that are modified by the
11179     current query
11180 
11181   @detail
11182     The calculation done by checking index definitions against the
11183   table->write_set
11184     column bitmap.
11185 */
11186 
11187 void ha_rocksdb::calc_updated_indexes() {
11188   if (!m_update_scope_is_valid) {
11189     m_update_scope_is_valid = true;
11190     m_update_scope.clear_all();
11191 
11192     for (uint keynr = 0; keynr < table->s->keys; keynr++) {
11193       const Rdb_key_def &kd = *m_key_descr_arr[keynr];
11194       /* Walk over all key parts, including the "extended key" suffix */
11195       const uint key_parts = kd.get_key_parts();
11196       for (uint kp = 0; kp < key_parts; kp++) {
11197         if (has_hidden_pk(table) && kp + 1 == key_parts) break;
11198 
11199         Field *const field = kd.get_table_field_for_part_no(table, kp);
11200         if (bitmap_is_set(table->write_set, field->field_index)) {
11201           m_update_scope.set_bit(keynr);
11202           break;
11203         }
11204       }
11205     }
11206   }
11207 }
11208 
11209 /**
11210   Update an existing row
11211   @param[in] old_data           nullptr for write, non-null for update
11212   @param[in] new_data           non-null for write/update
11213   @return
11214     HA_EXIT_SUCCESS  OK
11215     other            HA_ERR error code (can be SE-specific)
11216 */
11217 int ha_rocksdb::update_row(const uchar *const old_data,
11218                            const uchar *const new_data) {
11219   DBUG_ENTER_FUNC();
11220 
11221   DBUG_ASSERT(old_data != nullptr);
11222   DBUG_ASSERT(new_data != nullptr);
11223   DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
11224   /*
11225     old_data points to record we're updating. It is the same as the record
11226     we've just read (for multi-table UPDATE, too, because SQL layer will make
11227     an rnd_pos() call to re-read the record before calling update_row())
11228   */
11229   DBUG_ASSERT(new_data == table->record[0]);
11230 
11231   const int rv = update_write_row(old_data, new_data, skip_unique_check());
11232 
11233   if (rv == 0) {
11234 #ifdef MARIAROCKS_NOT_YET
11235     stats.rows_updated++;
11236 #endif
11237     update_row_stats(ROWS_UPDATED);
11238   }
11239 
11240   DBUG_RETURN(rv);
11241 }
11242 
11243 /*
11244   MariaDB's temporary: MyRocks has this function in sql/handler.cc:
11245 */
11246 
11247 bool can_hold_read_locks_on_select(THD *thd, thr_lock_type lock_type)
11248 {
11249   return (lock_type == TL_READ_WITH_SHARED_LOCKS
11250           || lock_type == TL_READ_NO_INSERT
11251           || (lock_type != TL_IGNORE
11252             && thd->lex->sql_command != SQLCOM_SELECT));
11253 }
11254 
11255 
/*
  The following function was copied from ha_blackhole::store_lock:

  Called by the SQL layer before locking the table. This makes two
  decisions:
  1. MyRocks' internal row-locking mode for the statement (m_lock_rows).
  2. The THR_LOCK type reported back to the SQL layer (m_db_lock.type),
     possibly downgraded to allow more concurrency.
*/
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    // Any write lock request => take row write locks.
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    // SELECT ... LOCK IN SHARE MODE => take row read locks.
    m_lock_rows = RDB_LOCK_READ;
  } else if (lock_type != TL_IGNORE) {
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          can_hold_read_locks_on_select(thd, lock_type)) {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
11333 
11334 void ha_rocksdb::read_thd_vars(THD *const thd) {
11335   m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
11336   m_converter->set_verify_row_debug_checksums(
11337       THDVAR(thd, verify_row_debug_checksums));
11338   m_checksums_pct = THDVAR(thd, checksums_pct);
11339 }
11340 
11341 ulonglong ha_rocksdb::table_flags() const
11342 {
11343   DBUG_ENTER_FUNC();
11344 
11345   /*
11346     HA_BINLOG_STMT_CAPABLE
11347     Upstream:  MyRocks advertises itself as it supports SBR, but has additional
11348       checks in ha_rocksdb::external_lock()/ start_stmt() which will return an
11349       error if one tries to run the statement.
11350       Exceptions: @@rocksdb_unsafe_for_binlog or we are an SQL slave thread.
11351 
11352     MariaDB: Inform the upper layer we don't support SBR, so it switches to RBR
11353       if possible. The exceptions are the same as with the upstream.
11354 
11355     HA_REC_NOT_IN_SEQ
11356       If we don't set it, filesort crashes, because it assumes rowids are
11357       1..8 byte numbers
11358     HA_PRIMARY_KEY_IN_READ_INDEX
11359       This flag is always set, even for tables that:
11360       - have no PK
11361       - have some (or all) of PK that can't be decoded from the secondary
11362         index.
11363   */
11364   THD *thd= ha_thd();
11365   DBUG_RETURN(HA_BINLOG_ROW_CAPABLE |
11366               ((thd && (THDVAR(thd, unsafe_for_binlog) ||thd->rgi_slave))?
11367                 HA_BINLOG_STMT_CAPABLE : 0) |
11368               HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
11369               HA_PRIMARY_KEY_IN_READ_INDEX |
11370               HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
11371               HA_PARTIAL_COLUMN_READ |
11372               HA_TABLE_SCAN_ON_INDEX);
11373 }
11374 
11375 
11376 
11377 /**
11378   @return
11379     HA_EXIT_SUCCESS  OK
11380     other            HA_ERR error code (cannot be SE-specific)
11381 */
int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);

  int res = HA_EXIT_SUCCESS;
#if 0
  // MariaDB uses a different way to implement this, see ha_rocksdb::table_flags

  int binlog_format = my_core::thd_binlog_format(thd);
  bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);

  /*
    A note about (*) below: In fb/mysql, LOCK TABLE myrocks_table WRITE
    - produces an error when @@binlog_format='ROW'.
    - does not produce an error with @@binlog_format='STATEMENT'

    We cannot easily provide this behavior in MariaDB, because LOCK ROWS
    changes @@binlog_format for the duration of the statement.

    Instead we will make the check in ha_rocksdb::start_stmt and will produce
    an error in the first DML statement done under LOCK TABLE

    (**) - The same happens for other "DDL-type" commands, so allow them here
    as well.
  */
  if (lock_type == F_WRLCK && !thd->rgi_slave && !unsafe_for_binlog &&
      binlog_format != BINLOG_FORMAT_ROW &&
      binlog_format != BINLOG_FORMAT_UNSPEC &&
      thd->lex->sql_command != SQLCOM_LOCK_TABLES &&  // (*)
      thd->lex->sql_command != SQLCOM_ANALYZE &&   // (**)
      thd->lex->sql_command != SQLCOM_OPTIMIZE &&  // (**)
      thd->lex->sql_command != SQLCOM_FLUSH &&  // (**)
      my_core::thd_binlog_filter_ok(thd)) {
    my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }
#endif

  /* F_UNLCK: the statement is done with this table. */
  if (lock_type == F_UNLCK) {
    Rdb_transaction *const tx = get_tx_from_thd(thd);

    if (tx) {
      tx->io_perf_end_and_record(&m_io_perf);
      tx->m_n_mysql_tables_in_use--;
      if (tx->m_n_mysql_tables_in_use == 0 &&
          !my_core::thd_test_options(thd,
                                     OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
        /*
          Do like InnoDB: when we get here, it's time to commit a
          single-statement transaction.

          If the statement involved multiple tables, this code will be executed
          for each of them, but that's ok because non-first tx->commit() calls
          will be no-ops.
        */
        if (tx->commit_or_rollback()) {
          res = HA_ERR_INTERNAL_ERROR;
        }
      }
    }
  } else {
    /* Only READ COMMITTED and REPEATABLE READ are supported. */
    if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
        my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
      my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0),
               tx_isolation_names[my_core::thd_tx_isolation(thd)]);
      DBUG_RETURN(HA_ERR_UNSUPPORTED);
    }
    /*
      It's nice to do the following on start of every statement. The problem
      is, handler->start_stmt() is not called for INSERTs.
      So, we put this code here.
    */
    Rdb_transaction *const tx = get_or_create_tx(thd);
    read_thd_vars(thd);

    /* Force recalculation of the updated-indexes bitmap for this statement */
    m_update_scope_is_valid = false;

    if (skip_unique_check()) {
      /*
        Unique checks are disabled; statements that rely on duplicate-key
        detection (REPLACE, INSERT ... ON DUPLICATE KEY UPDATE) would
        silently misbehave, so reject them.
      */
      if ((thd->lex->sql_command == SQLCOM_INSERT ||
           thd->lex->sql_command == SQLCOM_LOAD ||
           thd->lex->sql_command == SQLCOM_REPLACE) &&
          (thd->lex->duplicates == DUP_REPLACE ||
           thd->lex->duplicates == DUP_UPDATE)) {
        my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query());
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
    }

    if (lock_type == F_WRLCK) {
      if (tx->is_tx_read_only()) {
        my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }

#ifdef MARIADB_NOT_YET
      if (thd->get_explicit_snapshot()) {
        my_error(ER_UPDATES_WITH_EXPLICIT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
#endif

      /*
        SQL layer signals us to take a write lock. It does so when starting DML
        statement. We should put locks on the rows we're reading.

        Note: sometimes, external_lock() can be called without a prior
        ::store_lock call.  That's why we need to set lock_* members here, too.
      */
      m_lock_rows = RDB_LOCK_WRITE;

      if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
          thd->lex->sql_command == SQLCOM_DROP_INDEX ||
          thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
        tx->m_ddl_transaction = true;
      }
    }
    tx->m_n_mysql_tables_in_use++;
    rocksdb_register_tx(rocksdb_hton, thd, tx);
    tx->io_perf_start(&m_io_perf);
  }

  DBUG_RETURN(res);
}
11506 
11507 /**
11508   @note
11509   A quote from ha_innobase::start_stmt():
11510   <quote>
11511   MySQL calls this function at the start of each SQL statement inside LOCK
11512   TABLES. Inside LOCK TABLES the ::external_lock method does not work to
11513   mark SQL statement borders.
11514   </quote>
11515 
11516   @return
11517     HA_EXIT_SUCCESS  OK
11518 */
11519 
11520 int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
11521   DBUG_ENTER_FUNC();
11522 
11523   DBUG_ASSERT(thd != nullptr);
11524 
11525   Rdb_transaction *const tx = get_or_create_tx(thd);
11526   read_thd_vars(thd);
11527   rocksdb_register_tx(ht, thd, tx);
11528   tx->io_perf_start(&m_io_perf);
11529 
11530   DBUG_RETURN(HA_EXIT_SUCCESS);
11531 }
11532 
11533 rocksdb::Range get_range(uint32_t i,
11534                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11535                          int offset1, int offset2) {
11536   uchar *buf_begin = buf;
11537   uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
11538   rdb_netbuf_store_index(buf_begin, i + offset1);
11539   rdb_netbuf_store_index(buf_end, i + offset2);
11540 
11541   return rocksdb::Range(
11542       rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
11543       rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
11544 }
11545 
/* Convenience overload: range for a key definition's index number. */
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
11551 
11552 rocksdb::Range get_range(const Rdb_key_def &kd,
11553                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
11554   if (kd.m_is_reverse_cf) {
11555     return myrocks::get_range(kd, buf, 1, 0);
11556   } else {
11557     return myrocks::get_range(kd, buf, 0, 1);
11558   }
11559 }
11560 
/* Range covering the i-th index of this table. */
rocksdb::Range ha_rocksdb::get_range(
    const int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
11565 
11566 /*
11567  This function is called with total_order_seek=true, but
11568  upper/lower bound setting is not necessary.
11569  Boundary set is useful when there is no matching key,
11570  but in drop_index_thread's case, it means index is marked as removed,
11571  so no further seek will happen for the index id.
11572 */
11573 static bool is_myrocks_index_empty(rocksdb::ColumnFamilyHandle *cfh,
11574                                    const bool is_reverse_cf,
11575                                    const rocksdb::ReadOptions &read_opts,
11576                                    const uint index_id) {
11577   bool index_removed = false;
11578   uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
11579   rdb_netbuf_store_uint32(key_buf, index_id);
11580   const rocksdb::Slice key =
11581       rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf));
11582   std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
11583   rocksdb_smart_seek(is_reverse_cf, it.get(), key);
11584   if (!it->Valid()) {
11585     index_removed = true;
11586   } else {
11587     if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE)) {
11588       // Key does not have same prefix
11589       index_removed = true;
11590     }
11591   }
11592   return index_removed;
11593 }
11594 
11595 /*
11596   Drop index thread's main logic
11597 */
11598 
/*
  Main loop of the background drop-index thread.

  Invariant: m_signal_mutex is held at the top of every loop iteration and
  is released only around the (potentially long) index-dropping work.
*/
void Rdb_drop_index_thread::run() {
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);

  for (;;) {
    // The stop flag might be set by shutdown command
    // after drop_index_thread releases signal_mutex
    // (i.e. while executing expensive Seek()). To prevent drop_index_thread
    // from entering long cond_timedwait, checking if stop flag
    // is true or not is needed, with drop_index_interrupt_mutex held.
    if (m_stop) {
      break;
    }

    /* Wake up much sooner when there are pending drops to filter out. */
    timespec ts;
    int sec= dict_manager.is_drop_index_empty()
                     ? 24 * 60 * 60  // no filtering
                     : 60;           // filtering
    set_timespec(ts,sec);

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    /* Drop the mutex while doing the expensive per-index work below. */
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    std::unordered_set<GL_INDEX_ID> indices;
    dict_manager.get_ongoing_drop_indexes(&indices);
    if (!indices.empty()) {
      std::unordered_set<GL_INDEX_ID> finished;
      rocksdb::ReadOptions read_opts;
      read_opts.total_order_seek = true;  // disable bloom filter

      for (const auto d : indices) {
        uint32 cf_flags = 0;
        if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "RocksDB: Failed to get column family flags "
              "from cf id %u. MyRocks data dictionary may "
              "get corrupted.",
              d.cf_id);
          if (rocksdb_ignore_datadic_errors)
          {
            sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
                            "trying to continue");
            continue;
          }
          abort();
        }
        rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id);
        DBUG_ASSERT(cfh);
        const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;

        /*
          First drop whole SST files that fall entirely inside the index's
          key range (cheap), then compact the range to remove the rest.
        */
        uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
        rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
                                         is_reverse_cf ? 0 : 1);
        rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
                                                    &range.start, &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start,
                                   &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        /* Only mark the index as finished once no key with its prefix
           remains. */
        if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) {
          finished.insert(d);
        }
      }

      if (!finished.empty()) {
        dict_manager.finish_drop_indexes(finished);
      }
    }
    /* Re-acquire before the next iteration's m_stop check / cond wait. */
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
}
11688 
11689 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
11690   std::string str;
11691   if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) {
11692     // We were not passed table name?
11693     DBUG_ASSERT(0);
11694     return nullptr;
11695   }
11696 
11697   return ddl_manager.find(str);
11698 }
11699 
11700 /*
11701   Overload func for delete table ---it deletes table meta data in data
11702   dictionary immediately and delete real data in background thread(async)
11703 
11704   @param tbl       IN      MyRocks table definition
11705 
11706   @return
11707     HA_EXIT_SUCCESS  OK
11708     other            HA_ERR error code (can be SE-specific)
11709 */
int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(tbl != nullptr);
  DBUG_ASSERT(m_tbl_def == nullptr || m_tbl_def == tbl);

  /* All dictionary changes below go into a single atomic write batch. */
  const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
  rocksdb::WriteBatch *const batch = wb.get();

  /* Mark all of the table's indexes as pending-drop; the row data itself is
     removed asynchronously by the drop-index background thread. */
  dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);

  /*
    Remove the table entry in data dictionary (this will also remove it from
    the persistent data dictionary).
  */
  ddl_manager.remove(tbl, batch, true);

  int err = dict_manager.commit(batch);
  if (err) {
    DBUG_RETURN(err);
  }

  /* Wake the background thread so it starts dropping the index data. */
  rdb_drop_idx_thread.signal();
  // avoid dangling pointer
  m_tbl_def = nullptr;
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11737 
11738 /*
11739   Note: the following function is called when the table is not open. That is,
11740   this->table==nullptr, pk_key_descr==nullptr, etc.
11741 
11742   tablename points to line in form "./dbname/tablename".
11743 
11744   @return
11745     HA_EXIT_SUCCESS  OK
11746     other            HA_ERR error code (can be SE-specific)
11747 */
11748 
11749 int ha_rocksdb::delete_table(const char *const tablename) {
11750   DBUG_ENTER_FUNC();
11751 
11752   DBUG_ASSERT(tablename != nullptr);
11753 
11754   /* Find the table in the hash */
11755   Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
11756   if (!tbl) {
11757     DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
11758   }
11759 
11760   DBUG_RETURN(delete_table(tbl));
11761 }
11762 
/*
  Delete every row of every index of the table, one key at a time
  (used e.g. by TRUNCATE-style operations; the table stays defined).

  @return HA_EXIT_SUCCESS or an HA_ERR code from a failed RocksDB write.
*/
int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
  const rocksdb::WriteOptions wo =
      rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  opts.total_order_seek = true;
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  ulonglong bytes_written = 0;

  /* Iterator bound buffers; must outlive the iterator that uses them. */
  uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  rocksdb::Slice lower_bound_slice;
  rocksdb::Slice upper_bound_slice;

  /*
    Remove all records in each index.
    (This is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on rocksdb side)
  */
  for (uint i = 0; i < tbl->m_key_count; i++) {
    const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
    /* Infimum key == the index-number prefix: the smallest key of the
       index. */
    kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
    rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
    const rocksdb::Slice table_key(key_buf, key_len);
    setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE,
                          lower_bound_buf, upper_bound_buf, &lower_bound_slice,
                          &upper_bound_slice);
    DBUG_ASSERT(key_len == Rdb_key_def::INDEX_NUMBER_SIZE);
    opts.iterate_lower_bound = &lower_bound_slice;
    opts.iterate_upper_bound = &upper_bound_slice;
    std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));

    /* Scan forward from the index start and delete each key until a key
       from a different index is reached. */
    it->Seek(table_key);
    while (it->Valid()) {
      const rocksdb::Slice key = it->key();
      if (!kd.covers_key(key)) {
        break;
      }

      rocksdb::Status s;
      /* SingleDelete is cheaper but only valid when a key is never
         overwritten; can_use_single_delete() decides per index. */
      if (can_use_single_delete(i)) {
        s = rdb->SingleDelete(wo, cf, key);
      } else {
        s = rdb->Delete(wo, cf, key);
      }

      if (!s.ok()) {
        return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                    m_table_handler);
      }
      bytes_written += key.size();
      it->Next();
    }
  }

  /* Account the deleted key bytes against the transaction's write stats. */
  tx->update_bytes_written(bytes_written);

  return HA_EXIT_SUCCESS;
}
11825 
11826 /**
11827   @return
11828     HA_EXIT_SUCCESS  OK
11829     other            HA_ERR error code (cannot be SE-specific)
11830 */
11831 int ha_rocksdb::rename_table(const char *const from, const char *const to) {
11832   DBUG_ENTER_FUNC();
11833 
11834   std::string from_str;
11835   std::string to_str;
11836   std::string from_db;
11837   std::string to_db;
11838 
11839   int rc = rdb_normalize_tablename(from, &from_str);
11840   if (rc != HA_EXIT_SUCCESS) {
11841     DBUG_RETURN(rc);
11842   }
11843 
11844   rc = rdb_split_normalized_tablename(from_str, &from_db);
11845   if (rc != HA_EXIT_SUCCESS) {
11846     DBUG_RETURN(rc);
11847   }
11848 
11849   rc = rdb_normalize_tablename(to, &to_str);
11850   if (rc != HA_EXIT_SUCCESS) {
11851     DBUG_RETURN(rc);
11852   }
11853 
11854   rc = rdb_split_normalized_tablename(to_str, &to_db);
11855   if (rc != HA_EXIT_SUCCESS) {
11856     DBUG_RETURN(rc);
11857   }
11858 
11859   // If the user changed the database part of the name then validate that the
11860   // 'to' database exists.
11861   if (from_db != to_db && !rdb_database_exists(to_db)) {
11862     // If we return a RocksDB specific error code here we get
11863     // "error: 206 - Unknown error 206".  InnoDB gets
11864     // "error -1 - Unknown error -1" so let's match them.
11865     DBUG_RETURN(-1);
11866   }
11867 
11868   DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";);
11869 
11870   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11871   rocksdb::WriteBatch *const batch = wb.get();
11872   dict_manager.lock();
11873 
11874   if (ddl_manager.rename(from_str, to_str, batch)) {
11875     rc = HA_ERR_NO_SUCH_TABLE;
11876   } else {
11877     rc = dict_manager.commit(batch);
11878   }
11879   dict_manager.unlock();
11880 
11881   DBUG_RETURN(rc);
11882 }
11883 
11884 /**
11885   check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
11886   if new and old definition are compatible
11887 
11888   @details If there are no other explicit signs like changed number of
11889   fields this function will be called by compare_tables()
11890   (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm
11891   file.
11892 
11893 */
11894 
bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
                                            uint table_changes) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(info != nullptr);

  // this function is needed only for online alter-table; always answering
  // "incompatible" forces the SQL layer to rebuild the table.
  DBUG_RETURN(COMPATIBLE_DATA_NO);
}
11904 
11905 /**
11906   @return
11907     HA_EXIT_SUCCESS  OK
11908 */
11909 int ha_rocksdb::extra(enum ha_extra_function operation) {
11910   DBUG_ENTER_FUNC();
11911 
11912   switch (operation) {
11913     case HA_EXTRA_KEYREAD:
11914       m_keyread_only = true;
11915       break;
11916     case HA_EXTRA_NO_KEYREAD:
11917       m_keyread_only = false;
11918       break;
11919     case HA_EXTRA_FLUSH:
11920       /*
11921         If the table has blobs, then they are part of m_retrieved_record.
11922         This call invalidates them.
11923       */
11924       m_retrieved_record.Reset();
11925       break;
11926     case HA_EXTRA_INSERT_WITH_UPDATE:
11927       // INSERT ON DUPLICATE KEY UPDATE
11928       if (rocksdb_enable_insert_with_update_caching) {
11929         m_insert_with_update = true;
11930       }
11931       break;
11932     case HA_EXTRA_NO_IGNORE_DUP_KEY:
11933       // PAIRED with HA_EXTRA_INSERT_WITH_UPDATE or HA_EXTRA_WRITE_CAN_REPLACE
11934       // that indicates the end of REPLACE / INSERT ON DUPLICATE KEY
11935       m_insert_with_update = false;
11936       break;
11937 
11938     default:
11939       break;
11940   }
11941 
11942   DBUG_RETURN(HA_EXIT_SUCCESS);
11943 }
11944 
11945 /*
11946   Given a starting key and an ending key, estimate the number of rows that
11947   will exist between the two keys.
11948 */
/*
  Given a starting key and an ending key, estimate the number of rows that
  will exist between the two keys.

  @param inx      index number
  @param min_key  lower bound, or nullptr for "from the start of the index"
  @param max_key  upper bound, or nullptr for "to the end of the index"
*/
ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
                                     key_range *const max_key) {
  DBUG_ENTER_FUNC();

  /* Debug overrides: fixed estimates from session variables. */
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  if (table->force_index) {
    const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  /* Pack the lower endpoint into m_sk_packed_tuple. */
  uint size1 = 0;
  if (min_key) {
    size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                min_key->key, min_key->keypart_map);
    /* For exclusive-style flags, advance to the next possible key. */
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd.get_infimum_key(m_sk_packed_tuple, &size1);
  }

  /* Pack the upper endpoint into m_sk_packed_tuple_old. */
  uint size2 = 0;
  if (max_key) {
    size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple_old, size2);
    }
    // pad the upper key with FFFFs to make sure it is more than the lower
    if (size1 > size2) {
      memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
      size2 = size1;
    }
  } else {
    kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
  const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);

  // slice1 >= slice2 means no row will match
  if (slice1.compare(slice2) >= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  /* Reverse CFs store keys in descending order: swap the endpoints. */
  rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
                   kd.m_is_reverse_cf ? slice1 : slice2);

  uint64_t sz = 0;
  /* Estimate rows as: total_rows * (range_size / total_size); fall back to
     a fixed per-row size when no stats have been collected yet. */
  auto disk_size = kd.m_stats.m_actual_disk_size;
  if (disk_size == 0) disk_size = kd.m_stats.m_data_size;
  auto rows = kd.m_stats.m_rows;
  if (rows == 0 || disk_size == 0) {
    rows = 1;
    disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }

  // Getting statistics, including from Memtables
  uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
  rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
  ret = rows * sz / disk_size;
  uint64_t memTableCount;
  rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
  ret += memTableCount;

  /*
    GetApproximateSizes() gives estimates so ret might exceed stats.records.
    MySQL then decides to use full index scan rather than range scan, which
    is not efficient for most cases.
    To prevent this, changing estimated records slightly smaller than
    stats.records.
  */
  if (ret >= stats.records) {
    ret = stats.records * 0.99;
  }

  if (rocksdb_debug_optimizer_n_rows > 0) {
    ret = rocksdb_debug_optimizer_n_rows;
  } else if (ret == 0) {
    /* Never report 0 for a non-empty range: the optimizer treats it as
       "no rows". */
    ret = 1;
  }

  DBUG_RETURN(ret);
}
12044 
12045 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
12046   DBUG_ENTER_FUNC();
12047 
12048   DBUG_ASSERT(create_info != nullptr);
12049 
12050   if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
12051     create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
12052   }
12053 
12054   DBUG_VOID_RETURN;
12055 }
12056 
12057 /**
12058   @brief
12059   Doing manual compaction on OPTIMIZE TABLE in RocksDB.
12060   Compaction itself is executed by background thread in RocksDB, but
12061   CompactRange() waits until compaction completes so this function
12062   may take a long time.
12063   Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE
12064   triggers manual compaction for all indexes of the table.
12065   @details
12066   Compaction range is from the beginning of the index id to
12067   the first row of the next index id. When using reverse order
12068   column family, the first row of the next index id should be
12069   the last row of the previous index id.
12070 
12071   @return
12072     HA_ADMIN_OK      OK
12073     other            HA_ADMIN error code
12074 */
12075 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
12076   DBUG_ENTER_FUNC();
12077 
12078   DBUG_ASSERT(thd != nullptr);
12079   DBUG_ASSERT(check_opt != nullptr);
12080 
12081   for (uint i = 0; i < table->s->keys; i++) {
12082     uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12083     auto range = get_range(i, buf);
12084     const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(),
12085                                                 m_key_descr_arr[i]->get_cf(),
12086                                                 &range.start, &range.limit);
12087     if (!s.ok()) {
12088       DBUG_RETURN(rdb_error_to_mysql(s));
12089     }
12090   }
12091 
12092   DBUG_RETURN(HA_EXIT_SUCCESS);
12093 }
12094 
/*
  Recompute index statistics for the given set of indexes.

  Statistics are gathered from the RocksDB table properties of the SST
  files that intersect each index's key range and, optionally, by sampling
  keys that are still only in the memtables.  The result is stored in
  ddl_manager and persisted.

  @param to_recalc          map of GL_INDEX_ID -> key definition for every
                            index whose stats should be recalculated
  @param include_memtables  when true, additionally scan memtable-only data
                            (read_tier = kMemtableTier) to refine the stats
  @return HA_EXIT_SUCCESS, or an error code if RocksDB properties could not
          be read
*/
static int calculate_stats(
    const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
        &to_recalc,
    bool include_memtables) {
  DBUG_ENTER_FUNC();

  // find per column family key ranges which need to be queried
  std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
      ranges;
  std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
  // One [start, end) index-number pair per index; get_range() writes the
  // range endpoints into this shared buffer.
  std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);

  uchar *bufp = buf.data();
  for (const auto &it : to_recalc) {
    const GL_INDEX_ID index_id = it.first;
    auto &kd = it.second;
    ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp));
    bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE;

    stats[index_id] = Rdb_index_stats(index_id);
    DBUG_ASSERT(kd->get_key_parts() > 0);
    stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts());
  }

  // get RocksDB table properties for these ranges
  rocksdb::TablePropertiesCollection props;
  for (const auto &it : ranges) {
    const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
    const auto status = rdb->GetPropertiesOfTablesInRange(
        it.first, &it.second[0], it.second.size(), &props);
    DBUG_ASSERT(props.size() >= old_size);
    if (!status.ok()) {
      DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql(
          status, "Could not access RocksDB properties"));
    }
  }

  int num_sst = 0;
  for (const auto &it : props) {
    std::vector<Rdb_index_stats> sst_stats;
    Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
    /*
      sst_stats is a list of index statistics for indexes that have entries
      in the current SST file.
    */
    for (const auto &it1 : sst_stats) {
      /*
        Only update statistics for indexes that belong to this SQL table.

        The reason is: We are walking through all SST files that have
        entries from this table (and so can compute good statistics). For
        other SQL tables, it can be that we're only seeing a small fraction
        of table's entries (and so we can't update statistics based on that).
      */
      if (stats.find(it1.m_gl_index_id) == stats.end()) {
        continue;
      }

      auto it_index = to_recalc.find(it1.m_gl_index_id);
      DBUG_ASSERT(it_index != to_recalc.end());
      if (it_index == to_recalc.end()) {
        continue;
      }
      stats[it1.m_gl_index_id].merge(
          it1, true, it_index->second->max_storage_fmt_length());
    }
    num_sst++;
  }

  if (include_memtables) {
    // calculate memtable cardinality
    Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
    auto read_opts = rocksdb::ReadOptions();
    read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
    for (const auto &it_kd : to_recalc) {
      const std::shared_ptr<const Rdb_key_def> &kd = it_kd.second;
      Rdb_index_stats &stat = stats[kd->get_gl_index_id()];

      uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = myrocks::get_range(*kd, r_buf);
      uint64_t memtableCount;
      uint64_t memtableSize;
      rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount,
                                       &memtableSize);
      if (memtableCount < (uint64_t)stat.m_rows / 10) {
        // skip tables that already have enough stats from SST files to reduce
        // overhead and avoid degradation of big tables stats by sampling from
        // relatively tiny (less than 10% of full data set) memtable dataset
        continue;
      }

      // Memtable-only iterator (read_tier set above); used to sample keys
      // that have not yet been flushed to SST files.
      std::unique_ptr<rocksdb::Iterator> it =
          std::unique_ptr<rocksdb::Iterator>(
              rdb->NewIterator(read_opts, kd->get_cf()));

      rocksdb::Slice first_index_key((const char *)r_buf,
                                     Rdb_key_def::INDEX_NUMBER_SIZE);

      cardinality_collector.Reset();
      // Walk the memtable keys of this index until a key outside the index
      // prefix is reached.
      for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
        const rocksdb::Slice key = it->key();
        if (!kd->covers_key(key)) {
          break;  // end of this index
        }
        stat.m_rows++;

        cardinality_collector.ProcessKey(key, kd.get(), &stat);
      }
      cardinality_collector.AdjustStats(&stat);
    }
  }

  // set and persist new stats
  ddl_manager.set_stats(stats);
  ddl_manager.persist_stats(true);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12213 
12214 int ha_rocksdb::calculate_stats_for_table() {
12215   DBUG_ENTER_FUNC();
12216 
12217   std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12218       ids_to_check;
12219   for (uint i = 0; i < table->s->keys; i++) {
12220     ids_to_check.insert(std::make_pair(m_key_descr_arr[i]->get_gl_index_id(),
12221                                        m_key_descr_arr[i]));
12222   }
12223 
12224   DBUG_RETURN(calculate_stats(ids_to_check, true));
12225 }
12226 
12227 /*
12228   @return
12229     HA_ADMIN_OK      OK
12230     other            HA_ADMIN error code
12231 */
12232 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
12233   DBUG_ENTER_FUNC();
12234 
12235   if (table) {
12236     if (calculate_stats_for_table() != HA_EXIT_SUCCESS) {
12237       DBUG_RETURN(HA_ADMIN_FAILED);
12238     }
12239   }
12240 
12241   // A call to ::info is needed to repopulate some SQL level structs. This is
12242   // necessary for online analyze because we cannot rely on another ::open
12243   // call to call info for us.
12244   if (info(HA_STATUS_CONST | HA_STATUS_VARIABLE) != HA_EXIT_SUCCESS) {
12245     DBUG_RETURN(HA_ADMIN_FAILED);
12246   }
12247 
12248   DBUG_RETURN(HA_ADMIN_OK);
12249 }
12250 
/**
  Reserve the next auto-increment value for an INSERT.

  @param       off                 offset of the auto-increment sequence
  @param       inc                 increment of the auto-increment sequence
  @param       nb_desired_values   estimate of how many values the statement
                                   needs (deliberately ignored, see below)
  @param[out]  first_value         the value reserved for the caller
  @param[out]  nb_reserved_values  always set to 1 (one value at a time)
*/
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed.  The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.
   *
    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement.  Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value.  Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
   */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars2");

  // An offset larger than the increment cannot produce a valid sequence;
  // fall back to offset 1.
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    DBUG_ASSERT(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use CAS operation in a loop to make sure automically get the next auto
    // increment value while ensuring that we don't wrap around to a negative
    // number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      // On CAS failure, new_val is refreshed with the current counter and
      // the loop retries.
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    if (last_val > max_val) {
      new_val = std::numeric_limits<ulonglong>::max();
    } else {
      // Loop until we can correctly update the atomic value
      do {
        DBUG_ASSERT(last_val > 0);
        // Calculate the next value in the auto increment series: offset
        // + N * increment where N is 0, 1, 2, ...
        //
        // For further information please visit:
        // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
        //
        // The following is confusing so here is an explanation:
        // To get the next number in the sequence above you subtract out the
        // offset, calculate the next sequence (N * increment) and then add the
        // offset back in.
        //
        // The additions are rearranged to avoid overflow.  The following is
        // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
        // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
        //
        // (a+b)/c
        // = (a - a%c + a%c + b - b%c + b%c) / c
        // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
        // = a/c + b/c + (a%c + b%c) / c
        //
        // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
        // following statement.
        ulonglong n =
            (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

        // Check if n * inc + off will overflow. This can only happen if we have
        // an UNSIGNED BIGINT field.
        if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
          DBUG_ASSERT(max_val == std::numeric_limits<ulonglong>::max());
          // The 'last_val' value is already equal to or larger than the largest
          // value in the sequence.  Continuing would wrap around (technically
          // the behavior would be undefined).  What should we do?
          // We could:
          //   1) set the new value to the last possible number in our sequence
          //      as described above.  The problem with this is that this
          //      number could be smaller than a value in an existing row.
          //   2) set the new value to the largest possible number.  This number
          //      may not be in our sequence, but it is guaranteed to be equal
          //      to or larger than any other value already inserted.
          //
          //  For now I'm going to take option 2.
          //
          //  Returning ULLONG_MAX from get_auto_increment will cause the SQL
          //  layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
          //  the SE API for get_auto_increment, inserts will fail with
          //  ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
          //  inserts will fail with ER_DUP_ENTRY for other types (or no failure
          //  if the column is in a non-unique SK).
          new_val = std::numeric_limits<ulonglong>::max();
          auto_incr = new_val;  // Store the largest value into auto_incr
          break;
        }

        new_val = n * inc + off;

        // Attempt to store the new value (plus 1 since m_auto_incr_val contains
        // the next available value) into the atomic value.  If the current
        // value no longer matches what we have in 'last_val' this will fail and
        // we will repeat the loop (`last_val` will automatically get updated
        // with the current value).
        //
        // See above explanation for inc == 1 for why we use std::min.
      } while (!auto_incr.compare_exchange_weak(
          last_val, std::min(new_val + 1, max_val)));
    }
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
12388 
12389 #ifndef DBUG_OFF
12390 
12391 /* Debugger help function */
12392 static char dbug_item_print_buf[512];
12393 
12394 const char *dbug_print_item(Item *const item) {
12395   char *const buf = dbug_item_print_buf;
12396   String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
12397   str.length(0);
12398   if (!item) return "(Item*)nullptr";
12399   item->print(&str, QT_ORDINARY);
12400   if (str.c_ptr() == buf) {
12401     return buf;
12402   } else {
12403     return "Couldn't fit into buffer";
12404   }
12405 }
12406 
12407 #endif /*DBUG_OFF*/
12408 
12409 /**
12410   SQL layer calls this function to push an index condition.
12411 
12412   @details
12413     The condition is for index keyno (only one condition can be pushed at a
12414     time).
12415     The caller guarantees that condition refers only to index fields; besides
12416     that, fields must have
12417 
12418       $field->part_of_key.set_bit(keyno)
12419 
12420     which means that
12421 
12422        (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
12423 
12424     which means that field value can be restored from the index tuple.
12425 
12426   @return
12427     Part of condition we couldn't check (always nullptr).
12428 */
12429 
12430 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
12431   DBUG_ENTER_FUNC();
12432 
12433   DBUG_ASSERT(keyno != MAX_KEY);
12434   DBUG_ASSERT(idx_cond != nullptr);
12435 
12436   pushed_idx_cond = idx_cond;
12437   pushed_idx_cond_keyno = keyno;
12438   in_range_check_pushed_down = TRUE;
12439 
12440   /* We will check the whole condition */
12441   DBUG_RETURN(nullptr);
12442 }
12443 
12444 /*
12445   Checks if inplace alter is supported for a given operation.
12446 */
12447 
12448 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
12449     TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
12450   DBUG_ENTER_FUNC();
12451 
12452   DBUG_ASSERT(ha_alter_info != nullptr);
12453 
12454   if (ha_alter_info->handler_flags &
12455       ~(ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
12456         ALTER_DROP_UNIQUE_INDEX |
12457         ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
12458         ALTER_PARTITIONED |
12459         ALTER_ADD_UNIQUE_INDEX |
12460         ALTER_CHANGE_CREATE_OPTION)) {
12461     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12462   }
12463 
12464   /* We don't support unique keys on table w/ no primary keys */
12465   if ((ha_alter_info->handler_flags &
12466        ALTER_ADD_UNIQUE_INDEX) &&
12467       has_hidden_pk(altered_table)) {
12468     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12469   }
12470 
12471   /* We only support changing auto_increment for table options. */
12472   if ((ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) &&
12473       !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) {
12474     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12475   }
12476 
12477   /* FIXME: MDEV-16099 Use alter algorithm=nocopy or algorithm=instant
12478 		for non-InnoDB engine */
12479   DBUG_RETURN(my_core::HA_ALTER_INPLACE_COPY_LOCK);
12480 }
12481 
12482 /**
12483   Allows the storage engine to update internal structures with concurrent
12484   writes blocked. If check_if_supported_inplace_alter() returns
12485   HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_COPY_LOCK.
12486   this function is called with exclusive lock otherwise the same level of
12487   locking as for inplace_alter_table() will be used.
12488 
12489   @note Storage engines are responsible for reporting any errors by
12490   calling my_error()/print_error()
12491 
12492   @note If this function reports error, commit_inplace_alter_table()
12493   will be called with commit= false.
12494 
12495   @note For partitioning, failing to prepare one partition, means that
12496   commit_inplace_alter_table() will be called to roll back changes for
12497   all partitions. This means that commit_inplace_alter_table() might be
12498   called without prepare_inplace_alter_table() having been called first
12499   for a given partition.
12500 
12501   @param    altered_table     TABLE object for new version of table.
12502   @param    ha_alter_info     Structure describing changes to be done
12503                               by ALTER TABLE and holding data used
12504                               during in-place alter.
12505 
12506   @retval   true              Error
12507   @retval   false             Success
12508 */
bool ha_rocksdb::prepare_inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  Rdb_tbl_def *new_tdef = nullptr;
  std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr;
  std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
  uint old_n_keys = m_tbl_def->m_key_count;
  uint new_n_keys = altered_table->s->keys;
  std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
  std::unordered_set<GL_INDEX_ID> dropped_index_ids;
  uint n_dropped_keys = 0;
  uint n_added_keys = 0;
  ulonglong max_auto_incr = 0;

  // Index add/drop: build a fresh Rdb_tbl_def with a new key-descriptor
  // array and figure out exactly which key definitions are being added or
  // removed.
  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    // The SQL layer does not count the hidden PK, but the engine stores a
    // key definition for it.
    if (has_hidden_pk(altered_table)) {
      new_n_keys += 1;
    }

    const TABLE *const old_table = table;
    old_key_descr = m_tbl_def->m_key_descr_arr;
    new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];

    // Carry the current auto-increment and hidden-pk counters over to the
    // new table definition.
    new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
    new_tdef->m_key_descr_arr = new_key_descr;
    new_tdef->m_key_count = new_n_keys;
    new_tdef->m_auto_incr_val =
        m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
    new_tdef->m_hidden_pk_val =
        m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);

    if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
      /* Delete the new key descriptors */
      delete[] new_key_descr;

      /*
        Explicitly mark as nullptr so we don't accidentally remove entries
        from data dictionary on cleanup (or cause double delete[]).
        */
      new_tdef->m_key_descr_arr = nullptr;
      delete new_tdef;

      my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    uint i;
    uint j;

    /* Determine which(if any) key definition(s) need to be dropped */
    for (i = 0; i < ha_alter_info->index_drop_count; i++) {
      const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i];
      for (j = 0; j < old_n_keys; j++) {
        const KEY *const old_key =
            &old_table->key_info[old_key_descr[j]->get_keyno()];

        // compare_keys() returns false when the two KEYs match.
        if (!compare_keys(old_key, dropped_key)) {
          dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
          break;
        }
      }
    }

    /* Determine which(if any) key definitions(s) need to be added */
    int identical_indexes_found = 0;
    for (i = 0; i < ha_alter_info->index_add_count; i++) {
      const KEY *const added_key =
          &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
      for (j = 0; j < new_n_keys; j++) {
        const KEY *const new_key =
            &altered_table->key_info[new_key_descr[j]->get_keyno()];
        if (!compare_keys(new_key, added_key)) {
          /*
            Check for cases where an 'identical' index is being dropped and
            re-added in a single ALTER statement.  Turn this into a no-op as the
            index has not changed.

            E.G. Unique index -> non-unique index requires no change

            Note that cases where the index name remains the same but the
            key-parts are changed is already handled in create_inplace_key_defs.
            In these cases the index needs to be rebuilt.
            */
          if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) {
            dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id());
            identical_indexes_found++;
          } else {
            added_indexes.insert(new_key_descr[j]);
          }

          break;
        }
      }
    }

    n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found;
    n_added_keys = ha_alter_info->index_add_count - identical_indexes_found;
    DBUG_ASSERT(dropped_index_ids.size() == n_dropped_keys);
    DBUG_ASSERT(added_indexes.size() == n_added_keys);
    DBUG_ASSERT(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys));
  }
  // AUTO_INCREMENT change: when no index work created a new table def,
  // reuse the current one (new_tdef aliases m_tbl_def in that case).
  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    if (!new_tdef) {
      new_tdef = m_tbl_def;
    }
    if (table->found_next_number_field) {
      max_auto_incr = load_auto_incr_value_from_index();
    }
  }

  // Hand everything computed above to the commit/rollback phases via the
  // alter context; ownership of new_tdef/new_key_descr moves to the ctx.
  ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
      new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
      added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys,
      max_auto_incr);
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12634 
12635 /**
12636   Alter the table structure in-place with operations specified using
12637   HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
12638   during this operation depends on the return value from
12639   check_if_supported_inplace_alter().
12640 
12641   @note Storage engines are responsible for reporting any errors by
12642   calling my_error()/print_error()
12643 
12644   @note If this function reports error, commit_inplace_alter_table()
12645   will be called with commit= false.
12646 
12647   @param    altered_table     TABLE object for new version of table.
12648   @param    ha_alter_info     Structure describing changes to be done
12649                               by ALTER TABLE and holding data used
12650                               during in-place alter.
12651 
12652   @retval   true              Error
12653   @retval   false             Success
12654 */
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);
  DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);

  // Context built by prepare_inplace_alter_table().
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  if (ha_alter_info->handler_flags &
      (ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    DBUG_ASSERT(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 ALTER_ADD_UNIQUE_INDEX))) {
      my_error(ER_OUT_OF_RESOURCES, MYF(0));
      // NOTE(review): 'err' (an int) is returned from a bool function; any
      // non-zero value collapses to true (= error), but HA_EXIT_FAILURE
      // would be more consistent with the branch below.
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  // Test hook: simulate a failure after SK population so that the rollback
  // path in commit_inplace_alter_table() gets exercised.
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12704 
12705 /**
12706  Scan the Primary Key index entries and populate the new secondary keys.
12707 */
12708 int ha_rocksdb::inplace_populate_sk(
12709     TABLE *const new_table_arg,
12710     const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
12711   DBUG_ENTER_FUNC();
12712   int res = HA_EXIT_SUCCESS;
12713   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
12714   rocksdb::WriteBatch *const batch = wb.get();
12715 
12716   /* Update the data dictionary */
12717   std::unordered_set<GL_INDEX_ID> create_index_ids;
12718   for (const auto &index : indexes) {
12719     create_index_ids.insert(index->get_gl_index_id());
12720   }
12721   dict_manager.add_create_index(create_index_ids, batch);
12722   res = dict_manager.commit(batch);
12723   if (res != HA_EXIT_SUCCESS) {
12724     return res;
12725   }
12726 
12727   /*
12728     Add uncommitted key definitons to ddl_manager.  We need to do this
12729     so that the property collector can find this keydef when it needs to
12730     update stats.  The property collector looks for the keydef in the
12731     data dictionary, but it won't be there yet since this key definition
12732     is still in the creation process.
12733   */
12734   ddl_manager.add_uncommitted_keydefs(indexes);
12735 
12736   const bool hidden_pk_exists = has_hidden_pk(table);
12737 
12738   Rdb_transaction *tx = get_or_create_tx(table->in_use);
12739 
12740   /*
12741     There is one specific scenario where m_sst_info may not be nullptr. This
12742     happens if the handler we're using happens to be the handler where the PK
12743     bulk load was done on. The sequence of events that lead to this is as
12744     follows (T1 is PK bulk load, T2 is SK alter table):
12745 
12746     T1: Execute last INSERT statement
12747     T1: Return TABLE and handler object back to Table_cache_manager
12748     T1: Close connection
12749     T2: Execute ALTER statement
12750     T2: Take same TABLE/handler from Table_cache_manager
12751     T2: Call closefrm which will call finalize_bulk_load on every other open
12752         table/handler *except* the one it's on.
12753     T2: Acquire stale snapshot of PK
12754     T1: Call finalize_bulk_load
12755 
12756     This is rare because usually, closefrm will call the destructor (and thus
12757     finalize_bulk_load) on the handler where PK bulk load is done. However, if
12758     the thread ids of the bulk load thread and the alter thread differ by a
12759     multiple of table_cache_instances (8 by default), then they hash to the
12760     same bucket in Table_cache_manager and the alter thread will not not call
12761     the destructor on the handler it is holding. Thus, its m_sst_info will not
12762     be nullptr.
12763 
12764     At this point, it is safe to refresh the snapshot because we know all other
12765     open handlers have been closed at this point, and the one we're on is the
12766     only one left.
12767   */
12768   if (m_sst_info) {
12769     if ((res = finalize_bulk_load())) {
12770       DBUG_RETURN(res);
12771     }
12772     tx->commit();
12773   }
12774 
12775   const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
12776   const ulonglong rdb_merge_combine_read_size =
12777       THDVAR(ha_thd(), merge_combine_read_size);
12778   const ulonglong rdb_merge_tmp_file_removal_delay =
12779       THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
12780 
12781   for (const auto &index : indexes) {
12782     bool is_unique_index =
12783         new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
12784 
12785     Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size,
12786                               rdb_merge_combine_read_size,
12787                               rdb_merge_tmp_file_removal_delay,
12788                               index->get_cf());
12789 
12790     if ((res = rdb_merge.init())) {
12791       DBUG_RETURN(res);
12792     }
12793 
12794     /*
12795       Note: We pass in the currently existing table + tbl_def object here,
12796       as the pk index position may have changed in the case of hidden primary
12797       keys.
12798     */
12799     const uint pk = pk_index(table, m_tbl_def);
12800     ha_index_init(pk, true);
12801 
12802     /* Scan each record in the primary key in order */
12803     for (res = index_first(table->record[0]); res == 0;
12804          res = index_next(table->record[0])) {
12805       longlong hidden_pk_id = 0;
12806       if (hidden_pk_exists &&
12807           (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) {
12808         // NO_LINT_DEBUG
12809         sql_print_error("Error retrieving hidden pk id.");
12810         ha_index_end();
12811         DBUG_RETURN(res);
12812       }
12813 
12814       /* Create new secondary index entry */
12815       const int new_packed_size = index->pack_record(
12816           new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
12817           &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
12818           nullptr, m_ttl_bytes);
12819 
12820       const rocksdb::Slice key = rocksdb::Slice(
12821           reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
12822       const rocksdb::Slice val =
12823           rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
12824                          m_sk_tails.get_current_pos());
12825 
12826       /*
12827         Add record to offset tree in preparation for writing out to
12828         disk in sorted chunks.
12829       */
12830       if ((res = rdb_merge.add(key, val))) {
12831         ha_index_end();
12832         DBUG_RETURN(res);
12833       }
12834     }
12835 
12836     if (res != HA_ERR_END_OF_FILE) {
12837       // NO_LINT_DEBUG
12838       sql_print_error("Error retrieving index entry from primary key.");
12839       ha_index_end();
12840       DBUG_RETURN(res);
12841     }
12842 
12843     ha_index_end();
12844 
12845     /*
12846       Perform an n-way merge of n sorted buffers on disk, then writes all
12847       results to RocksDB via SSTFileWriter API.
12848     */
12849     rocksdb::Slice merge_key;
12850     rocksdb::Slice merge_val;
12851 
12852     struct unique_sk_buf_info sk_info;
12853     sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
12854     sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
12855 
12856     while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
12857       /* Perform uniqueness check if needed */
12858       if (is_unique_index) {
12859         if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
12860           /*
12861             Duplicate entry found when trying to create unique secondary key.
12862             We need to unpack the record into new_table_arg->record[0] as it
12863             is used inside print_keydup_error so that the error message shows
12864             the duplicate record.
12865           */
12866           if (index->unpack_record(
12867                   new_table_arg, new_table_arg->record[0], &merge_key,
12868                   &merge_val, m_converter->get_verify_row_debug_checksums())) {
12869             /* Should never reach here */
12870             DBUG_ASSERT(0);
12871           }
12872 
12873           print_keydup_error(new_table_arg,
12874                              &new_table_arg->key_info[index->get_keyno()],
12875                              MYF(0));
12876           DBUG_RETURN(ER_DUP_ENTRY);
12877         }
12878       }
12879 
12880       /*
12881         Insert key and slice to SST via SSTFileWriter API.
12882       */
12883       if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) {
12884         break;
12885       }
12886     }
12887 
12888     /*
12889       Here, res == -1 means that we are finished, while > 0 means an error
12890       occurred.
12891     */
12892     if (res > 0) {
12893       // NO_LINT_DEBUG
12894       sql_print_error("Error while bulk loading keys in external merge sort.");
12895       DBUG_RETURN(res);
12896     }
12897 
12898     bool is_critical_error;
12899     res = tx->finish_bulk_load(&is_critical_error);
12900     if (res && is_critical_error) {
12901       // NO_LINT_DEBUG
12902       sql_print_error("Error finishing bulk load.");
12903       DBUG_RETURN(res);
12904     }
12905   }
12906 
12907   /*
12908     Explicitly tell jemalloc to clean up any unused dirty pages at this point.
12909     See https://reviews.facebook.net/D63723 for more details.
12910   */
12911   purge_all_jemalloc_arenas();
12912 
12913   DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
12914   DBUG_RETURN(res);
12915 }
12916 
12917 /**
12918   Commit or rollback the changes made during prepare_inplace_alter_table()
12919   and inplace_alter_table() inside the storage engine.
12920   Note that in case of rollback the allowed level of concurrency during
12921   this operation will be the same as for inplace_alter_table() and thus
12922   might be higher than during prepare_inplace_alter_table(). (For example,
12923   concurrent writes were blocked during prepare, but might not be during
12924   rollback).
12925 
12926   @note Storage engines are responsible for reporting any errors by
12927   calling my_error()/print_error()
12928 
12929   @note If this function with commit= true reports error, it will be called
12930   again with commit= false.
12931 
12932   @note In case of partitioning, this function might be called for rollback
12933   without prepare_inplace_alter_table() having been called first.
12934   Also partitioned tables sets ha_alter_info->group_commit_ctx to a NULL
12935   terminated array of the partitions handlers and if all of them are
12936   committed as one, then group_commit_ctx should be set to NULL to indicate
12937   to the partitioning handler that all partitions handlers are committed.
12938   @see prepare_inplace_alter_table().
12939 
12940   @param    altered_table     TABLE object for new version of table.
12941   @param    ha_alter_info     Structure describing changes to be done
12942                               by ALTER TABLE and holding data used
12943                               during in-place alter.
12944   @param    commit            True => Commit, False => Rollback.
12945 
12946   @retval   true              Error
12947   @retval   false             Success
12948 */
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  /* Context created during prepare/inplace phases; may be null on rollback. */
  Rdb_inplace_alter_ctx *const ctx0 =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table");

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition.  A successful commit call only executes once
    for all partitions.
  */
  if (!commit) {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0) {
      DBUG_RETURN(HA_EXIT_SUCCESS);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr) {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
        ctx0->m_new_key_descr[i] = nullptr;
      }

      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    /* Remove uncommitted key definitions from ddl_manager */
    ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);

    /* Rollback any partially created indexes */
    dict_manager.rollback_ongoing_index_creation();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx **ctx_array;
  inplace_alter_handler_ctx *ctx_single[2];

  if (ha_alter_info->group_commit_ctx) {
    /* Partitioned case: commit every partition's context in one pass. */
    DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
    ctx_array = ha_alter_info->group_commit_ctx;
  } else {
    /* Non-partitioned case: build a single-element, null-terminated array. */
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  /* Signal to the partitioning handler that all partitions are committed. */
  ha_alter_info->group_commit_ctx = nullptr;

  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    /* Index add/drop path: persist new table definitions atomically. */
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    /* Switch this handler over to the new table definition. */
    m_tbl_def = ctx0->m_new_tdef;
    m_key_descr_arr = m_tbl_def->m_key_descr_arr;
    m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    dict_manager.lock();
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (const auto &index : ctx->m_added_indexes) {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }

      /*
        Remove uncommitted key definitions from ddl_manager, as they are now
        committed into the data dictionary.
      */
      ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
    }

    if (dict_manager.commit(batch)) {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(
        create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

    /* Wake the background thread that physically drops the old indexes. */
    rdb_drop_idx_thread.signal();
  }

  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    /* Persist the (possibly raised) auto_increment value for each context. */
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;

    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);
      /* Never lower the counter below the maximum seen during the alter. */
      auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
      dict_manager.put_auto_incr_val(
          batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
          true /* overwrite */);
      ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
    }

    if (dict_manager.commit(batch)) {
      DBUG_ASSERT(0);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
13106 
/* Name of the generated SHOW function for a given ticker/counter. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW function that copies the current value of the RocksDB
  ticker rocksdb::key into the matching rocksdb_status_counters field
  and publishes it as a SHOW_LONGLONG status variable.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = reinterpret_cast<char *>(&rocksdb_status_counters.name);  \
    return HA_EXIT_SUCCESS;                                                \
  }

/* Status variable entry backed by a DEF_SHOW_FUNC-generated function. */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* Status variable entry backed directly by a pointer to a value. */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* Status variable entry with an explicit (non-prefixed) name. */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
13126 
/*
  Local snapshot of RocksDB ticker statistics backing the rocksdb_*
  SHOW status variables.  Each field is refreshed on demand by the
  corresponding DEF_SHOW_FUNC-generated function; field names mirror
  the variable names exposed to the server (with a "rocksdb_" prefix).
*/
struct rocksdb_status_counters_t {
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  uint64_t bloom_filter_useful;
  uint64_t bloom_filter_full_positive;
  uint64_t bloom_filter_full_true_positive;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};
13201 
/* Storage behind all DEF_SHOW_FUNC-generated status variables. */
static rocksdb_status_counters_t rocksdb_status_counters;

/*
  Instantiate one SHOW function per RocksDB ticker.  The first macro
  argument is the struct field / status-variable name, the second is the
  rocksdb::Tickers enum value it is read from.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE)
DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
13276 
13277 static void myrocks_update_status() {
13278   export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
13279   export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
13280   export_stats.rows_read = global_stats.rows[ROWS_READ];
13281   export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
13282   export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
13283   export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
13284   export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];
13285 
13286   export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
13287   export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
13288   export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
13289   export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
13290 
13291   export_stats.queries_point = global_stats.queries[QUERIES_POINT];
13292   export_stats.queries_range = global_stats.queries[QUERIES_RANGE];
13293 
13294   export_stats.covered_secondary_key_lookups =
13295       global_stats.covered_secondary_key_lookups;
13296 }
13297 
13298 static void myrocks_update_memory_status() {
13299   std::vector<rocksdb::DB *> dbs;
13300   std::unordered_set<const rocksdb::Cache *> cache_set;
13301   dbs.push_back(rdb);
13302   std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
13303   rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
13304                                                        &temp_usage_by_type);
13305   memory_stats.memtable_total =
13306       temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal];
13307   memory_stats.memtable_unflushed =
13308       temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed];
13309 }
13310 
/*
  MyRocks row/query/memory status variables; values are pointers into
  export_stats and memory_stats, refreshed by show_myrocks_vars() right
  before this array is handed to the server.
*/
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind", &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired", &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered", &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted",
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted",
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated",
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total", &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed", &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point", &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups",
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    /* end of the array marker */
    {NullS, NullS, SHOW_LONG}};
13346 
13347 static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
13348   myrocks_update_status();
13349   myrocks_update_memory_status();
13350   var->type = SHOW_ARRAY;
13351   var->value = reinterpret_cast<char *>(&myrocks_status_variables);
13352 }
13353 
13354 static ulonglong io_stall_prop_value(
13355     const std::map<std::string, std::string> &props, const std::string &key) {
13356   std::map<std::string, std::string>::const_iterator iter =
13357       props.find("io_stalls." + key);
13358   if (iter != props.end()) {
13359     return std::stoull(iter->second);
13360   } else {
13361     DBUG_PRINT("warning",
13362                ("RocksDB GetMapPropery hasn't returned key=%s", key.c_str()));
13363     DBUG_ASSERT(0);
13364     return 0;
13365   }
13366 }
13367 
13368 static void update_rocksdb_stall_status() {
13369   st_io_stall_stats local_io_stall_stats;
13370   for (const auto &cf_name : cf_manager.get_cf_names()) {
13371     rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
13372     if (cfh == nullptr) {
13373       continue;
13374     }
13375 
13376     std::map<std::string, std::string> props;
13377     if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats", &props)) {
13378       continue;
13379     }
13380 
13381     local_io_stall_stats.level0_slowdown +=
13382         io_stall_prop_value(props, "level0_slowdown");
13383     local_io_stall_stats.level0_slowdown_with_compaction +=
13384         io_stall_prop_value(props, "level0_slowdown_with_compaction");
13385     local_io_stall_stats.level0_numfiles +=
13386         io_stall_prop_value(props, "level0_numfiles");
13387     local_io_stall_stats.level0_numfiles_with_compaction +=
13388         io_stall_prop_value(props, "level0_numfiles_with_compaction");
13389     local_io_stall_stats.stop_for_pending_compaction_bytes +=
13390         io_stall_prop_value(props, "stop_for_pending_compaction_bytes");
13391     local_io_stall_stats.slowdown_for_pending_compaction_bytes +=
13392         io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes");
13393     local_io_stall_stats.memtable_compaction +=
13394         io_stall_prop_value(props, "memtable_compaction");
13395     local_io_stall_stats.memtable_slowdown +=
13396         io_stall_prop_value(props, "memtable_slowdown");
13397     local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop");
13398     local_io_stall_stats.total_slowdown +=
13399         io_stall_prop_value(props, "total_slowdown");
13400   }
13401   io_stall_stats = local_io_stall_stats;
13402 }
13403 
/*
  I/O stall status variables; values are pointers into the global
  io_stall_stats snapshot, refreshed by show_rocksdb_stall_vars() right
  before this array is handed to the server.
*/
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops",
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns",
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops",
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns",
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
13431 
13432 static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) {
13433   update_rocksdb_stall_status();
13434   var->type = SHOW_ARRAY;
13435   var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables);
13436 }
13437 
// Status variables exported via SHOW [GLOBAL] STATUS (prefixed "rocksdb_").
// DEF_STATUS_VAR(name) entries surface the matching RocksDB ticker
// statistic; DEF_STATUS_VAR_PTR entries surface MyRocks-maintained global
// counters; the SHOW_FUNC entries at the end expand into nested arrays.
static SHOW_VAR rocksdb_status_vars[] = {
    // Block cache activity.
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_add_failures),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_index_add),
    DEF_STATUS_VAR(block_cache_index_bytes_insert),
    DEF_STATUS_VAR(block_cache_index_bytes_evict),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_filter_add),
    DEF_STATUS_VAR(block_cache_filter_bytes_insert),
    DEF_STATUS_VAR(block_cache_filter_bytes_evict),
    DEF_STATUS_VAR(block_cache_bytes_read),
    DEF_STATUS_VAR(block_cache_bytes_write),
    DEF_STATUS_VAR(block_cache_data_bytes_insert),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(block_cache_data_add),
    // Bloom filter effectiveness.
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(bloom_filter_full_positive),
    DEF_STATUS_VAR(bloom_filter_full_true_positive),
    // Where point lookups were satisfied (memtable vs. LSM levels).
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(get_hit_l0),
    DEF_STATUS_VAR(get_hit_l1),
    DEF_STATUS_VAR(get_hit_l2_and_up),
    // Keys dropped during compaction.
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    // Key/byte read-write volume.
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    // Iterator activity.
    DEF_STATUS_VAR(number_db_seek),
    DEF_STATUS_VAR(number_db_seek_found),
    DEF_STATUS_VAR(number_db_next),
    DEF_STATUS_VAR(number_db_next_found),
    DEF_STATUS_VAR(number_db_prev),
    DEF_STATUS_VAR(number_db_prev_found),
    DEF_STATUS_VAR(iter_bytes_read),
    // File handling and stalls.
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(stall_micros),
    DEF_STATUS_VAR(num_iterators),
    // MultiGet and miscellaneous read-path counters.
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(getupdatessince_calls),
    DEF_STATUS_VAR(block_cachecompressed_miss),
    DEF_STATUS_VAR(block_cachecompressed_hit),
    // WAL, write path, flush and compaction volume.
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    // Counters maintained by MyRocks itself (not RocksDB tickers).
    DEF_STATUS_VAR_PTR("row_lock_deadlocks", &rocksdb_row_lock_deadlocks,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("row_lock_wait_timeouts",
                       &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_processed",
                       &rocksdb_manual_compactions_processed, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_running",
                       &rocksdb_manual_compactions_running, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
#ifndef DBUG_OFF
    // Debug-build-only counter.
    DEF_STATUS_VAR_PTR("num_get_for_update_calls",
                       &rocksdb_num_get_for_update_calls, SHOW_LONGLONG),
#endif
    // the variables generated by SHOW_FUNC are sorted only by prefix (first
    // arg in the tuple below), so make sure it is unique to make sorting
    // deterministic as quick sort is not stable
    {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
    {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
     SHOW_FUNC},
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
13544 
13545 /*
13546   Background thread's main logic
13547 */
13548 
void Rdb_background_thread::run() {
  // How many seconds to wait till flushing the WAL next time.
  const int WAKE_UP_INTERVAL = 1;

  timespec ts_next_sync;
  set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

  for (;;) {
    // Wait until the next timeout or until we receive a signal to stop the
    // thread. Request to stop the thread should only be triggered when the
    // storage engine is being unloaded.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);

    // Check that we receive only the expected error codes.
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Snapshot the request flags while still holding the mutex, then clear
    // them (reset()) so the next iteration starts from a clean slate.
    const bool local_stop = m_stop;
    const bool local_save_stats = m_save_stats;
    reset();
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    if (local_stop) {
      // If we're here then that's because condition variable was signaled by
      // another thread and we're shutting down. Break out the loop to make
      // sure that shutdown thread can proceed.
      break;
    }

    // This path should be taken only when the timer expired.
    // NOTE(review): a wakeup signaled solely to request saving stats (without
    // m_stop set) would reach this point with ret == 0 and trip this assert
    // in debug builds -- confirm such signals cannot occur here.
    DBUG_ASSERT(ret == ETIMEDOUT);

    if (local_save_stats) {
      // Persist index statistics collected since the last save request.
      ddl_manager.persist_stats();
    }

    // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
    // pthread_cond_timedwait()) to wait on.
    set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

    // Flush the WAL. Sync it for both background and never modes to copy
    // InnoDB's behavior. For mode never, the wal file isn't even written,
    // whereas background writes to the wal file, but issues the syncs in a
    // background thread.
    if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
        !rocksdb_db_options->allow_mmap_writes) {
      const rocksdb::Status s = rdb->FlushWAL(true);
      if (!s.ok()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
    }
    // Recalculate statistics for indexes.
    if (rocksdb_stats_recalc_rate) {
      std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
          to_recalc;

      // When the queue of pending index ids has drained, refill it with the
      // ids of every index of every table known to the DDL manager.
      if (rdb_indexes_to_recalc.empty()) {
        struct Rdb_index_collector : public Rdb_tables_scanner {
          int add_table(Rdb_tbl_def *tdef) override {
            for (uint i = 0; i < tdef->m_key_count; i++) {
              rdb_indexes_to_recalc.push_back(
                  tdef->m_key_descr_arr[i]->get_gl_index_id());
            }
            return HA_EXIT_SUCCESS;
          }
        } collector;
        ddl_manager.scan_for_tables(&collector);
      }

      // Take up to rocksdb_stats_recalc_rate indexes from the back of the
      // queue; ids for which no key definition can be found anymore are
      // silently skipped.
      while (to_recalc.size() < rocksdb_stats_recalc_rate &&
             !rdb_indexes_to_recalc.empty()) {
        const auto index_id = rdb_indexes_to_recalc.back();
        rdb_indexes_to_recalc.pop_back();

        std::shared_ptr<const Rdb_key_def> keydef =
            ddl_manager.safe_find(index_id);

        if (keydef) {
          to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef));
        }
      }

      if (!to_recalc.empty()) {
        calculate_stats(to_recalc, false);
      }
    }

  }

  // save remaining stats which might've left unsaved
  ddl_manager.persist_stats();
}
13641 
13642 /*
13643   A background thread to handle manual compactions,
13644   except for dropping indexes/tables. Every second, it checks
13645   pending manual compactions, and it calls CompactRange if there is.
13646 */
void Rdb_manual_compaction_thread::run() {
  mysql_mutex_init(0, &m_mc_mutex, MY_MUTEX_INIT_FAST);
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  for (;;) {
    if (m_stop) {
      break;
    }
    // Wake up once a second (or earlier when signaled) to poll the queue.
    timespec ts;
    set_timespec(ts, 1);

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
    // Grab the first item and proceed, if not empty.
    if (m_requests.empty()) {
      RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
      RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
      continue;
    }
    // NOTE: mcr remains a reference into m_requests after m_mc_mutex is
    // released below. This relies on RUNNING entries being erased only by
    // this thread (see clear_manual_compaction_request).
    Manual_compaction_request &mcr = m_requests.begin()->second;
    DBUG_ASSERT(mcr.cf != nullptr);
    DBUG_ASSERT(mcr.state == Manual_compaction_request::INITED);
    mcr.state = Manual_compaction_request::RUNNING;
    RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);

    DBUG_ASSERT(mcr.state == Manual_compaction_request::RUNNING);
    // NO_LINT_DEBUG
    sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id,
                          mcr.cf->GetName().c_str());
    rocksdb_manual_compactions_running++;
    if (rocksdb_debug_manual_compaction_delay > 0) {
      // Debug hook: artificially delay the start of the compaction.
      my_sleep(rocksdb_debug_manual_compaction_delay * 1000000);
    }
    // CompactRange may take a very long time. On clean shutdown,
    // it is cancelled by CancelAllBackgroundWork, then status is
    // set to shutdownInProgress.
    const rocksdb::Status s = rdb->CompactRange(
        getCompactRangeOptions(mcr.concurrency), mcr.cf, mcr.start, mcr.limit);
    rocksdb_manual_compactions_running--;
    if (s.ok()) {
      // NO_LINT_DEBUG
      sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id,
                            mcr.cf->GetName().c_str());
    } else {
      // NO_LINT_DEBUG
      sql_print_information("Manual Compaction id %d cf %s aborted. %s",
                            mcr.mc_id, mcr.cf->GetName().c_str(), s.getState());
      if (!s.IsShutdownInProgress()) {
        // Any failure other than a shutdown-triggered cancellation is a
        // genuine background I/O error.
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      } else {
        DBUG_ASSERT(m_requests.size() == 1);
      }
    }
    rocksdb_manual_compactions_processed++;
    clear_manual_compaction_request(mcr.mc_id, false);
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }
  clear_all_manual_compaction_requests();
  DBUG_ASSERT(m_requests.empty());
  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
  mysql_mutex_destroy(&m_mc_mutex);
}
13716 
// Drop every queued manual compaction request. Called by the manual
// compaction thread after its main loop exits.
void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() {
  RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
  m_requests.clear();
  RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
}
13722 
13723 void Rdb_manual_compaction_thread::clear_manual_compaction_request(
13724     int mc_id, bool init_only) {
13725   bool erase = true;
13726   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13727   auto it = m_requests.find(mc_id);
13728   if (it != m_requests.end()) {
13729     if (init_only) {
13730       Manual_compaction_request mcr = it->second;
13731       if (mcr.state != Manual_compaction_request::INITED) {
13732         erase = false;
13733       }
13734     }
13735     if (erase) {
13736       m_requests.erase(it);
13737     }
13738   } else {
13739     // Current code path guarantees that erasing by the same mc_id happens
13740     // at most once. INITED state may be erased by a thread that requested
13741     // the compaction. RUNNING state is erased by mc thread only.
13742     DBUG_ASSERT(0);
13743   }
13744   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13745 }
13746 
13747 int Rdb_manual_compaction_thread::request_manual_compaction(
13748     rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice *start,
13749     rocksdb::Slice *limit, int concurrency) {
13750   int mc_id = -1;
13751   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13752   if (m_requests.size() >= rocksdb_max_manual_compactions) {
13753     RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13754     return mc_id;
13755   }
13756   Manual_compaction_request mcr;
13757   mc_id = mcr.mc_id = ++m_latest_mc_id;
13758   mcr.state = Manual_compaction_request::INITED;
13759   mcr.cf = cf;
13760   mcr.start = start;
13761   mcr.limit = limit;
13762   mcr.concurrency = concurrency;
13763   m_requests.insert(std::make_pair(mcr.mc_id, mcr));
13764   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13765   return mc_id;
13766 }
13767 
13768 bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) {
13769   bool finished = false;
13770   RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13771   if (m_requests.count(mc_id) == 0) {
13772     finished = true;
13773   }
13774   RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13775   return finished;
13776 }
13777 
13778 /**
13779  * Locking read + Not Found + Read Committed occurs if we accessed
13780  * a row by Seek, tried to lock it, failed, released and reacquired the
13781  * snapshot (because of READ COMMITTED mode) and the row was deleted by
13782  * someone else in the meantime.
13783  * If so, we either just skipping the row, or re-creating a snapshot
13784  * and seek again. In both cases, Read Committed constraint is not broken.
13785  */
13786 bool ha_rocksdb::should_skip_invalidated_record(const int rc) {
13787   if ((m_lock_rows != RDB_LOCK_NONE && rc == HA_ERR_KEY_NOT_FOUND &&
13788        my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED)) {
13789     return true;
13790   }
13791   return false;
13792 }
13793 /**
13794  * Indicating snapshot needs to be re-created and retrying seek again,
13795  * instead of returning errors or empty set. This is normally applicable
13796  * when hitting kBusy when locking the first row of the transaction,
13797  * with Repeatable Read isolation level.
13798  */
13799 bool ha_rocksdb::should_recreate_snapshot(const int rc,
13800                                           const bool is_new_snapshot) {
13801   if (should_skip_invalidated_record(rc) ||
13802       (rc == HA_ERR_ROCKSDB_STATUS_BUSY && is_new_snapshot)) {
13803     return true;
13804   }
13805   return false;
13806 }
13807 
13808 /**
13809  * If calling put/delete/singledelete without locking the row,
13810  * it is necessary to pass assume_tracked=false to RocksDB TX API.
13811  * Read Free Replication and Blind Deletes are the cases when
13812  * using TX API and skipping row locking.
13813  */
13814 bool ha_rocksdb::can_assume_tracked(THD *thd) {
13815   if (/* MARIAROCKS_NOT_YET use_read_free_rpl() ||*/ (THDVAR(thd, blind_delete_primary_key))) {
13816     return false;
13817   }
13818   return true;
13819 }
13820 
13821 bool ha_rocksdb::check_bloom_and_set_bounds(
13822     THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
13823     const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
13824     uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
13825     rocksdb::Slice *upper_bound_slice) {
13826   bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
13827   if (!can_use_bloom) {
13828     setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound,
13829                           lower_bound_slice, upper_bound_slice);
13830   }
13831   return can_use_bloom;
13832 }
13833 
13834 /**
13835   Deciding if it is possible to use bloom filter or not.
13836 
13837   @detail
13838    Even if bloom filter exists, it is not always possible
13839    to use bloom filter. If using bloom filter when you shouldn't,
13840    false negative may happen -- fewer rows than expected may be returned.
13841    It is users' responsibility to use bloom filter correctly.
13842 
13843    If bloom filter does not exist, return value does not matter because
13844    RocksDB does not use bloom filter internally.
13845 
13846   @param kd
13847   @param eq_cond      Equal condition part of the key. This always includes
13848                       system index id (4 bytes).
13849   @param use_all_keys True if all key parts are set with equal conditions.
13850                       This is aware of extended keys.
13851 */
13852 bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
13853                                       const rocksdb::Slice &eq_cond,
13854                                       const bool use_all_keys) {
13855   bool can_use = false;
13856 
13857   if (THDVAR(thd, skip_bloom_filter_on_read)) {
13858     return can_use;
13859   }
13860 
13861   const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
13862   if (prefix_extractor) {
13863     /*
13864       This is an optimized use case for CappedPrefixTransform.
13865       If eq_cond length >= prefix extractor length and if
13866       all keys are used for equal lookup, it is
13867       always possible to use bloom filter.
13868 
13869       Prefix bloom filter can't be used on descending scan with
13870       prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
13871       RocksDB's limitation. On ascending (or not sorting) scan,
13872       keys longer than the capped prefix length will be truncated down
13873       to the capped length and the resulting key is added to the bloom filter.
13874 
13875       Keys shorter than the capped prefix length will be added to
13876       the bloom filter. When keys are looked up, key conditionals
13877       longer than the capped length can be used; key conditionals
13878       shorter require all parts of the key to be available
13879       for the short key match.
13880     */
13881     if ((use_all_keys && prefix_extractor->InRange(eq_cond)) ||
13882         prefix_extractor->SameResultWhenAppended(eq_cond)) {
13883       can_use = true;
13884     } else {
13885       can_use = false;
13886     }
13887   } else {
13888     /*
13889       if prefix extractor is not defined, all key parts have to be
13890       used by eq_cond.
13891     */
13892     if (use_all_keys) {
13893       can_use = true;
13894     } else {
13895       can_use = false;
13896     }
13897   }
13898 
13899   return can_use;
13900 }
13901 
13902 /* For modules that need access to the global data structures */
13903 rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }
13904 
13905 Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }
13906 
const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  // Accessor for the globally configured block-based table options.
  return *rocksdb_tbl_options;
}
13910 
13911 bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
// Mirrors the rocksdb_enable_ttl_read_filtering system variable.
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
#ifndef DBUG_OFF
// Debug-build-only accessors exposing the rocksdb_debug_ttl_* settings
// (timestamp shifts and the ignore-PK flag used by TTL code paths).
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
13923 
13924 void rdb_update_global_stats(const operation_type &type, uint count,
13925                              bool is_system_table) {
13926   DBUG_ASSERT(type < ROWS_MAX);
13927 
13928   if (count == 0) {
13929     return;
13930   }
13931 
13932   if (is_system_table) {
13933     global_stats.system_rows[type].add(count);
13934   } else {
13935     global_stats.rows[type].add(count);
13936   }
13937 }
13938 
13939 int rdb_get_table_perf_counters(const char *const tablename,
13940                                 Rdb_perf_counters *const counters) {
13941   DBUG_ASSERT(tablename != nullptr);
13942 
13943   Rdb_table_handler *table_handler;
13944   table_handler = rdb_open_tables.get_table_handler(tablename);
13945   if (table_handler == nullptr) {
13946     return HA_ERR_ROCKSDB_INVALID_TABLE;
13947   }
13948 
13949   counters->load(table_handler->m_table_perf_context);
13950 
13951   rdb_open_tables.release_table_handler(table_handler);
13952   return HA_EXIT_SUCCESS;
13953 }
13954 
13955 const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) {
13956   // If this assertion fails then this means that a member has been either added
13957   // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be
13958   // changed to return the appropriate value.
13959   static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types.");
13960 
13961   switch (err_type) {
13962     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT:
13963       return "RDB_IO_ERROR_TX_COMMIT";
13964     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT:
13965       return "RDB_IO_ERROR_DICT_COMMIT";
13966     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD:
13967       return "RDB_IO_ERROR_BG_THREAD";
13968     case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL:
13969       return "RDB_IO_ERROR_GENERAL";
13970     default:
13971       DBUG_ASSERT(false);
13972       return "(unknown)";
13973   }
13974 }
13975 
13976 // In case of core dump generation we want this function NOT to be optimized
13977 // so that we can capture as much data as possible to debug the root cause
13978 // more efficiently.
13979 #ifdef __GNUC__
13980 #endif
void rdb_handle_io_error(const rocksdb::Status status,
                         const RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    /* skip dumping core if write failed and we are allowed to do so */
#ifdef MARIAROCKS_NOT_YET
    if (skip_core_dump_on_error) {
      opt_core_file = false;
    }
#endif
    // All I/O errors on the write path are treated as unrecoverable: log
    // the status and abort the server.
    switch (err_type) {
      case RDB_IO_ERROR_TX_COMMIT:
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "failed to write to WAL");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_BG_THREAD: {
        rdb_log_status_error(status, "BG thread failed to write to RocksDB");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on BG write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_GENERAL: {
        rdb_log_status_error(status, "failed on I/O");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on I/O error.");
        abort();
        break;
      }
      default:
        DBUG_ASSERT(0);
        break;
    }
  } else if (status.IsCorruption()) {
    // Persist a corruption marker (so the condition is known on the next
    // startup), then abort.
    rdb_log_status_error(status, "data corruption detected!");
    rdb_persist_corruption_marker();
    /* NO_LINT_DEBUG */
    sql_print_error("MyRocks: aborting because of data corruption.");
    abort();
  } else if (!status.ok()) {
    // Other (non-I/O, non-corruption) failures: only dictionary commit
    // errors are fatal; everything else is logged and execution continues.
    switch (err_type) {
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "Failed to write to WAL (dictionary)");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      default:
        rdb_log_status_error(status, "Failed to read/write in RocksDB");
        break;
    }
  }
}
14038 #ifdef __GNUC__
14039 #endif
14040 Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }
14041 
14042 Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }
14043 
14044 Rdb_binlog_manager *rdb_get_binlog_manager(void) { return &binlog_manager; }
14045 
14046 void rocksdb_set_compaction_options(
14047     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14048     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14049     void *const var_ptr, const void *const save) {
14050   if (var_ptr && save) {
14051     *(uint64_t *)var_ptr = *(const uint64_t *)save;
14052   }
14053   const Rdb_compact_params params = {
14054       (uint64_t)rocksdb_compaction_sequential_deletes,
14055       (uint64_t)rocksdb_compaction_sequential_deletes_window,
14056       (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
14057   if (properties_collector_factory) {
14058     properties_collector_factory->SetCompactionParams(params);
14059   }
14060 }
14061 
14062 void rocksdb_set_table_stats_sampling_pct(
14063     my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14064     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14065     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14066   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14067 
14068   const uint32_t new_val = *static_cast<const uint32_t *>(save);
14069 
14070   if (new_val != rocksdb_table_stats_sampling_pct) {
14071     rocksdb_table_stats_sampling_pct = new_val;
14072 
14073     if (properties_collector_factory) {
14074       properties_collector_factory->SetTableStatsSamplingPct(
14075           rocksdb_table_stats_sampling_pct);
14076     }
14077   }
14078 
14079   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14080 }
14081 
14082 /*
14083   This function allows setting the rate limiter's bytes per second value
14084   but only if the rate limiter is turned on which has to be done at startup.
14085   If the rate is already 0 (turned off) or we are changing it to 0 (trying
14086   to turn it off) this function will push a warning to the client and do
14087   nothing.
14088   This is similar to the code in innodb_doublewrite_update (found in
14089   storage/innobase/handler/ha_innodb.cc).
14090 */
14091 void rocksdb_set_rate_limiter_bytes_per_sec(
14092     my_core::THD *const thd,
14093     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14094     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14095   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14096   if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
14097     /*
14098       If a rate_limiter was not enabled at startup we can't change it nor
14099       can we disable it if one was created at startup
14100     */
14101     push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
14102                         "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
14103                         "be dynamically changed to or from 0.  Do a clean "
14104                         "shutdown if you want to change it from or to 0.");
14105   } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
14106     /* Apply the new value to the rate limiter and store it locally */
14107     DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
14108     rocksdb_rate_limiter_bytes_per_sec = new_val;
14109     rocksdb_rate_limiter->SetBytesPerSecond(new_val);
14110   }
14111 }
14112 
14113 void rocksdb_set_sst_mgr_rate_bytes_per_sec(
14114     my_core::THD *const thd,
14115     my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14116     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14117   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14118 
14119   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14120 
14121   if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
14122     rocksdb_sst_mgr_rate_bytes_per_sec = new_val;
14123 
14124     rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
14125         rocksdb_sst_mgr_rate_bytes_per_sec);
14126   }
14127 
14128   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14129 }
14130 
14131 void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
14132                                     void *var_ptr, const void *save) {
14133   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14134   const uint64_t new_val = *static_cast<const uint64_t *>(save);
14135   if (rocksdb_delayed_write_rate != new_val) {
14136     rocksdb_delayed_write_rate = new_val;
14137     rocksdb::Status s =
14138         rdb->SetDBOptions({{"delayed_write_rate", std::to_string(new_val)}});
14139 
14140     if (!s.ok()) {
14141       /* NO_LINT_DEBUG */
14142       sql_print_warning(
14143           "MyRocks: failed to update delayed_write_rate. "
14144           "status code = %d, status = %s",
14145           s.code(), s.ToString().c_str());
14146     }
14147   }
14148   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14149 }
14150 
14151 void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
14152                                       void *var_ptr, const void *save) {
14153   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14154   const uint32_t new_val = *static_cast<const uint32_t *>(save);
14155   if (rocksdb_max_latest_deadlocks != new_val) {
14156     rocksdb_max_latest_deadlocks = new_val;
14157     rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
14158   }
14159   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14160 }
14161 
14162 void rdb_set_collation_exception_list(const char *const exception_list) {
14163   DBUG_ASSERT(rdb_collation_exceptions != nullptr);
14164 
14165   if (!rdb_collation_exceptions->set_patterns(exception_list)) {
14166     my_core::warn_about_bad_patterns(rdb_collation_exceptions,
14167                                      "strict_collation_exceptions");
14168   }
14169 }
14170 
void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  rdb_set_collation_exception_list(val == nullptr ? "" : val);

  // Store a private heap copy of the new string in the sysvar's storage and
  // free the previous one, so the variable owns its value independently of
  // the lifetime of 'save' (answers the earlier psergey-todo question).
  const char *val_copy= val? my_strdup(val, MYF(0)): nullptr;
  my_free(*static_cast<char**>(var_ptr));
  *static_cast<const char**>(var_ptr) = val_copy;
}
14184 
14185 int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) {
14186   int new_value_type = value->value_type(value);
14187   if (new_value_type == MYSQL_VALUE_TYPE_STRING) {
14188     char buf[16];
14189     int len = sizeof(buf);
14190     const char *str = value->val_str(value, buf, &len);
14191     if (str && (my_strcasecmp(system_charset_info, "true", str) == 0 ||
14192                 my_strcasecmp(system_charset_info, "on", str) == 0)) {
14193       *return_value = TRUE;
14194     } else if (str && (my_strcasecmp(system_charset_info, "false", str) == 0 ||
14195                        my_strcasecmp(system_charset_info, "off", str) == 0)) {
14196       *return_value = FALSE;
14197     } else {
14198       return 1;
14199     }
14200   } else if (new_value_type == MYSQL_VALUE_TYPE_INT) {
14201     long long intbuf;
14202     value->val_int(value, &intbuf);
14203     if (intbuf > 1) return 1;
14204     *return_value = intbuf > 0 ? TRUE : FALSE;
14205   } else {
14206     return 1;
14207   }
14208 
14209   return 0;
14210 }
14211 
/*
  Sysvar check handler for rocksdb_bulk_load.
  Before the value is allowed to change, finish any bulk load in progress
  on this connection so pending SST files are finalized.
  @return 0 to accept the new value, 1 to reject the SET statement.
*/
int rocksdb_check_bulk_load(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;  // value is not a recognizable boolean
  }

  // Finalize any SST files this connection's transaction has been building.
  // Only a critical finalization error rejects the SET.
  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (tx != nullptr) {
    bool is_critical_error;
    const int rc = tx->finish_bulk_load(&is_critical_error);
    if (rc != 0 && is_critical_error) {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: Error %d finalizing last SST file while "
          "setting bulk loading variable",
          rc);
      // Force bulk load off for this session since finalization failed.
      THDVAR(thd, bulk_load) = 0;
      return 1;
    }
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
14238 
/*
  Sysvar check handler for rocksdb_bulk_load_allow_unsorted.
  The setting may only be changed while bulk load is not active on this
  connection.
  @return 0 to accept the new value, 1 to reject the SET statement.
*/
int rocksdb_check_bulk_load_allow_unsorted(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;  // value is not a recognizable boolean
  }

  // Reject the change while a bulk load is enabled on this session.
  if (THDVAR(thd, bulk_load)) {
    my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET",
             "Cannot change this setting while bulk load is enabled");

    return 1;
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
14257 
/*
  Sysvar update handler for rocksdb_max_background_jobs.
  Propagates the new value into the running RocksDB instance via
  SetDBOptions() and keeps the cached DBOptions copy in sync.
*/
static void rocksdb_set_max_background_jobs(THD *thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rocksdb_db_options != nullptr);
  DBUG_ASSERT(rocksdb_db_options->env != nullptr);

  // Serialize concurrent sysvar updates against each other.
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const int new_val = *static_cast<const int *>(save);

  if (rocksdb_db_options->max_background_jobs != new_val) {
    rocksdb_db_options->max_background_jobs = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"max_background_jobs", std::to_string(new_val)}});

    if (!s.ok()) {
      // A failure here leaves the cached value ahead of the engine's
      // actual setting; we only warn, matching the other set_* handlers.
      /* NO_LINT_DEBUG */
      sql_print_warning(
          "MyRocks: failed to update max_background_jobs. "
          "Status code = %d, status = %s.",
          s.code(), s.ToString().c_str());
    }
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14286 
14287 static void rocksdb_set_bytes_per_sync(
14288     THD *thd MY_ATTRIBUTE((__unused__)),
14289     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14290     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14291   DBUG_ASSERT(save != nullptr);
14292   DBUG_ASSERT(rocksdb_db_options != nullptr);
14293   DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14294 
14295   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14296 
14297   const ulonglong new_val = *static_cast<const ulonglong *>(save);
14298 
14299   if (rocksdb_db_options->bytes_per_sync != new_val) {
14300     rocksdb_db_options->bytes_per_sync = new_val;
14301     rocksdb::Status s =
14302         rdb->SetDBOptions({{"bytes_per_sync", std::to_string(new_val)}});
14303 
14304     if (!s.ok()) {
14305       /* NO_LINT_DEBUG */
14306       sql_print_warning(
14307           "MyRocks: failed to update max_background_jobs. "
14308           "Status code = %d, status = %s.",
14309           s.code(), s.ToString().c_str());
14310     }
14311   }
14312 
14313   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14314 }
14315 
14316 static void rocksdb_set_wal_bytes_per_sync(
14317     THD *thd MY_ATTRIBUTE((__unused__)),
14318     struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14319     void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14320   DBUG_ASSERT(save != nullptr);
14321   DBUG_ASSERT(rocksdb_db_options != nullptr);
14322   DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14323 
14324   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14325 
14326   const ulonglong new_val = *static_cast<const ulonglong *>(save);
14327 
14328   if (rocksdb_db_options->wal_bytes_per_sync != new_val) {
14329     rocksdb_db_options->wal_bytes_per_sync = new_val;
14330     rocksdb::Status s =
14331         rdb->SetDBOptions({{"wal_bytes_per_sync", std::to_string(new_val)}});
14332 
14333     if (!s.ok()) {
14334       /* NO_LINT_DEBUG */
14335       sql_print_warning(
14336           "MyRocks: failed to update max_background_jobs. "
14337           "Status code = %d, status = %s.",
14338           s.code(), s.ToString().c_str());
14339     }
14340   }
14341 
14342   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14343 }
14344 
14345 /*
14346   Validating and updating block cache size via sys_var::check path.
14347   SetCapacity may take seconds when reducing block cache, and
14348   sys_var::update holds LOCK_global_system_variables mutex, so
14349   updating block cache size is done at check path instead.
14350 */
static int rocksdb_validate_set_block_cache_size(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *var_ptr, struct st_mysql_value *value) {
  DBUG_ASSERT(value != nullptr);

  long long new_value;

  /* value is NULL */
  if (value->val_int(value, &new_value)) {
    return HA_EXIT_FAILURE;
  }

  // Enforce the minimum; the second comparison rejects negative values,
  // which would reinterpret as huge unsigned numbers.
  if (new_value < RDB_MIN_BLOCK_CACHE_SIZE ||
      (uint64_t)new_value > (uint64_t)LLONG_MAX) {
    return HA_EXIT_FAILURE;
  }

  // Resize under a dedicated mutex (not LOCK_global_system_variables):
  // SetCapacity() may take seconds when shrinking -- see comment above.
  RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex);
  const rocksdb::BlockBasedTableOptions &table_options =
      rdb_get_table_options();

  if (rocksdb_block_cache_size != new_value && table_options.block_cache) {
    table_options.block_cache->SetCapacity(new_value);
  }
  // NOTE(review): var_ptr is written here in the check phase -- presumably
  // this variable has no separate update handler; confirm at the sysvar
  // declaration.
  *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
  RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex);
  return HA_EXIT_SUCCESS;
}
14380 
14381 static int rocksdb_validate_update_cf_options(
14382     THD * /* unused */, struct st_mysql_sys_var * /*unused*/, void *save,
14383     struct st_mysql_value *value) {
14384   char buff[STRING_BUFFER_USUAL_SIZE];
14385   const char *str;
14386   int length;
14387   length = sizeof(buff);
14388   str = value->val_str(value, buff, &length);
14389   // In some cases, str can point to buff in the stack.
14390   // This can cause invalid memory access after validation is finished.
14391   // To avoid this kind case, let's alway duplicate the str if str is not
14392   // nullptr
14393   *(const char **)save = (str == nullptr) ? nullptr : my_strdup(str, MYF(0));
14394 
14395   if (str == nullptr) {
14396     return HA_EXIT_SUCCESS;
14397   }
14398 
14399   Rdb_cf_options::Name_to_config_t option_map;
14400 
14401   // Basic sanity checking and parsing the options into a map. If this fails
14402   // then there's no point to proceed.
14403   if (!Rdb_cf_options::parse_cf_options(str, &option_map)) {
14404     my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str);
14405     // Free what we've copied with my_strdup above.
14406     my_free((void*)(*(const char **)save));
14407     return HA_EXIT_FAILURE;
14408   }
14409   // Loop through option_map and create missing column families
14410   for (Rdb_cf_options::Name_to_config_t::iterator it = option_map.begin();
14411        it != option_map.end(); ++it) {
14412     cf_manager.get_or_create_cf(rdb, it->first);
14413   }
14414   return HA_EXIT_SUCCESS;
14415 }
14416 
14417 static void rocksdb_set_update_cf_options(
14418     THD *const /* unused */, struct st_mysql_sys_var *const /* unused */,
14419     void *const var_ptr, const void *const save) {
14420   const char *const val = *static_cast<const char *const *>(save);
14421 
14422   RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14423 
14424   my_free(*reinterpret_cast<char **>(var_ptr));
14425 
14426   if (!val) {
14427     *reinterpret_cast<char **>(var_ptr) = nullptr;
14428     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14429     return;
14430   }
14431 
14432   DBUG_ASSERT(val != nullptr);
14433 
14434   // Reset the pointers regardless of how much success we had with updating
14435   // the CF options. This will results in consistent behavior and avoids
14436   // dealing with cases when only a subset of CF-s was successfully updated.
14437   *reinterpret_cast<const char **>(var_ptr) = val;
14438 
14439   // Do the real work of applying the changes.
14440   Rdb_cf_options::Name_to_config_t option_map;
14441 
14442   // This should never fail, because of rocksdb_validate_update_cf_options
14443   if (!Rdb_cf_options::parse_cf_options(val, &option_map)) {
14444     my_free(*reinterpret_cast<char**>(var_ptr));
14445     RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14446     return;
14447   }
14448 
14449   // For each CF we have, see if we need to update any settings.
14450   for (const auto &cf_name : cf_manager.get_cf_names()) {
14451     DBUG_ASSERT(!cf_name.empty());
14452 
14453     rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
14454     DBUG_ASSERT(cfh != nullptr);
14455 
14456     const auto it = option_map.find(cf_name);
14457     std::string per_cf_options = (it != option_map.end()) ? it->second : "";
14458 
14459     if (!per_cf_options.empty()) {
14460       Rdb_cf_options::Name_to_config_t opt_map;
14461       rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map);
14462 
14463       if (s != rocksdb::Status::OK()) {
14464         // NO_LINT_DEBUG
14465         sql_print_warning(
14466             "MyRocks: failed to convert the options for column "
14467             "family '%s' to a map. %s",
14468             cf_name.c_str(), s.ToString().c_str());
14469       } else {
14470         DBUG_ASSERT(rdb != nullptr);
14471 
14472         // Finally we can apply the options.
14473         s = rdb->SetOptions(cfh, opt_map);
14474 
14475         if (s != rocksdb::Status::OK()) {
14476           // NO_LINT_DEBUG
14477           sql_print_warning(
14478               "MyRocks: failed to apply the options for column "
14479               "family '%s'. %s",
14480               cf_name.c_str(), s.ToString().c_str());
14481         } else {
14482           // NO_LINT_DEBUG
14483           sql_print_information(
14484               "MyRocks: options for column family '%s' "
14485               "have been successfully updated.",
14486               cf_name.c_str());
14487 
14488           // Make sure that data is internally consistent as well and update
14489           // the CF options. This is necessary also to make sure that the CF
14490           // options will be correctly reflected in the relevant table:
14491           // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA.
14492           rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh);
14493           std::string updated_options;
14494 
14495           s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options,
14496                                                         cf_options);
14497 
14498           DBUG_ASSERT(s == rocksdb::Status::OK());
14499           DBUG_ASSERT(!updated_options.empty());
14500 
14501           cf_manager.update_options_map(cf_name, updated_options);
14502         }
14503       }
14504     }
14505   }
14506 
14507   // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to
14508   // free up resources used before.
14509 
14510   RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14511 }
14512 
// Ask the background thread to persist index statistics at its next wakeup.
void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
14514 
14515 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
14516 
// Replication hook: mark that this handler is processing a delete-rows
// event (cleared again by rpl_after_delete_rows()).
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
14524 
// Replication hook: clear the delete-rows-in-progress flag set by
// rpl_before_delete_rows().
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
14532 
// Replication hook: mark that this handler is processing an update-rows
// event (cleared again by rpl_after_update_rows()).
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
14540 
// Replication hook: clear the update-rows-in-progress flag set by
// rpl_before_update_rows().
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
14548 
#if 0
// Disabled: depends on per-table read-free-replication metadata
// (m_is_read_free_rpl_table) not yet ported -- see MARIAROCKS_NOT_YET.
bool ha_rocksdb::is_read_free_rpl_table() const {
  return table->s && m_tbl_def->m_is_read_free_rpl_table;
}
#endif
14554 
14555 /**
14556   @brief
14557   Read Free Replication can be used or not. Returning true means
14558   Read Free Replication can be used.
14559 */
14560 bool ha_rocksdb::use_read_free_rpl() const {
14561   DBUG_ENTER_FUNC();
14562 
14563   if (!ha_thd()->rli_slave || table->triggers || /* !is_read_free_rpl_table()*/ ) {
14564     DBUG_RETURN(false);
14565   }
14566 
14567 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
14568   switch (rocksdb_read_free_rpl) {
14569     case read_free_rpl_type::OFF:
14570       DBUG_RETURN(false);
14571     case read_free_rpl_type::PK_ONLY:
14572       DBUG_RETURN(!has_hidden_pk(table) && table->s->keys == 1);
14573     case read_free_rpl_type::PK_SK:
14574       DBUG_RETURN(!has_hidden_pk(table));
14575   }
14576 #else
14577   DBUG_RETURN(false);
14578 #endif
14579 
14580   DBUG_ASSERT(false);
14581   DBUG_RETURN(false);
14582 }
14583 #endif // MARIAROCKS_NOT_YET
14584 
/*
  Optimizer cost estimate for reading `rows` rows over `ranges` ranges of
  the given index.
*/
double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
  DBUG_ENTER_FUNC();

  if (index != table->s->primary_key) {
    /* Non covering index range scan */
    DBUG_RETURN(handler::read_time(index, ranges, rows));
  }

  // Primary-key scans are priced cheaper than the generic handler
  // estimate: a flat cost of 1 plus 1/20th per row.
  DBUG_RETURN((rows / 20.0) + 1);
}
14595 
14596 void ha_rocksdb::print_error(int error, myf errflag) {
14597   if (error == HA_ERR_ROCKSDB_STATUS_BUSY) {
14598     error = HA_ERR_LOCK_DEADLOCK;
14599   }
14600   handler::print_error(error, errflag);
14601 }
14602 
14603 std::string rdb_corruption_marker_file_name() {
14604   std::string ret(rocksdb_datadir);
14605   ret.append("/ROCKSDB_CORRUPTED");
14606   return ret;
14607 }
14608 
/*
  printf-style logging helper that emits an informational message only
  when log_warnings is above 2 (i.e. verbose logging is enabled).
*/
void sql_print_verbose_info(const char *format, ...)
{
  va_list args;

  if (global_system_variables.log_warnings > 2) {
    va_start(args, format);
    sql_print_information_v(format, args);
    va_end(args);
  }
}
14619 
14620 }  // namespace myrocks
14621 
14622 
14623 /**
14624   Construct and emit duplicate key error message using information
14625   from table's record buffer.
14626 
14627   @sa print_keydup_error(table, key, msg, errflag, thd, org_table_name).
14628 */
14629 
void print_keydup_error(TABLE *table, KEY *key, myf errflag,
                        const THD *thd, const char *org_table_name)
{
  // thd and org_table_name are accepted for interface compatibility but
  // unused; delegate to the overload that formats the default
  // ER_DUP_ENTRY_WITH_KEY_NAME message.
  print_keydup_error(table, key, ER(ER_DUP_ENTRY_WITH_KEY_NAME), errflag);
}
14635 
14636 /*
14637   Register the storage engine plugin outside of myrocks namespace
14638   so that mysql_declare_plugin does not get confused when it does
14639   its name generation.
14640 */
14641 
14642 
/* Storage-engine descriptor required by the plugin API. */
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};

/*
  Plugin registration: the ROCKSDB storage engine itself, followed by the
  INFORMATION_SCHEMA plugins that expose MyRocks internals.
*/
maria_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
    &rocksdb_storage_engine,           /* Plugin Descriptor */
    "ROCKSDB",                         /* Plugin Name */
    "Monty Program Ab",                /* Plugin Author */
    "RocksDB storage engine",          /* Plugin Description */
    PLUGIN_LICENSE_GPL,                /* Plugin Licence */
    myrocks::rocksdb_init_func,        /* Plugin Entry Point */
    myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
    0x0001,                            /* version number (0.1) */
    myrocks::rocksdb_status_vars,      /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
  "1.0",                                        /* string version */
  myrocks::MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL /* maturity level */
},
    /* INFORMATION_SCHEMA table plugins */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map,
    myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info
maria_declare_plugin_end;
14669