/*
   Copyright (c) 2012, Monty Program Ab

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#ifdef USE_PRAGMA_IMPLEMENTATION
#pragma implementation // gcc: Class implementation
#endif

#define MYSQL_SERVER 1

/* The C++ file's header */
#include "./ha_rocksdb.h"

/* C++ standard header files */
#include <algorithm>
#include <queue>
#include <set>
#include <string>
#include <vector>

/* MySQL includes */
#include "./debug_sync.h"
#include "./my_bit.h"
#include "./my_stacktrace.h"
#include "./sql_audit.h"
#include "./sql_table.h"
#include <mysql/psi/mysql_table.h>
#include <mysql/thread_pool_priv.h>
#include <mysys_err.h>

/* RocksDB includes */
#include "rocksdb/compaction_filter.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/utilities/checkpoint.h"
#include "rocksdb/utilities/convenience.h"
#include "rocksdb/utilities/memory_util.h"

/* MyRocks includes */
#include "./event_listener.h"
#include "./ha_rocksdb_proto.h"
#include "./logger.h"
#include "./rdb_cf_manager.h"
#include "./rdb_cf_options.h"
#include "./rdb_datadic.h"
#include "./rdb_i_s.h"
#include "./rdb_index_merge.h"
#include "./rdb_mutex_wrapper.h"
#include "./rdb_threads.h"

// Internal MySQL APIs not exposed in any header.
extern "C" {
/**
  Mark transaction to rollback and mark error as fatal to a sub-statement.
  @param  thd   Thread handle
  @param  all   TRUE <=> rollback main transaction.
*/
void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);

/**
 *   Get the user thread's binary logging format
 *   @param thd  user thread
 *   @return Value to be used as index into the binlog_format_names array
*/
int thd_binlog_format(const MYSQL_THD thd);

/**
 *   Check if binary logging is filtered for thread's current db.
 *   @param  thd   Thread handle
 *   @retval 1 the query is not filtered, 0 otherwise.
*/
bool thd_binlog_filter_ok(const MYSQL_THD thd);
}

namespace myrocks {

static st_global_stats global_stats;
static st_export_stats export_stats;

/**
  Updates row counters based on the table type and operation type.
*/
void ha_rocksdb::update_row_stats(const operation_type &type) {
  DBUG_ASSERT(type < ROWS_MAX);
  // Find if we are modifying system databases.
  if (table->s && m_tbl_def->m_is_mysql_system_table)
    global_stats.system_rows[type].inc();
  else
    global_stats.rows[type].inc();
}

void dbug_dump_database(rocksdb::DB *db);
static handler *rocksdb_create_handler(my_core::handlerton *hton,
                                       my_core::TABLE_SHARE *table_arg,
                                       my_core::MEM_ROOT *mem_root);

bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
                          const rocksdb::Slice &eq_cond,
                          const bool use_all_keys, bool is_ascending);

///////////////////////////////////////////////////////////
// Parameters and settings
///////////////////////////////////////////////////////////
static char *rocksdb_default_cf_options;
static char *rocksdb_override_cf_options;
Rdb_cf_options rocksdb_cf_options_map;

///////////////////////////////////////////////////////////
// Globals
///////////////////////////////////////////////////////////
handlerton *rocksdb_hton;

rocksdb::TransactionDB *rdb = nullptr;

static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;

Rdb_dict_manager dict_manager;
Rdb_cf_manager cf_manager;
Rdb_ddl_manager ddl_manager;

/**
  MyRocks background thread control
  N.B. This is besides RocksDB's own background threads
       (@see rocksdb::CancelAllBackgroundWork())
*/

static Rdb_background_thread rdb_bg_thread;

// List of table names (using regex) that are exceptions to the strict
// collation check requirement.
Regex *rdb_collation_exceptions;

static const char *const ERRSTR_ROLLBACK_ONLY =
    "This transaction was rolled back and cannot be "
    "committed. Only supported operation is to roll it back, "
    "so all pending changes will be discarded. "
    "Please restart another transaction.";

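/*
  Flush the active memtable of every column family known to the cf manager.
  Triggered through the rocksdb_force_flush_memtable_now system variable
  defined below.
*/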
static void rocksdb_flush_all_memtables() {
  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    rdb->Flush(rocksdb::FlushOptions(), cf_handle);
  }
}

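/* This method is needed to indicate that the
   ROCKSDB_COMPACT_CF command is not read-only */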
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}

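/*
  Check callback for @@rocksdb_compact_cf: looks up the named column family
  and, if it exists, runs a full manual CompactRange() on it.  Example usage
  (assuming a column family named 'default' exists):

    SET GLOBAL rocksdb_compact_cf = 'default';
*/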
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    bool is_automatic;
    auto cfh = cf_manager.get_cf(cf, "", nullptr, &is_automatic);
    if (cfh != nullptr && rdb != nullptr) {
      sql_print_information("RocksDB: Manual compaction of column family: %s\n",
                            cf);
      rdb->CompactRange(rocksdb::CompactRangeOptions(), cfh, nullptr, nullptr);
    }
  }
  return HA_EXIT_SUCCESS;
}

///////////////////////////////////////////////////////////
// Hash map: table name => open table handler
///////////////////////////////////////////////////////////

namespace // anonymous namespace = not visible outside this source file
{

const ulong TABLE_HASH_SIZE = 32;

struct Rdb_open_tables_map {
  /* Hash table used to track the handlers of open tables */
  my_core::HASH m_hash;
  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

  void init_hash(void) {
    (void)my_hash_init(&m_hash, my_core::system_charset_info, TABLE_HASH_SIZE,
                       0, 0, (my_hash_get_key)Rdb_open_tables_map::get_hash_key,
                       0, 0);
  }

  void free_hash(void) { my_hash_free(&m_hash); }

  static uchar *get_hash_key(Rdb_table_handler *const table_handler,
                             size_t *const length,
                             my_bool not_used __attribute__((__unused__)));

  Rdb_table_handler *get_table_handler(const char *const table_name);
  void release_table_handler(Rdb_table_handler *const table_handler);

  std::vector<std::string> get_table_names(void) const;
};

} // anonymous namespace

static Rdb_open_tables_map rdb_open_tables;

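/* Remove any trailing '/' characters from a directory path. */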
static std::string rdb_normalize_dir(std::string dir) {
  while (dir.size() > 0 && dir.back() == '/') {
    dir.resize(dir.size() - 1);
  }
  return dir;
}

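/*
  Check callback for @@rocksdb_create_checkpoint: creates a RocksDB
  checkpoint (a consistent snapshot of the data files) in the given
  directory.  Typical usage (the path below is just an example):

    SET GLOBAL rocksdb_create_checkpoint = '/path/to/backup_dir';
*/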
static int rocksdb_create_checkpoint(THD *const thd __attribute__((__unused__)),
                                     struct st_mysql_sys_var *const var
                                     __attribute__((__unused__)),
                                     void *const save
                                     __attribute__((__unused__)),
                                     struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        if (status.ok()) {
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
        } else {
          my_printf_error(
              ER_UNKNOWN_ERROR,
              "RocksDB: Failed to create checkpoint directory. status %d %s",
              MYF(0), status.code(), status.ToString().c_str());
        }
        delete checkpoint;
      } else {
        const std::string err_text(status.ToString());
        my_printf_error(
            ER_UNKNOWN_ERROR,
            "RocksDB: failed to initialize checkpoint. status %d %s\n", MYF(0),
            status.code(), err_text.c_str());
      }
      return status.code();
    }
  }
  return HA_ERR_INTERNAL_ERROR;
}

/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}

static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}

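/*
  Check callback for @@rocksdb_force_flush_memtable_now: flushes all
  memtables to SST files, e.g.

    SET GLOBAL rocksdb_force_flush_memtable_now = 1;
*/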
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  sql_print_information("RocksDB: Manual memtable flush\n");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}

static void rocksdb_drop_index_wakeup_thread(
    my_core::THD *const thd __attribute__((__unused__)),
    struct st_mysql_sys_var *const var __attribute__((__unused__)),
    void *const var_ptr __attribute__((__unused__)), const void *const save);

static my_bool rocksdb_pause_background_work = 0;
static mysql_mutex_t rdb_sysvars_mutex;

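/*
  Update callback for @@rocksdb_pause_background_work: pauses or resumes
  RocksDB background work (flushes and compactions) under rdb_sysvars_mutex,
  remembering the current state so that setting the same value twice is a
  no-op.
*/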
static void rocksdb_set_pause_background_work(
    my_core::THD *const thd __attribute__((__unused__)),
    struct st_mysql_sys_var *const var __attribute__((__unused__)),
    void *const var_ptr __attribute__((__unused__)), const void *const save) {
  mysql_mutex_lock(&rdb_sysvars_mutex);
  const bool pause_requested = *static_cast<const bool *>(save);
  if (rocksdb_pause_background_work != pause_requested) {
    if (pause_requested) {
      rdb->PauseBackgroundWork();
    } else {
      rdb->ContinueBackgroundWork();
    }
    rocksdb_pause_background_work = pause_requested;
  }
  mysql_mutex_unlock(&rdb_sysvars_mutex);
}

static void rocksdb_set_compaction_options(THD *thd,
                                           struct st_mysql_sys_var *var,
                                           void *var_ptr, const void *save);

static void rocksdb_set_table_stats_sampling_pct(THD *thd,
                                                 struct st_mysql_sys_var *var,
                                                 void *var_ptr,
                                                 const void *save);

static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
                                                   struct st_mysql_sys_var *var,
                                                   void *var_ptr,
                                                   const void *save);

static void rdb_set_collation_exception_list(const char *exception_list);
static void rocksdb_set_collation_exception_list(THD *thd,
                                                 struct st_mysql_sys_var *var,
                                                 void *var_ptr,
                                                 const void *save);

static void rocksdb_set_bulk_load(THD *thd, struct st_mysql_sys_var *var
                                  __attribute__((__unused__)),
                                  void *var_ptr, const void *save);

static void rocksdb_set_max_background_compactions(
    THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save);
//////////////////////////////////////////////////////////////////////////////
// Options definitions
//////////////////////////////////////////////////////////////////////////////
static long long rocksdb_block_cache_size;
/* Use unsigned long long instead of uint64_t because of MySQL compatibility */
static unsigned long long // NOLINT(runtime/int)
    rocksdb_rate_limiter_bytes_per_sec;
static unsigned long // NOLINT(runtime/int)
    rocksdb_persistent_cache_size;
static uint64_t rocksdb_info_log_level;
static char *rocksdb_wal_dir;
static char *rocksdb_persistent_cache_path;
static uint64_t rocksdb_index_type;
static char rocksdb_background_sync;
static uint32_t rocksdb_debug_optimizer_n_rows;
static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
static uint32_t rocksdb_wal_recovery_mode;
static uint32_t rocksdb_access_hint_on_compaction_start;
static char *rocksdb_compact_cf_name;
static char *rocksdb_checkpoint_name;
static my_bool rocksdb_signal_drop_index_thread;
static my_bool rocksdb_strict_collation_check = 1;
static my_bool rocksdb_enable_2pc = 0;
static char *rocksdb_strict_collation_exceptions;
static my_bool rocksdb_collect_sst_properties = 1;
static my_bool rocksdb_force_flush_memtable_now_var = 0;
static uint64_t rocksdb_number_stat_computes = 0;
static uint32_t rocksdb_seconds_between_stat_computes = 3600;
static long long rocksdb_compaction_sequential_deletes = 0l;
static long long rocksdb_compaction_sequential_deletes_window = 0l;
static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
static uint32_t rocksdb_validate_tables = 1;
static char *rocksdb_datadir;
static uint32_t rocksdb_table_stats_sampling_pct;
static my_bool rocksdb_enable_bulk_load_api = 1;
static my_bool rpl_skip_tx_api_var = 0;
static my_bool rocksdb_print_snapshot_conflict_queries = 0;

std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
std::atomic<uint64_t> rocksdb_wal_group_syncs(0);

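/*
  Build the base rocksdb::DBOptions.  Most of these fields are exposed
  through the MYSQL_SYSVAR definitions below, which use the values set here
  as their defaults.
*/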
static rocksdb::DBOptions rdb_init_rocksdb_db_options(void) {
  rocksdb::DBOptions o;

  o.create_if_missing = true;
  o.listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
  o.info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
  o.max_subcompactions = DEFAULT_SUBCOMPACTIONS;

  return o;
}

static rocksdb::DBOptions rocksdb_db_options = rdb_init_rocksdb_db_options();
static rocksdb::BlockBasedTableOptions rocksdb_tbl_options;

static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;

/* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
static const char *info_log_level_names[] = {"debug_level", "info_level",
                                             "warn_level",  "error_level",
                                             "fatal_level", NullS};

static TYPELIB info_log_level_typelib = {
    array_elements(info_log_level_names) - 1, "info_log_level_typelib",
    info_log_level_names, nullptr};

static void rocksdb_set_rocksdb_info_log_level(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {
  DBUG_ASSERT(save != nullptr);

  mysql_mutex_lock(&rdb_sysvars_mutex);
  rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
  rocksdb_db_options.info_log->SetInfoLogLevel(
      static_cast<const rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  mysql_mutex_unlock(&rdb_sysvars_mutex);
}

static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};

static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
                                     "index_type_typelib", index_type_names,
                                     nullptr};

const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024;
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;

// TODO: 0 means don't wait at all, and we don't support it yet?
static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
                          "Number of seconds to wait for lock", nullptr,
                          nullptr, /*default*/ 1, /*min*/ 1,
                          /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);

static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
                         "Enables deadlock detection", nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    trace_sst_api, PLUGIN_VAR_RQCMDARG,
    "Generate trace output in the log for each call to the SstFileWriter",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    bulk_load, PLUGIN_VAR_RQCMDARG,
    "Use bulk-load mode for inserts. This disables "
    "unique_checks and enables rocksdb_commit_in_the_middle.",
    nullptr, rocksdb_set_bulk_load, FALSE);

static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables using SstFileWriter for bulk loading",
                         nullptr, nullptr, rocksdb_enable_bulk_load_api);

static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Directory for temporary files during DDL operations.",
                        nullptr, nullptr, "");

static MYSQL_THDVAR_BOOL(
    commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
    "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
    "update and delete",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_STR(
    read_free_rpl_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Regex that describes the set of tables that will use read-free "
    "replication on the slave (i.e. no row lookup during replication)",
    nullptr, nullptr, "");

static MYSQL_SYSVAR_BOOL(
    rpl_skip_tx_api, rpl_skip_tx_api_var, PLUGIN_VAR_RQCMDARG,
    "Use write batches for replication thread instead of tx api", nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
                         "Skip using bloom filter for reads", nullptr, nullptr,
                         FALSE);

static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have",
                          nullptr, nullptr,
                          /*default*/ RDB_MAX_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated", nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode", nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation.",
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of external sort during fast index creation.",
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
    rocksdb_db_options.create_if_missing);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
    rocksdb_db_options.create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
    rocksdb_db_options.error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
    rocksdb_db_options.paranoid_checks);

static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ENUM(
    info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
    "Filter level for info logs to be written to the mysqld error log. "
    "Valid values include 'debug_level', 'info_level', 'warn_level', "
    "'error_level' and 'fatal_level'.",
    nullptr, rocksdb_set_rocksdb_info_log_level,
    rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);

static MYSQL_THDVAR_INT(
    perf_context_level, PLUGIN_VAR_RQCMDARG,
    "Perf Context Level for rocksdb internal timer stat collection", nullptr,
    nullptr,
    /* default */ rocksdb::PerfLevel::kUninitialized,
    /* min */ rocksdb::PerfLevel::kUninitialized,
    /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

static MYSQL_SYSVAR_UINT(
    wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
    "DBOptions::wal_recovery_mode for RocksDB", nullptr, nullptr,
    /* default */ (uint)rocksdb::WALRecoveryMode::kPointInTimeRecovery,
    /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
    /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);

static MYSQL_SYSVAR_ULONG(compaction_readahead_size,
                          rocksdb_db_options.compaction_readahead_size,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::compaction_readahead_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.compaction_readahead_size,
                          /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    new_table_reader_for_compaction_inputs,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.new_table_reader_for_compaction_inputs),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
    nullptr, rocksdb_db_options.new_table_reader_for_compaction_inputs);

static MYSQL_SYSVAR_UINT(
    access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
    /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
    /* min */ (uint)rocksdb::Options::AccessHint::NONE,
    /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);

static MYSQL_SYSVAR_BOOL(
    allow_concurrent_memtable_write,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.allow_concurrent_memtable_write),
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(
    enable_write_thread_adaptive_yield,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.enable_write_thread_adaptive_yield),
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
    nullptr, false);

static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options.max_open_files,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_open_files for RocksDB", nullptr,
                        nullptr, rocksdb_db_options.max_open_files,
                        /* min */ -1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_total_wal_size,
                          rocksdb_db_options.max_total_wal_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_total_wal_size for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.max_total_wal_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    disabledatasync,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.disableDataSync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::disableDataSync for RocksDB", nullptr, nullptr,
    rocksdb_db_options.disableDataSync);

static MYSQL_SYSVAR_BOOL(
    use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_fsync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_fsync);

static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
                        rocksdb_db_options.wal_dir.c_str());

static MYSQL_SYSVAR_STR(
    persistent_cache_path, rocksdb_persistent_cache_path,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
    nullptr, "");

static MYSQL_SYSVAR_ULONG(
    persistent_cache_size, rocksdb_persistent_cache_size,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Size of cache for BlockBasedTableOptions::persistent_cache for RocksDB",
    nullptr, nullptr, rocksdb_persistent_cache_size,
    /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(
    delete_obsolete_files_period_micros,
    rocksdb_db_options.delete_obsolete_files_period_micros,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
    nullptr, rocksdb_db_options.delete_obsolete_files_period_micros,
    /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(base_background_compactions,
                        rocksdb_db_options.base_background_compactions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::base_background_compactions for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options.base_background_compactions,
                        /* min */ -1, /* max */ MAX_BACKGROUND_COMPACTIONS, 0);

static MYSQL_SYSVAR_INT(max_background_compactions,
                        rocksdb_db_options.max_background_compactions,
                        PLUGIN_VAR_RQCMDARG,
                        "DBOptions::max_background_compactions for RocksDB",
                        nullptr, rocksdb_set_max_background_compactions,
                        rocksdb_db_options.max_background_compactions,
                        /* min */ 1, /* max */ MAX_BACKGROUND_COMPACTIONS, 0);

static MYSQL_SYSVAR_INT(max_background_flushes,
                        rocksdb_db_options.max_background_flushes,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_background_flushes for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options.max_background_flushes,
                        /* min */ 1, /* max */ MAX_BACKGROUND_FLUSHES, 0);

static MYSQL_SYSVAR_UINT(max_subcompactions,
                         rocksdb_db_options.max_subcompactions,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::max_subcompactions for RocksDB", nullptr,
                         nullptr, rocksdb_db_options.max_subcompactions,
                         /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

static MYSQL_SYSVAR_ULONG(max_log_file_size,
                          rocksdb_db_options.max_log_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_log_file_size for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.max_log_file_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(log_file_time_to_roll,
                          rocksdb_db_options.log_file_time_to_roll,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::log_file_time_to_roll for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.log_file_time_to_roll,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(keep_log_file_num,
                          rocksdb_db_options.keep_log_file_num,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::keep_log_file_num for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.keep_log_file_num,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_manifest_file_size,
                          rocksdb_db_options.max_manifest_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_manifest_file_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.max_manifest_file_size,
                          /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options.table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options.table_cache_numshardbits,
                        /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_ttl_seconds, rocksdb_db_options.WAL_ttl_seconds,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.WAL_ttl_seconds,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_size_limit_mb,
                          rocksdb_db_options.WAL_size_limit_MB,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.WAL_size_limit_MB,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(manifest_preallocation_size,
                          rocksdb_db_options.manifest_preallocation_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::manifest_preallocation_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.manifest_preallocation_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_direct_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_direct_writes);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options.allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options.allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
    rocksdb_db_options.is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options.stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB",
                         nullptr, nullptr,
                         rocksdb_db_options.stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
    rocksdb_db_options.advise_random_on_open);

static MYSQL_SYSVAR_ULONG(db_write_buffer_size,
                          rocksdb_db_options.db_write_buffer_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::db_write_buffer_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.db_write_buffer_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_adaptive_mutex);

static MYSQL_SYSVAR_ULONG(bytes_per_sync, rocksdb_db_options.bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::bytes_per_sync for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.bytes_per_sync,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_bytes_per_sync,
                          rocksdb_db_options.wal_bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.wal_bytes_per_sync,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr,
    rocksdb_db_options.enable_thread_tracking);

static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "block_cache size for RocksDB", nullptr, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LONGLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options.cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
    nullptr, nullptr, true);

// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index blocks' handles checked
// out (i.e. it won't call ShardedLRUCache::Release), and the LRU cache will
// never flush out those handles or the parsed-out objects, hence they are
// pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options.pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);

static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB",
                         nullptr, nullptr,
                         (uint64_t)rocksdb_tbl_options.index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options.hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options.hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options.no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options.no_block_cache);

static MYSQL_SYSVAR_ULONG(block_size, rocksdb_tbl_options.block_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "BlockBasedTableOptions::block_size for RocksDB",
                          nullptr, nullptr, rocksdb_tbl_options.block_size,
                          /* min */ 1L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options.block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options.block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options.block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options.block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options.whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options.whole_key_filtering);

static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");

static MYSQL_SYSVAR_BOOL(background_sync, rocksdb_background_sync,
                         PLUGIN_VAR_RQCMDARG,
                         "turns on background syncs for RocksDB", nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(write_sync, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::sync for RocksDB", nullptr, nullptr,
                         rocksdb::WriteOptions().sync);

static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB", nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests", nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used.",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable",
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case cardinality is zero, override it with some value", nullptr,
    nullptr, TRUE);

static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family",
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread", nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations", nullptr,
                         rocksdb_set_pause_background_work, FALSE);

static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks", nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Regex that describes set of tables that are excluded "
                        "from the case sensitive collation enforcement",
                        nullptr, rocksdb_set_collation_exception_list, "");

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush",
                         nullptr, nullptr, rocksdb_collect_sst_properties);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memtable flush which may block all write requests so be careful",
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);

static MYSQL_THDVAR_BOOL(
    flush_memtable_on_analyze, PLUGIN_VAR_RQCMDARG,
    "Forces memtable flush on ANALYZE table to get accurate cardinality",
    nullptr, nullptr, true);

static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed.",
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number of sequential "
                             "deletes per window",
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes",
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes", nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log", nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "Percentage of rows to be checksummed", nullptr,
                        nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_SYSVAR_UINT(
    validate_tables, rocksdb_validate_tables,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Verify all .frm files match all RocksDB tables (0 means no verification, "
    "1 means verify and fail on error, and 2 means verify but continue)",
    nullptr, nullptr, 1 /* default value */, 0 /* min value */,
    2 /* max value */, 0);

static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./.rocksdb");

static MYSQL_SYSVAR_UINT(
    table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
    PLUGIN_VAR_RQCMDARG,
    "Percentage of entries to sample when collecting statistics about table "
    "properties. Specify either 0 to sample everything or percentage "
    "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".."
    STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. By default "
    STRINGIFY_ARG(RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% of entries are sampled.",
    nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
    RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
    /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);

static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;

static struct st_mysql_sys_var *rocksdb_system_variables[] = {
    MYSQL_SYSVAR(lock_wait_timeout),
    MYSQL_SYSVAR(deadlock_detect),
    MYSQL_SYSVAR(max_row_locks),
    MYSQL_SYSVAR(lock_scanned_rows),
    MYSQL_SYSVAR(bulk_load),
    MYSQL_SYSVAR(trace_sst_api),
    MYSQL_SYSVAR(commit_in_the_middle),
    MYSQL_SYSVAR(read_free_rpl_tables),
    MYSQL_SYSVAR(rpl_skip_tx_api),
    MYSQL_SYSVAR(bulk_load_size),
    MYSQL_SYSVAR(merge_buf_size),
    MYSQL_SYSVAR(enable_bulk_load_api),
    MYSQL_SYSVAR(tmpdir),
    MYSQL_SYSVAR(merge_combine_read_size),
    MYSQL_SYSVAR(skip_bloom_filter_on_read),

    MYSQL_SYSVAR(create_if_missing),
    MYSQL_SYSVAR(create_missing_column_families),
    MYSQL_SYSVAR(error_if_exists),
    MYSQL_SYSVAR(paranoid_checks),
    MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
    MYSQL_SYSVAR(info_log_level),
    MYSQL_SYSVAR(max_open_files),
    MYSQL_SYSVAR(max_total_wal_size),
    MYSQL_SYSVAR(disabledatasync),
    MYSQL_SYSVAR(use_fsync),
    MYSQL_SYSVAR(wal_dir),
    MYSQL_SYSVAR(persistent_cache_path),
    MYSQL_SYSVAR(persistent_cache_size),
    MYSQL_SYSVAR(delete_obsolete_files_period_micros),
    MYSQL_SYSVAR(base_background_compactions),
    MYSQL_SYSVAR(max_background_compactions),
    MYSQL_SYSVAR(max_background_flushes),
    MYSQL_SYSVAR(max_log_file_size),
    MYSQL_SYSVAR(max_subcompactions),
    MYSQL_SYSVAR(log_file_time_to_roll),
    MYSQL_SYSVAR(keep_log_file_num),
    MYSQL_SYSVAR(max_manifest_file_size),
    MYSQL_SYSVAR(table_cache_numshardbits),
    MYSQL_SYSVAR(wal_ttl_seconds),
    MYSQL_SYSVAR(wal_size_limit_mb),
    MYSQL_SYSVAR(manifest_preallocation_size),
    MYSQL_SYSVAR(use_direct_reads),
    MYSQL_SYSVAR(use_direct_writes),
    MYSQL_SYSVAR(allow_mmap_reads),
    MYSQL_SYSVAR(allow_mmap_writes),
    MYSQL_SYSVAR(is_fd_close_on_exec),
    MYSQL_SYSVAR(stats_dump_period_sec),
    MYSQL_SYSVAR(advise_random_on_open),
    MYSQL_SYSVAR(db_write_buffer_size),
    MYSQL_SYSVAR(use_adaptive_mutex),
    MYSQL_SYSVAR(bytes_per_sync),
    MYSQL_SYSVAR(wal_bytes_per_sync),
    MYSQL_SYSVAR(enable_thread_tracking),
    MYSQL_SYSVAR(perf_context_level),
    MYSQL_SYSVAR(wal_recovery_mode),
    MYSQL_SYSVAR(access_hint_on_compaction_start),
    MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
    MYSQL_SYSVAR(compaction_readahead_size),
    MYSQL_SYSVAR(allow_concurrent_memtable_write),
    MYSQL_SYSVAR(enable_write_thread_adaptive_yield),

    MYSQL_SYSVAR(block_cache_size),
    MYSQL_SYSVAR(cache_index_and_filter_blocks),
    MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
    MYSQL_SYSVAR(index_type),
    MYSQL_SYSVAR(hash_index_allow_collision),
    MYSQL_SYSVAR(no_block_cache),
    MYSQL_SYSVAR(block_size),
    MYSQL_SYSVAR(block_size_deviation),
    MYSQL_SYSVAR(block_restart_interval),
    MYSQL_SYSVAR(whole_key_filtering),

    MYSQL_SYSVAR(default_cf_options),
    MYSQL_SYSVAR(override_cf_options),

    MYSQL_SYSVAR(background_sync),

    MYSQL_SYSVAR(write_sync),
    MYSQL_SYSVAR(write_disable_wal),
    MYSQL_SYSVAR(write_ignore_missing_column_families),

    MYSQL_SYSVAR(skip_fill_cache),
    MYSQL_SYSVAR(unsafe_for_binlog),

    MYSQL_SYSVAR(records_in_range),
    MYSQL_SYSVAR(force_index_records_in_range),
1234     MYSQL_SYSVAR(debug_optimizer_n_rows),
1235     MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),
1236 
1237     MYSQL_SYSVAR(compact_cf),
1238     MYSQL_SYSVAR(signal_drop_index_thread),
1239     MYSQL_SYSVAR(pause_background_work),
1240     MYSQL_SYSVAR(enable_2pc),
1241     MYSQL_SYSVAR(strict_collation_check),
1242     MYSQL_SYSVAR(strict_collation_exceptions),
1243     MYSQL_SYSVAR(collect_sst_properties),
1244     MYSQL_SYSVAR(force_flush_memtable_now),
1245     MYSQL_SYSVAR(flush_memtable_on_analyze),
1246     MYSQL_SYSVAR(seconds_between_stat_computes),
1247 
1248     MYSQL_SYSVAR(compaction_sequential_deletes),
1249     MYSQL_SYSVAR(compaction_sequential_deletes_window),
1250     MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
1251     MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
1252     MYSQL_SYSVAR(print_snapshot_conflict_queries),
1253 
1254     MYSQL_SYSVAR(datadir),
1255     MYSQL_SYSVAR(create_checkpoint),
1256 
1257     MYSQL_SYSVAR(checksums_pct),
1258     MYSQL_SYSVAR(store_row_debug_checksums),
1259     MYSQL_SYSVAR(verify_row_debug_checksums),
1260 
1261     MYSQL_SYSVAR(validate_tables),
1262     MYSQL_SYSVAR(table_stats_sampling_pct),
1263     nullptr};
1264 
1265 static rocksdb::WriteOptions
1266 rdb_get_rocksdb_write_options(my_core::THD *const thd) {
1267   rocksdb::WriteOptions opt;
1268 
1269   opt.sync = THDVAR(thd, write_sync);
1270   opt.disableWAL = THDVAR(thd, write_disable_wal);
1271   opt.ignore_missing_column_families =
1272       THDVAR(thd, write_ignore_missing_column_families);
1273 
1274   return opt;
1275 }
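/*
  Illustrative sketch (not part of the engine, names assumed): the per-session
  options built above are what a caller hands to a RocksDB write, e.g.

    rocksdb::WriteOptions opts = rdb_get_rocksdb_write_options(thd);
    // opts.sync, opts.disableWAL and opts.ignore_missing_column_families now
    // reflect the session's write_sync / write_disable_wal /
    // write_ignore_missing_column_families settings.
*/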
1276 
1277 ///////////////////////////////////////////////////////////////////////////////////////////
1278 
1279 /**
1280   @brief
1281   Function we use in the creation of our hash to get key.
1282 */
1283 
1284 uchar *Rdb_open_tables_map::get_hash_key(Rdb_table_handler *const table_handler,
1285                                          size_t *const length, my_bool not_used
1286                                          __attribute__((__unused__))) {
1287   *length = table_handler->m_table_name_length;
1288   return reinterpret_cast<uchar *>(table_handler->m_table_name);
1289 }
1290 
1291 /*
1292   The following is needed as an argument for mysql_stage_register,
1293   irrespective of whether we're compiling with P_S or not.
1294 */
1295 PSI_stage_info stage_waiting_on_row_lock = {0, "Waiting for row lock", 0};
1296 
1297 #ifdef HAVE_PSI_INTERFACE
1298 static PSI_thread_key rdb_background_psi_thread_key;
1299 static PSI_thread_key rdb_drop_idx_psi_thread_key;
1300 
1301 static PSI_stage_info *all_rocksdb_stages[] = {&stage_waiting_on_row_lock};
1302 
1303 static my_core::PSI_mutex_key rdb_psi_open_tbls_mutex_key,
1304     rdb_signal_bg_psi_mutex_key, rdb_signal_drop_idx_psi_mutex_key,
1305     rdb_collation_data_mutex_key, rdb_mem_cmp_space_mutex_key,
1306     key_mutex_tx_list, rdb_sysvars_psi_mutex_key;
1307 
1308 static PSI_mutex_info all_rocksdb_mutexes[] = {
1309     {&rdb_psi_open_tbls_mutex_key, "open tables", PSI_FLAG_GLOBAL},
1310     {&rdb_signal_bg_psi_mutex_key, "stop background", PSI_FLAG_GLOBAL},
1311     {&rdb_signal_drop_idx_psi_mutex_key, "signal drop index", PSI_FLAG_GLOBAL},
1312     {&rdb_collation_data_mutex_key, "collation data init", PSI_FLAG_GLOBAL},
1313     {&rdb_mem_cmp_space_mutex_key, "collation space char data init",
1314      PSI_FLAG_GLOBAL},
1315     {&key_mutex_tx_list, "tx_list", PSI_FLAG_GLOBAL},
1316     {&rdb_sysvars_psi_mutex_key, "setting sysvar", PSI_FLAG_GLOBAL},
1317 };
1318 
1319 static PSI_rwlock_key key_rwlock_collation_exception_list;
1320 static PSI_rwlock_key key_rwlock_read_free_rpl_tables;
1321 
1322 static PSI_rwlock_info all_rocksdb_rwlocks[] = {
1323     {&key_rwlock_collation_exception_list, "collation_exception_list",
1324      PSI_FLAG_GLOBAL},
1325     {&key_rwlock_read_free_rpl_tables, "read_free_rpl_tables", PSI_FLAG_GLOBAL}
1326 };
1327 
1328 PSI_cond_key rdb_signal_bg_psi_cond_key, rdb_signal_drop_idx_psi_cond_key;
1329 
1330 static PSI_cond_info all_rocksdb_conds[] = {
1331     {&rdb_signal_bg_psi_cond_key, "cond signal background", PSI_FLAG_GLOBAL},
1332     {&rdb_signal_drop_idx_psi_cond_key, "cond signal drop index",
1333      PSI_FLAG_GLOBAL},
1334 };
1335 
1336 static PSI_thread_info all_rocksdb_threads[] = {
1337     {&rdb_background_psi_thread_key, "background", PSI_FLAG_GLOBAL},
1338     {&rdb_drop_idx_psi_thread_key, "drop index", PSI_FLAG_GLOBAL},
1339 };
1340 
1341 static void init_rocksdb_psi_keys() {
1342   const char *const category = "rocksdb";
1343   int count;
1344 
1345   if (PSI_server == nullptr)
1346     return;
1347 
1348   count = array_elements(all_rocksdb_mutexes);
1349   PSI_server->register_mutex(category, all_rocksdb_mutexes, count);
1350 
1351   count = array_elements(all_rocksdb_rwlocks);
1352   PSI_server->register_rwlock(category, all_rocksdb_rwlocks, count);
1353 
1354   count = array_elements(all_rocksdb_conds);
1355   // TODO: PFS registration for conditions is disabled due to a bug:
1356   // https://github.com/MySQLOnRocksDB/mysql-5.6/issues/92
1357   // PSI_server->register_cond(category, all_rocksdb_conds, count);
1358 
1359   count = array_elements(all_rocksdb_stages);
1360   mysql_stage_register(category, all_rocksdb_stages, count);
1361 
1362   count = array_elements(all_rocksdb_threads);
1363   mysql_thread_register(category, all_rocksdb_threads, count);
1364 }
1365 #endif
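/*
  A registered PSI key is later passed to the corresponding mysql_*_init()
  call so the object is visible in performance_schema. For example,
  key_mutex_tx_list registered above is used by Rdb_transaction::init_mutex()
  further down in this file:

    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
*/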
1366 
1367 /*
1368   Drop index thread's control
1369 */
1370 
1371 static Rdb_drop_index_thread rdb_drop_idx_thread;
1372 
1373 static void rocksdb_drop_index_wakeup_thread(
1374     my_core::THD *const thd __attribute__((__unused__)),
1375     struct st_mysql_sys_var *const var __attribute__((__unused__)),
1376     void *const var_ptr __attribute__((__unused__)), const void *const save) {
1377   if (*static_cast<const bool *>(save)) {
1378     rdb_drop_idx_thread.signal();
1379   }
1380 }
1381 
1382 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
1383   DBUG_ASSERT(thd != nullptr);
1384 
1385   const int session_perf_context_level = THDVAR(thd, perf_context_level);
1386   if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
1387     return session_perf_context_level;
1388   }
1389 
1390   /*
1391     Fallback to global thdvar, if session specific one was not set to a valid
1392     value.
1393   */
1394 
1395   const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
1396   if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
1397     return global_perf_context_level;
1398   }
1399 
1400   return rocksdb::PerfLevel::kDisable;
1401 }
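/*
  Illustrative sketch: MYSQL_THDVAR variables are exposed with the plugin
  prefix, so a session would typically tune this as (variable name assumed):

    SET SESSION rocksdb_perf_context_level = 2;

  A session value at or below kUninitialized falls back to the global value,
  and ultimately to kDisable, exactly as implemented above.
*/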
1402 
1403 /*
1404   Very short (functor-like) interface to be passed to
1405   Rdb_transaction::walk_tx_list()
1406 */
1407 
1408 interface Rdb_tx_list_walker {
1409   virtual ~Rdb_tx_list_walker() {}
1410   virtual void process_tran(const Rdb_transaction *const) = 0;
1411 };
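/*
  A minimal walker sketch (illustrative only; Rdb_snapshot_status further down
  in this file is a concrete walker):

    class Rdb_trx_counter : public Rdb_tx_list_walker {  // hypothetical
    public:
      uint m_count = 0;
      void process_tran(const Rdb_transaction *const) override { m_count++; }
    };

  Passing an instance to Rdb_transaction::walk_tx_list() invokes
  process_tran() once per live transaction while s_tx_list_mutex is held.
*/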
1412 
1413 /*
1414   This is a helper class that is passed to RocksDB to get notifications when
1415   a snapshot gets created.
1416 */
1417 
1418 class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
1419   Rdb_transaction *m_owning_tx;
1420 
1421   void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;
1422 
1423 public:
1424   Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
1425   Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;
1426 
1427   explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
1428       : m_owning_tx(owning_tx) {}
1429 
1430   // If the owning Rdb_transaction gets destroyed, we must stop referencing
1431   // it.
1432   void detach() { m_owning_tx = nullptr; }
1433 };
1434 
1435 /* This is the base class for transactions when interacting with rocksdb.
1436 */
1437 class Rdb_transaction {
1438 protected:
1439   ulonglong m_write_count = 0;
1440   ulonglong m_lock_count = 0;
1441 
1442   bool m_is_delayed_snapshot = false;
1443   bool m_is_two_phase = false;
1444 
1445   THD *m_thd = nullptr;
1446 
1447   rocksdb::ReadOptions m_read_opts;
1448 
1449   static std::multiset<Rdb_transaction *> s_tx_list;
1450   static mysql_mutex_t s_tx_list_mutex;
1451 
1452   Rdb_io_perf *m_tbl_io_perf;
1453 
1454   bool m_tx_read_only = false;
1455 
1456   int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */
1457 
1458   /* Maximum number of locks the transaction can have */
1459   ulonglong m_max_row_locks;
1460 
1461   bool m_is_tx_failed = false;
1462   bool m_rollback_only = false;
1463 
1464   std::shared_ptr<Rdb_snapshot_notifier> m_notifier;
1465 
1466   // This should be used only when updating binlog information.
1467   virtual bool commit_no_binlog() = 0;
1468   virtual rocksdb::Iterator *
1469   get_iterator(const rocksdb::ReadOptions &options,
1470                rocksdb::ColumnFamilyHandle *column_family) = 0;
1471 
1472 public:
1473   int64_t m_snapshot_timestamp = 0;
1474   bool m_ddl_transaction;
1475 
1476   /*
1477     for distinction between rdb_transaction_impl and rdb_writebatch_impl
1478     when using walk tx list
1479   */
1480   virtual bool is_writebatch_trx() const = 0;
1481 
1482   static void init_mutex() {
1483     mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
1484   }
1485 
1486   static void term_mutex() {
1487     DBUG_ASSERT(s_tx_list.size() == 0);
1488     mysql_mutex_destroy(&s_tx_list_mutex);
1489   }
1490 
1491   static void walk_tx_list(Rdb_tx_list_walker *walker) {
1492     DBUG_ASSERT(walker != nullptr);
1493 
1494     mysql_mutex_lock(&s_tx_list_mutex);
1495     for (auto it : s_tx_list)
1496       walker->process_tran(it);
1497     mysql_mutex_unlock(&s_tx_list_mutex);
1498   }
1499 
1500   int set_status_error(THD *const thd, const rocksdb::Status &s,
1501                        const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def) {
1502     DBUG_ASSERT(!s.ok());
1503     DBUG_ASSERT(tbl_def != nullptr);
1504 
1505     if (s.IsTimedOut()) {
1506       /*
1507         SQL layer has weird expectations. If we return an error when
1508         doing a read in DELETE IGNORE, it will ignore the error (because it's
1509         an IGNORE command!) but then will fail an assert, because "error code
1510         was returned, but no error happened".  Do what InnoDB's
1511         convert_error_code_to_mysql() does: force a statement
1512         rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
1513       */
1514       my_core::thd_mark_transaction_to_rollback(thd, false /*just statement*/);
1515 
1516       return HA_ERR_LOCK_WAIT_TIMEOUT;
1517     }
1518 
1519     if (s.IsDeadlock()) {
1520       my_core::thd_mark_transaction_to_rollback(thd,
1521                                                 false /* just statement */);
1522       return HA_ERR_LOCK_DEADLOCK;
1523     } else if (s.IsBusy()) {
1524       rocksdb_snapshot_conflict_errors++;
1525       if (rocksdb_print_snapshot_conflict_queries) {
1526         char user_host_buff[MAX_USER_HOST_SIZE + 1];
1527         make_user_name(thd, user_host_buff);
1528         // NO_LINT_DEBUG
1529         sql_print_warning("Got snapshot conflict errors: User: %s "
1530                           "Query: %s",
1531                           user_host_buff, thd->query());
1532       }
1533       return HA_ERR_LOCK_DEADLOCK;
1534     }
1535 
1536     if (s.IsLockLimit()) {
1537       return HA_ERR_ROCKSDB_TOO_MANY_LOCKS;
1538     }
1539 
1540     if (s.IsIOError() || s.IsCorruption()) {
1541       rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
1542     }
1543     my_error(ER_INTERNAL_ERROR, MYF(0), s.ToString().c_str());
1544     return HA_ERR_INTERNAL_ERROR;
1545   }
1546 
1547   THD *get_thd() const { return m_thd; }
1548 
1549   /* Used for tracking io_perf counters */
1550   void io_perf_start(Rdb_io_perf *const io_perf) {
1551     /*
1552       Since perf_context is tracked per thread, it is difficult and expensive
1553       to maintain perf_context on a per table basis. Therefore, roll all
1554       perf_context data into the first table used in a query. This works well
1555       for single table queries and is probably good enough for queries that hit
1556       multiple tables.
1557 
1558       perf_context stats gathering is started when the table lock is acquired
1559       or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
1560       are recorded when the table lock is released, or when commit/rollback
1561       is called on the transaction, whichever comes first. Table lock release
1562       and commit/rollback can happen in different orders. In the case where
1563       the lock is released before commit/rollback is called, an extra step to
1564       gather stats during commit/rollback is needed.
1565     */
1566     if (m_tbl_io_perf == nullptr &&
1567         io_perf->start(rocksdb_perf_context_level(m_thd))) {
1568       m_tbl_io_perf = io_perf;
1569     }
1570   }
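  /*
    In practice these counters are driven via the RAII helper defined later in
    this file (see Rdb_perf_context_guard and rocksdb_commit()):

      Rdb_perf_context_guard guard(thd);  // io_perf_start() on construction,
                                          // io_perf_end_and_record() at exit
  */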
1571 
1572   void io_perf_end_and_record(void) {
1573     if (m_tbl_io_perf != nullptr) {
1574       m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
1575       m_tbl_io_perf = nullptr;
1576     }
1577   }
1578 
1579   void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
1580     if (m_tbl_io_perf == io_perf) {
1581       io_perf_end_and_record();
1582     }
1583   }
1584 
1585   void set_params(int timeout_sec_arg, int max_row_locks_arg) {
1586     m_timeout_sec = timeout_sec_arg;
1587     m_max_row_locks = max_row_locks_arg;
1588     set_lock_timeout(timeout_sec_arg);
1589   }
1590 
1591   virtual void set_lock_timeout(int timeout_sec_arg) = 0;
1592 
1593   ulonglong get_write_count() const { return m_write_count; }
1594 
1595   int get_timeout_sec() const { return m_timeout_sec; }
1596 
1597   ulonglong get_lock_count() const { return m_lock_count; }
1598 
1599   virtual void set_sync(bool sync) = 0;
1600 
1601   virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
1602                             const std::string &rowkey) = 0;
1603 
1604   virtual bool prepare(const rocksdb::TransactionName &name) = 0;
1605 
1606   bool commit_or_rollback() {
1607     bool res;
1608     if (m_is_tx_failed) {
1609       rollback();
1610       res = false;
1611     } else
1612       res = commit();
1613     return res;
1614   }
1615 
1616   bool commit() {
1617     if (get_write_count() == 0) {
1618       rollback();
1619       return false;
1620     } else if (m_rollback_only) {
1621       /*
1622         Transactions marked as rollback_only are expected to be rolled back at
1623         prepare(). However, there are cases where prepare() is never called
1624         and commit() is called instead:
1625          1. Binlog is disabled
1626          2. No modification exists in binlog cache for the transaction (#195)
1627         In both cases, rolling back the transaction is safe. Nothing is
1628         written to the binlog.
1629        */
1630       my_printf_error(ER_UNKNOWN_ERROR, ERRSTR_ROLLBACK_ONLY, MYF(0));
1631       rollback();
1632       return true;
1633     } else {
1634       return commit_no_binlog();
1635     }
1636   }
1637 
1638   virtual void rollback() = 0;
1639 
1640   void snapshot_created(const rocksdb::Snapshot *const snapshot) {
1641     DBUG_ASSERT(snapshot != nullptr);
1642 
1643     m_read_opts.snapshot = snapshot;
1644     rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
1645     m_is_delayed_snapshot = false;
1646   }
1647 
1648   virtual void acquire_snapshot(bool acquire_now) = 0;
1649   virtual void release_snapshot() = 0;
1650 
1651   bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
1652 
1653 private:
1654   // The tables we are currently loading.  In a partitioned table this can
1655   // have more than one entry
1656   std::vector<ha_rocksdb *> m_curr_bulk_load;
1657 
1658 public:
1659   int finish_bulk_load() {
1660     int rc = 0;
1661 
1662     std::vector<ha_rocksdb *>::iterator it;
1663     while ((it = m_curr_bulk_load.begin()) != m_curr_bulk_load.end()) {
1664       int rc2 = (*it)->finalize_bulk_load();
1665       if (rc2 != 0 && rc == 0) {
1666         rc = rc2;
1667       }
1668     }
1669 
1670     DBUG_ASSERT(m_curr_bulk_load.size() == 0);
1671 
1672     return rc;
1673   }
1674 
1675   void start_bulk_load(ha_rocksdb *const bulk_load) {
1676     /*
1677      If we already have an open bulk load of a table and the name doesn't
1678      match the current one, close out the currently running one.  This allows
1679      multiple bulk loads to occur on a partitioned table, but then closes
1680      them all out when we switch to another table.
1681     */
1682     DBUG_ASSERT(bulk_load != nullptr);
1683 
1684     if (!m_curr_bulk_load.empty() &&
1685         !bulk_load->same_table(*m_curr_bulk_load[0])) {
1686       const auto res = finish_bulk_load();
1687       SHIP_ASSERT(res == 0);
1688     }
1689 
1690     m_curr_bulk_load.push_back(bulk_load);
1691   }
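  /*
    Illustrative sketch of how this path is reached from SQL (variable name
    assumed, using the usual "rocksdb_" prefix for the bulk_load THDVAR
    registered above):

      SET SESSION rocksdb_bulk_load = 1;
      -- bulk inserts / LOAD DATA, possibly touching several partitions
      SET SESSION rocksdb_bulk_load = 0;  -- finalizes the outstanding SST files
  */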
1692 
1693   void end_bulk_load(ha_rocksdb *const bulk_load) {
1694     for (auto it = m_curr_bulk_load.begin(); it != m_curr_bulk_load.end();
1695          it++) {
1696       if (*it == bulk_load) {
1697         m_curr_bulk_load.erase(it);
1698         return;
1699       }
1700     }
1701 
1702     // Should not reach here
1703     SHIP_ASSERT(0);
1704   }
1705 
1706   int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
1707 
1708   /*
1709     Flush the data accumulated so far. This assumes we're doing a bulk insert.
1710 
1711     @detail
1712       This should work like transaction commit, except that we don't
1713       synchronize with the binlog (there is no API that would let the binlog
1714       flush the changes accumulated so far and return its current
1715       position)
1716 
1717     @todo
1718       Add test coverage for what happens when somebody attempts to do bulk
1719       inserts while inside a multi-statement transaction.
1720   */
1721   bool flush_batch() {
1722     if (get_write_count() == 0)
1723       return false;
1724 
1725     /* Commit the current transaction */
1726     if (commit_no_binlog())
1727       return true;
1728 
1729     /* Start another one */
1730     start_tx();
1731     return false;
1732   }
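  /*
    Sketch of the intended call pattern (assumption: driven by the bulk-load /
    commit-in-the-middle code paths; the counter name is hypothetical):

      if (++rows_in_batch >= THDVAR(thd, bulk_load_size)) {
        tx->flush_batch();  // commit without binlog sync, then start a new tx
        rows_in_batch = 0;
      }
  */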
1733 
1734   virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
1735                               const rocksdb::Slice &key,
1736                               const rocksdb::Slice &value) = 0;
1737   virtual rocksdb::Status
1738   delete_key(rocksdb::ColumnFamilyHandle *const column_family,
1739              const rocksdb::Slice &key) = 0;
1740   virtual rocksdb::Status
1741   single_delete(rocksdb::ColumnFamilyHandle *const column_family,
1742                 const rocksdb::Slice &key) = 0;
1743 
1744   virtual bool has_modifications() const = 0;
1745 
1746   virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
1747   /*
1748     Return a WriteBatch that one can write to. The writes will skip any
1749     transaction locking. The writes will NOT be visible to the transaction.
1750   */
1751   rocksdb::WriteBatchBase *get_blind_write_batch() {
1752     return get_indexed_write_batch()->GetWriteBatch();
1753   }
1754 
1755   virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
1756                               const rocksdb::Slice &key,
1757                               std::string *value) const = 0;
1758   virtual rocksdb::Status
1759   get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
1760                  const rocksdb::Slice &key, std::string *const value,
1761                  bool exclusive) = 0;
1762 
1763   rocksdb::Iterator *
1764   get_iterator(rocksdb::ColumnFamilyHandle *const column_family,
1765                bool skip_bloom_filter, bool fill_cache,
1766                bool read_current = false, bool create_snapshot = true) {
1767     // Make sure we are not doing both read_current (which implies we don't
1768     // want a snapshot) and create_snapshot (which forces a snapshot to be
1769     // created).
1770     DBUG_ASSERT(column_family != nullptr);
1771     DBUG_ASSERT(!read_current || !create_snapshot);
1772 
1773     if (create_snapshot)
1774       acquire_snapshot(true);
1775 
1776     rocksdb::ReadOptions options = m_read_opts;
1777 
1778     if (skip_bloom_filter) {
1779       options.total_order_seek = true;
1780     } else {
1781       // With this option, Iterator::Valid() returns false if key
1782       // is outside of the prefix bloom filter range set at Seek().
1783       // Must not be set to true if not using bloom filter.
1784       options.prefix_same_as_start = true;
1785     }
1786     options.fill_cache = fill_cache;
1787     if (read_current) {
1788       options.snapshot = nullptr;
1789     }
1790     return get_iterator(options, column_family);
1791   }
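  /*
    Illustrative call (handle names hypothetical): a scan that cannot use the
    prefix bloom filter would request

      rocksdb::Iterator *it =
          tx->get_iterator(cf_handle, true /* skip_bloom_filter */,
                           true /* fill_cache */);

    which sets total_order_seek and acquires a snapshot before iterating.
  */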
1792 
1793   virtual bool is_tx_started() const = 0;
1794   virtual void start_tx() = 0;
1795   virtual void start_stmt() = 0;
1796   virtual void rollback_stmt() = 0;
1797 
1798   void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
1799 
1800   bool can_prepare() const {
1801     if (m_rollback_only) {
1802       my_printf_error(ER_UNKNOWN_ERROR, ERRSTR_ROLLBACK_ONLY, MYF(0));
1803       return false;
1804     }
1805     return true;
1806   }
1807 
1808   int rollback_to_savepoint(void *const savepoint) {
1809     if (has_modifications()) {
1810       my_printf_error(ER_UNKNOWN_ERROR,
1811                       "MyRocks currently does not support ROLLBACK TO "
1812                       "SAVEPOINT if modifying rows.",
1813                       MYF(0));
1814       m_rollback_only = true;
1815       return HA_EXIT_FAILURE;
1816     }
1817     return HA_EXIT_SUCCESS;
1818   }
1819 
1820   /*
1821     This is used by transactions started with "START TRANSACTION WITH
1822     CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
1823     snapshot has to be created via DB::GetSnapshot(), not via Transaction
1824     API.
1825   */
1826   bool is_tx_read_only() const { return m_tx_read_only; }
1827 
1828   bool is_two_phase() const { return m_is_two_phase; }
1829 
1830   void set_tx_read_only(bool val) { m_tx_read_only = val; }
1831 
1832   explicit Rdb_transaction(THD *const thd)
1833       : m_thd(thd), m_tbl_io_perf(nullptr) {
1834     mysql_mutex_lock(&s_tx_list_mutex);
1835     s_tx_list.insert(this);
1836     mysql_mutex_unlock(&s_tx_list_mutex);
1837   }
1838 
1839   virtual ~Rdb_transaction() {
1840     mysql_mutex_lock(&s_tx_list_mutex);
1841     s_tx_list.erase(this);
1842     mysql_mutex_unlock(&s_tx_list_mutex);
1843   }
1844 };
1845 
1846 /*
1847   This is a rocksdb transaction. Its members represent the current transaction,
1848   which consists of:
1849   - the snapshot
1850   - the changes we've made, which are not committed yet.
1851 
1852   Changes made through the individual tables are accumulated here and are
1853   committed together when this object commits.
1854 */
1855 class Rdb_transaction_impl : public Rdb_transaction {
1856   rocksdb::Transaction *m_rocksdb_tx = nullptr;
1857   rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;
1858 
1859 public:
1860   void set_lock_timeout(int timeout_sec_arg) override {
1861     if (m_rocksdb_tx)
1862       m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
1863   }
1864 
1865   void set_sync(bool sync) override {
1866     m_rocksdb_tx->GetWriteOptions()->sync = sync;
1867   }
1868 
1869   void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
1870                     const std::string &rowkey) override {
1871     if (!THDVAR(m_thd, lock_scanned_rows)) {
1872       m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
1873     }
1874   }
1875 
1876   virtual bool is_writebatch_trx() const override { return false; }
1877 
1878 private:
1879   void release_tx(void) {
1880     // We are done with the current active transaction object.  Preserve it
1881     // for later reuse.
1882     DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
1883     m_rocksdb_reuse_tx = m_rocksdb_tx;
1884     m_rocksdb_tx = nullptr;
1885   }
1886 
1887   bool prepare(const rocksdb::TransactionName &name) override {
1888     rocksdb::Status s;
1889     s = m_rocksdb_tx->SetName(name);
1890     if (!s.ok()) {
1891       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
1892       return false;
1893     }
1894 
1895     s = m_rocksdb_tx->Prepare();
1896     if (!s.ok()) {
1897       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
1898       return false;
1899     }
1900     return true;
1901   }
1902 
1903   bool commit_no_binlog() override {
1904     bool res = false;
1905     release_snapshot();
1906     const rocksdb::Status s = m_rocksdb_tx->Commit();
1907     if (!s.ok()) {
1908       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
1909       res = true;
1910     }
1911 
1912     /* Save the transaction object to be reused */
1913     release_tx();
1914 
1915     m_write_count = 0;
1916     m_lock_count = 0;
1917     set_tx_read_only(false);
1918     m_rollback_only = false;
1919     return res;
1920   }
1921 
1922 public:
1923   void rollback() override {
1924     m_write_count = 0;
1925     m_lock_count = 0;
1926     m_ddl_transaction = false;
1927     if (m_rocksdb_tx) {
1928       release_snapshot();
1929       /* This will also release all of the locks: */
1930       m_rocksdb_tx->Rollback();
1931 
1932       /* Save the transaction object to be reused */
1933       release_tx();
1934 
1935       set_tx_read_only(false);
1936       m_rollback_only = false;
1937     }
1938   }
1939 
1940   void acquire_snapshot(bool acquire_now) override {
1941     if (m_read_opts.snapshot == nullptr) {
1942       if (is_tx_read_only()) {
1943         snapshot_created(rdb->GetSnapshot());
1944       } else if (acquire_now) {
1945         m_rocksdb_tx->SetSnapshot();
1946         snapshot_created(m_rocksdb_tx->GetSnapshot());
1947       } else if (!m_is_delayed_snapshot) {
1948         m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
1949         m_is_delayed_snapshot = true;
1950       }
1951     }
1952   }
1953 
1954   void release_snapshot() override {
1955     bool need_clear = m_is_delayed_snapshot;
1956 
1957     if (m_read_opts.snapshot != nullptr) {
1958       m_snapshot_timestamp = 0;
1959       if (is_tx_read_only()) {
1960         rdb->ReleaseSnapshot(m_read_opts.snapshot);
1961         need_clear = false;
1962       } else {
1963         need_clear = true;
1964       }
1965       m_read_opts.snapshot = nullptr;
1966     }
1967 
1968     if (need_clear && m_rocksdb_tx != nullptr)
1969       m_rocksdb_tx->ClearSnapshot();
1970   }
1971 
1972   bool has_snapshot() { return m_read_opts.snapshot != nullptr; }
1973 
1974   rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
1975                       const rocksdb::Slice &key,
1976                       const rocksdb::Slice &value) override {
1977     ++m_write_count;
1978     ++m_lock_count;
1979     if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
1980       return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
1981     return m_rocksdb_tx->Put(column_family, key, value);
1982   }
1983 
1984   rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
1985                              const rocksdb::Slice &key) override {
1986     ++m_write_count;
1987     ++m_lock_count;
1988     if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
1989       return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
1990     return m_rocksdb_tx->Delete(column_family, key);
1991   }
1992 
1993   rocksdb::Status
1994   single_delete(rocksdb::ColumnFamilyHandle *const column_family,
1995                 const rocksdb::Slice &key) override {
1996     ++m_write_count;
1997     ++m_lock_count;
1998     if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
1999       return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
2000     return m_rocksdb_tx->SingleDelete(column_family, key);
2001   }
2002 
2003   bool has_modifications() const override {
2004     return m_rocksdb_tx->GetWriteBatch() &&
2005            m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
2006            m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
2007   }
2008 
2009   /*
2010     Return a WriteBatch that one can write to. The writes will skip any
2011     transaction locking. The writes WILL be visible to the transaction.
2012   */
2013   rocksdb::WriteBatchBase *get_indexed_write_batch() override {
2014     ++m_write_count;
2015     return m_rocksdb_tx->GetWriteBatch();
2016   }
2017 
2018   rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
2019                       const rocksdb::Slice &key,
2020                       std::string *value) const override {
2021     return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
2022   }
2023 
2024   rocksdb::Status
2025   get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
2026                  const rocksdb::Slice &key, std::string *const value,
2027                  bool exclusive) override {
2028     if (++m_lock_count > m_max_row_locks)
2029       return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
2030 
2031     return m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
2032                                       exclusive);
2033   }
2034 
2035   rocksdb::Iterator *
2036   get_iterator(const rocksdb::ReadOptions &options,
2037                rocksdb::ColumnFamilyHandle *const column_family) override {
2038     return m_rocksdb_tx->GetIterator(options, column_family);
2039   }
2040 
2041   const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
2042 
2043   bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }
2044 
2045   void start_tx() override {
2046     rocksdb::TransactionOptions tx_opts;
2047     rocksdb::WriteOptions write_opts;
2048     tx_opts.set_snapshot = false;
2049     tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
2050     tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
2051 
2052     write_opts.sync = THDVAR(m_thd, write_sync);
2053     write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
2054     write_opts.ignore_missing_column_families =
2055         THDVAR(m_thd, write_ignore_missing_column_families);
2056     m_is_two_phase = rocksdb_enable_2pc;
2057 
2058     /*
2059       If m_rocksdb_reuse_tx is null this will create a new transaction object.
2060       Otherwise it will reuse the existing one.
2061     */
2062     m_rocksdb_tx =
2063         rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
2064     m_rocksdb_reuse_tx = nullptr;
2065 
2066     m_read_opts = rocksdb::ReadOptions();
2067 
2068     m_ddl_transaction = false;
2069   }
2070 
2071   /*
2072     Start a statement inside a multi-statement transaction.
2073 
2074     @todo: are we sure this is called once (and not several times) per
2075     statement start?
2076 
2077     For hooking to start of statement that is its own transaction, see
2078     ha_rocksdb::external_lock().
2079   */
2080   void start_stmt() override {
2081     // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
2082     acquire_snapshot(false);
2083     m_rocksdb_tx->SetSavePoint();
2084   }
2085 
2086   /*
2087     This must be called when last statement is rolled back, but the transaction
2088     continues
2089   */
2090   void rollback_stmt() override {
2091     /* TODO: here we must release the locks taken since the start_stmt() call */
2092     if (m_rocksdb_tx) {
2093       const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
2094       m_rocksdb_tx->RollbackToSavePoint();
2095 
2096       const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
2097       if (org_snapshot != cur_snapshot) {
2098         if (org_snapshot != nullptr)
2099           m_snapshot_timestamp = 0;
2100 
2101         m_read_opts.snapshot = cur_snapshot;
2102         if (cur_snapshot != nullptr)
2103           rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
2104         else
2105           m_is_delayed_snapshot = true;
2106       }
2107     }
2108   }
2109 
2110   explicit Rdb_transaction_impl(THD *const thd)
2111       : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
2112     // Create a notifier that can be called when a snapshot gets generated.
2113     m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
2114   }
2115 
2116   virtual ~Rdb_transaction_impl() {
2117     rollback();
2118 
2119     // Theoretically the notifier could outlive the Rdb_transaction_impl
2120     // (because of the shared_ptr), so let it know it can't reference
2121     // the transaction anymore.
2122     m_notifier->detach();
2123 
2124     // Free any transaction memory that is still hanging around.
2125     delete m_rocksdb_reuse_tx;
2126     DBUG_ASSERT(m_rocksdb_tx == nullptr);
2127   }
2128 };
2129 
2130 /* This is a rocksdb write batch. This class doesn't hold or wait on any
2131    transaction locks (skips rocksdb transaction API) thus giving better
2132    performance. The commit is done through rdb->GetBaseDB()->Commit().
2133 
2134    Currently this is only used for replication threads which are guaranteed
2135    to be non-conflicting. Any further use of this class should be thought
2136    through carefully.
2137 */
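/*
  Sketch of when this class is selected (see get_or_create_tx() below): a
  replication worker thread gets a write-batch "transaction" instead of a
  pessimistic one when the corresponding option is enabled, e.g. (variable
  name assumed, using the usual "rocksdb_" prefix for rpl_skip_tx_api):

    SET GLOBAL rocksdb_rpl_skip_tx_api = ON;
*/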
2138 class Rdb_writebatch_impl : public Rdb_transaction {
2139   rocksdb::WriteBatchWithIndex *m_batch;
2140   rocksdb::WriteOptions write_opts;
2141   // Called after commit/rollback.
2142   void reset() {
2143     m_batch->Clear();
2144     m_read_opts = rocksdb::ReadOptions();
2145     m_ddl_transaction = false;
2146   }
2147 
2148 private:
2149   bool prepare(const rocksdb::TransactionName &name) override { return true; }
2150 
2151   bool commit_no_binlog() override {
2152     bool res = false;
2153     release_snapshot();
2154     const rocksdb::Status s =
2155         rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch());
2156     if (!s.ok()) {
2157       rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
2158       res = true;
2159     }
2160     reset();
2161 
2162     m_write_count = 0;
2163     set_tx_read_only(false);
2164     m_rollback_only = false;
2165     return res;
2166   }
2167 
2168 public:
2169   bool is_writebatch_trx() const override { return true; }
2170 
2171   void set_lock_timeout(int timeout_sec_arg) override {
2172     // Nothing to do here.
2173   }
2174 
2175   void set_sync(bool sync) override { write_opts.sync = sync; }
2176 
2177   void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
2178                     const std::string &rowkey) override {
2179     // Nothing to do here since we don't hold any row locks.
2180   }
2181 
2182   void rollback() override {
2183     m_write_count = 0;
2184     m_lock_count = 0;
2185     release_snapshot();
2186 
2187     reset();
2188     set_tx_read_only(false);
2189     m_rollback_only = false;
2190   }
2191 
2192   void acquire_snapshot(bool acquire_now) override {
2193     if (m_read_opts.snapshot == nullptr)
2194       snapshot_created(rdb->GetSnapshot());
2195   }
2196 
2197   void release_snapshot() override {
2198     if (m_read_opts.snapshot != nullptr) {
2199       rdb->ReleaseSnapshot(m_read_opts.snapshot);
2200       m_read_opts.snapshot = nullptr;
2201     }
2202   }
2203 
2204   rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
2205                       const rocksdb::Slice &key,
2206                       const rocksdb::Slice &value) override {
2207     ++m_write_count;
2208     m_batch->Put(column_family, key, value);
2209     // Note Put/Delete in write batch doesn't return any error code. We simply
2210     // return OK here.
2211     return rocksdb::Status::OK();
2212   }
2213 
2214   rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
2215                              const rocksdb::Slice &key) override {
2216     ++m_write_count;
2217     m_batch->Delete(column_family, key);
2218     return rocksdb::Status::OK();
2219   }
2220 
2221   rocksdb::Status
2222   single_delete(rocksdb::ColumnFamilyHandle *const column_family,
2223                 const rocksdb::Slice &key) override {
2224     ++m_write_count;
2225     m_batch->SingleDelete(column_family, key);
2226     return rocksdb::Status::OK();
2227   }
2228 
2229   bool has_modifications() const override {
2230     return m_batch->GetWriteBatch()->Count() > 0;
2231   }
2232 
2233   rocksdb::WriteBatchBase *get_indexed_write_batch() override {
2234     ++m_write_count;
2235     return m_batch;
2236   }
2237 
2238   rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
2239                       const rocksdb::Slice &key,
2240                       std::string *const value) const override {
2241     return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
2242                                       value);
2243   }
2244 
2245   rocksdb::Status
2246   get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
2247                  const rocksdb::Slice &key, std::string *const value,
2248                  bool exclusive) override {
2249     return get(column_family, key, value);
2250   }
2251 
2252   rocksdb::Iterator *
2253   get_iterator(const rocksdb::ReadOptions &options,
2254                rocksdb::ColumnFamilyHandle *const column_family) override {
2255     const auto it = rdb->NewIterator(options);
2256     return m_batch->NewIteratorWithBase(it);
2257   }
2258 
2259   bool is_tx_started() const override { return (m_batch != nullptr); }
2260 
2261   void start_tx() override {
2262     reset();
2263     write_opts.sync = THDVAR(m_thd, write_sync);
2264     write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
2265     write_opts.ignore_missing_column_families =
2266         THDVAR(m_thd, write_ignore_missing_column_families);
2267   }
2268 
2269   void start_stmt() override { m_batch->SetSavePoint(); }
2270 
2271   void rollback_stmt() override {
2272     if (m_batch)
2273       m_batch->RollbackToSavePoint();
2274   }
2275 
2276   explicit Rdb_writebatch_impl(THD *const thd)
2277       : Rdb_transaction(thd), m_batch(nullptr) {
2278     m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
2279                                                true);
2280   }
2281 
2282   virtual ~Rdb_writebatch_impl() {
2283     rollback();
2284     delete m_batch;
2285   }
2286 };
2287 
2288 void Rdb_snapshot_notifier::SnapshotCreated(
2289     const rocksdb::Snapshot *const snapshot) {
2290   if (m_owning_tx != nullptr) {
2291     m_owning_tx->snapshot_created(snapshot);
2292   }
2293 }
2294 
2295 std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
2296 mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
2297 
2298 static Rdb_transaction *&get_tx_from_thd(THD *const thd) {
2299   return *reinterpret_cast<Rdb_transaction **>(
2300       my_core::thd_ha_data(thd, rocksdb_hton));
2301 }
2302 
2303 namespace {
2304 
2305 class Rdb_perf_context_guard {
2306   Rdb_io_perf m_io_perf;
2307   THD *m_thd;
2308 
2309 public:
2310   Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
2311   Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;
2312 
2313   explicit Rdb_perf_context_guard(THD *const thd) : m_thd(thd) {
2314     Rdb_transaction *&tx = get_tx_from_thd(m_thd);
2315     /*
2316       if perf_context information is already being recorded, this becomes a
2317       no-op
2318     */
2319     if (tx != nullptr) {
2320       tx->io_perf_start(&m_io_perf);
2321     }
2322   }
2323 
2324   ~Rdb_perf_context_guard() {
2325     Rdb_transaction *&tx = get_tx_from_thd(m_thd);
2326     if (tx != nullptr) {
2327       tx->io_perf_end_and_record();
2328     }
2329   }
2330 };
2331 
2332 } // anonymous namespace
2333 
2334 /*
2335   TODO: maybe, call this in external_lock() and store in ha_rocksdb..
2336 */
2337 
2338 static Rdb_transaction *get_or_create_tx(THD *const thd) {
2339   Rdb_transaction *&tx = get_tx_from_thd(thd);
2340   // TODO: this is called too many times.. O(#rows)
2341   if (tx == nullptr) {
2342     if (rpl_skip_tx_api_var && thd->rli_slave)
2343       tx = new Rdb_writebatch_impl(thd);
2344     else
2345       tx = new Rdb_transaction_impl(thd);
2346     tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
2347     tx->start_tx();
2348   } else {
2349     tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
2350     if (!tx->is_tx_started()) {
2351       tx->start_tx();
2352     }
2353   }
2354 
2355   return tx;
2356 }
2357 
2358 static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
2359   Rdb_transaction *&tx = get_tx_from_thd(thd);
2360   if (tx != nullptr) {
2361     int rc = tx->finish_bulk_load();
2362     if (rc != 0) {
2363       // NO_LINT_DEBUG
2364       sql_print_error("RocksDB: Error %d finalizing last SST file while "
2365                       "disconnecting",
2366                       rc);
2367       abort_with_stack_traces();
2368     }
2369 
2370     delete tx;
2371     tx = nullptr;
2372   }
2373   return HA_EXIT_SUCCESS;
2374 }
2375 
2376 /*
2377  * Serializes an xid to a string so that it can
2378  * be used as a rocksdb transaction name
2379  */
2380 static std::string rdb_xid_to_string(const XID &src) {
2381   DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
2382   DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);
2383 
2384   std::string buf;
2385   buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);
2386 
2387   /*
2388    * Widen formatID to 8 bytes if it isn't already, reinterpret the bit
2389    * pattern as unsigned, and store it in network byte order.
2390    */
2391   uchar fidbuf[RDB_FORMATID_SZ];
2392   int64 signed_fid8 = src.formatID;
2393   const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
2394   rdb_netbuf_store_uint64(fidbuf, raw_fid8);
2395   buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);
2396 
2397   buf.push_back(src.gtrid_length);
2398   buf.push_back(src.bqual_length);
2399   buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
2400   return buf;
2401 }
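/*
  Serialized layout produced above and parsed back by rdb_xid_from_string():

    [RDB_FORMATID_SZ bytes]              formatID in network byte order
    [RDB_GTRID_SZ bytes]                 gtrid_length
    [RDB_BQUAL_SZ bytes]                 bqual_length
    [gtrid_length + bqual_length bytes]  XID data

  Round-trip sketch:

    XID restored;
    rdb_xid_from_string(rdb_xid_to_string(xid), &restored);
*/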
2402 
2403 /**
2404   Called by hton->flush_logs after MySQL group commit prepares a set of
2405   transactions.
2406 */
2407 static bool rocksdb_flush_wal(handlerton *const hton
2408                               __attribute__((__unused__))) {
2409   DBUG_ASSERT(rdb != nullptr);
2410   rocksdb_wal_group_syncs++;
2411   const rocksdb::Status s = rdb->SyncWAL();
2412   if (!s.ok()) {
2413     return HA_EXIT_FAILURE;
2414   }
2415   return HA_EXIT_SUCCESS;
2416 }
2417 
2418 /**
2419   For a slave, prepare() updates the slave_gtid_info table which tracks the
2420   replication progress.
2421 */
2422 static int rocksdb_prepare(handlerton *const hton, THD *const thd,
2423                            bool prepare_tx) {
2424   Rdb_transaction *&tx = get_tx_from_thd(thd);
2425   if (!tx->can_prepare()) {
2426     return HA_EXIT_FAILURE;
2427   }
2428   if (prepare_tx ||
2429       (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
2430 
2431     if (tx->is_two_phase()) {
2432       if (thd->durability_property == HA_IGNORE_DURABILITY) {
2433         tx->set_sync(false);
2434       }
2435       XID xid;
2436       thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
2437       if (!tx->prepare(rdb_xid_to_string(xid))) {
2438         return HA_EXIT_FAILURE;
2439       }
2440     }
2441 
2442     DEBUG_SYNC(thd, "rocksdb.prepared");
2443   }
2444 
2445   return HA_EXIT_SUCCESS;
2446 }
2447 
2448 /**
2449  Commit or roll back a prepared transaction identified by its XID. These
2450  handlers are needed so XA and two-phase-commit recovery work correctly.
2451 */
2452 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
2453   const auto name = rdb_xid_to_string(*xid);
2454   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
2455   if (trx == nullptr) {
2456     return HA_EXIT_FAILURE;
2457   }
2458   const rocksdb::Status s = trx->Commit();
2459   if (!s.ok()) {
2460     return HA_EXIT_FAILURE;
2461   }
2462   delete trx;
2463   return HA_EXIT_SUCCESS;
2464 }
2465 
2466 static int rocksdb_rollback_by_xid(handlerton *const hton
2467                                    __attribute__((__unused__)),
2468                                    XID *const xid) {
2469   const auto name = rdb_xid_to_string(*xid);
2470   rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
2471   if (trx == nullptr) {
2472     return HA_EXIT_FAILURE;
2473   }
2474   const rocksdb::Status s = trx->Rollback();
2475   if (!s.ok()) {
2476     return HA_EXIT_FAILURE;
2477   }
2478   delete trx;
2479   return HA_EXIT_SUCCESS;
2480 }
2481 
2482 /**
2483   Rebuilds an XID from a serialized version stored in a string.
2484 */
2485 static void rdb_xid_from_string(const std::string &src, XID *const dst) {
2486   DBUG_ASSERT(dst != nullptr);
2487   uint offset = 0;
2488   uint64 raw_fid8 =
2489       rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
2490   const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
2491   dst->formatID = signed_fid8;
2492   offset += RDB_FORMATID_SZ;
2493   dst->gtrid_length = src.at(offset);
2494   offset += RDB_GTRID_SZ;
2495   dst->bqual_length = src.at(offset);
2496   offset += RDB_BQUAL_SZ;
2497 
2498   DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
2499   DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);
2500 
2501   src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length),
2502            RDB_XIDHDR_LEN);
2503 }
2504 
2505 /**
2506   Returns the XIDs of transactions that were prepared in RocksDB but not
2507   yet resolved, so the server can decide their fate during crash recovery.
2508 */
2509 static int rocksdb_recover(handlerton *const hton, XID *const xid_list,
2510                            uint len) {
2511   if (len == 0 || xid_list == nullptr) {
2512     return HA_EXIT_SUCCESS;
2513   }
2514 
2515   std::vector<rocksdb::Transaction *> trans_list;
2516   rdb->GetAllPreparedTransactions(&trans_list);
2517 
2518   uint count = 0;
2519   for (auto &trans : trans_list) {
2520     if (count >= len) {
2521       break;
2522     }
2523     auto name = trans->GetName();
2524     rdb_xid_from_string(name, &xid_list[count]);
2525     count++;
2526   }
2527   return count;
2528 }
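/*
  The XIDs returned here are the prepared transactions that the server will
  later resolve through rocksdb_commit_by_xid() / rocksdb_rollback_by_xid()
  above during crash recovery.
*/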
2529 
2530 static int rocksdb_commit(handlerton *const hton, THD *const thd,
2531                           bool commit_tx) {
2532   DBUG_ENTER_FUNC();
2533 
2534   DBUG_ASSERT(hton != nullptr);
2535   DBUG_ASSERT(thd != nullptr);
2536 
2537   /* this will trigger saving of perf_context information */
2538   Rdb_perf_context_guard guard(thd);
2539 
2540   /* note: h->external_lock(F_UNLCK) is called after this function is called */
2541   Rdb_transaction *&tx = get_tx_from_thd(thd);
2542 
2543   if (tx != nullptr) {
2544     if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
2545                                                           OPTION_BEGIN))) {
2546       /*
2547         We get here
2548          - For a COMMIT statement that finishes a multi-statement transaction
2549          - For a statement that has its own transaction
2550       */
2551       if (tx->commit())
2552         DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
2553     } else {
2554       /*
2555         We get here when committing a statement within a transaction.
2556 
2557         We don't need to do anything here. tx->start_stmt() will notify
2558         Rdb_transaction_impl that another statement has started.
2559       */
2560       tx->set_tx_failed(false);
2561     }
2562 
2563     if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
2564       // For READ_COMMITTED, we release any existing snapshot so that we will
2565       // see any changes that occurred since the last statement.
2566       tx->release_snapshot();
2567     }
2568   }
2569 
2570   DBUG_RETURN(HA_EXIT_SUCCESS);
2571 }
2572 
2573 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
2574                             bool rollback_tx) {
2575   Rdb_perf_context_guard guard(thd);
2576   Rdb_transaction *&tx = get_tx_from_thd(thd);
2577 
2578   if (tx != nullptr) {
2579     if (rollback_tx) {
2580       /*
2581         We get here when
2582         - a ROLLBACK statement is issued.
2583 
2584         Discard the changes made by the transaction
2585       */
2586       tx->rollback();
2587     } else {
2588       /*
2589         We get here when
2590         - a statement with AUTOCOMMIT=1 is being rolled back (because of some
2591           error)
2592         - a statement inside a transaction is rolled back
2593       */
2594 
2595       tx->rollback_stmt();
2596       tx->set_tx_failed(true);
2597     }
2598 
2599     if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ COMMITTED (and lower isolation levels), release any existing
      // snapshot so that the next statement sees changes committed meanwhile.
2602       tx->release_snapshot();
2603     }
2604   }
2605   return HA_EXIT_SUCCESS;
2606 }
2607 
static bool print_stats(THD *const thd, std::string const &type,
2609                         std::string const &name, std::string const &status,
2610                         stat_print_fn *stat_print) {
2611   return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
2612                     status.c_str(), status.size());
2613 }
2614 
static std::string format_string(const char *const format, ...) {
2616   std::string res;
2617   va_list args;
2618   va_list args_copy;
2619   char static_buff[256];
2620 
2621   DBUG_ASSERT(format != nullptr);
2622 
2623   va_start(args, format);
2624   va_copy(args_copy, args);
2625 
2626   // Calculate how much space we will need
2627   int len = vsnprintf(nullptr, 0, format, args);
2628   va_end(args);
2629 
2630   if (len < 0) {
2631     res = std::string("<format error>");
2632   } else if (len == 0) {
2633     // Shortcut for an empty string
2634     res = std::string("");
2635   } else {
2636     // For short enough output use a static buffer
2637     char *buff = static_buff;
2638     std::unique_ptr<char[]> dynamic_buff = nullptr;
2639 
2640     len++; // Add one for null terminator
2641 
2642     // for longer output use an allocated buffer
2643     if (static_cast<uint>(len) > sizeof(static_buff)) {
2644       dynamic_buff.reset(new char[len]);
2645       buff = dynamic_buff.get();
2646     }
2647 
2648     // Now re-do the vsnprintf with the buffer which is now large enough
2649     (void)vsnprintf(buff, len, format, args_copy);
2650 
2651     // Convert to a std::string.  Note we could have created a std::string
2652     // large enough and then converted the buffer to a 'char*' and created
2653     // the output in place.  This would probably work but feels like a hack.
2654     // Since this isn't code that needs to be super-performant we are going
2655     // with this 'safer' method.
    res = std::string(buff);
2657   }
2658 
2659   va_end(args_copy);
2660 
2661   return res;
2662 }
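
/*
  Usage sketch for format_string() (hypothetical values): the helper does the
  usual two-pass vsnprintf dance -- size the output with a null buffer first,
  then format into the stack buffer or a heap buffer as needed:

    const std::string line =
        format_string("cf=%s write_buffer_size=%zu", "default",
                      static_cast<size_t>(64 * 1024 * 1024));
    // line == "cf=default write_buffer_size=67108864"
*/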
2663 
2664 class Rdb_snapshot_status : public Rdb_tx_list_walker {
2665 private:
2666   std::string m_data;
2667 
  static std::string current_timestamp(void) {
2669     static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
2670     time_t currtime;
2671     struct tm currtm;
2672 
2673     time(&currtime);
2674 
2675     localtime_r(&currtime, &currtm);
2676 
2677     return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
2678                          currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
2679                          currtm.tm_sec);
2680   }
2681 
  static std::string get_header(void) {
2683     return "\n============================================================\n" +
2684            current_timestamp() +
2685            " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
2686            "============================================================\n"
2687            "---------\n"
2688            "SNAPSHOTS\n"
2689            "---------\n"
2690            "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
2691   }
2692 
  static std::string get_footer(void) {
2694     return "-----------------------------------------\n"
2695            "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
2696            "=========================================\n";
2697   }
2698 
2699 public:
  Rdb_snapshot_status() : m_data(get_header()) {}
2701 
  std::string getResult() { return m_data + get_footer(); }
2703 
2704   /* Implement Rdb_transaction interface */
2705   /* Create one row in the snapshot status table */
  void process_tran(const Rdb_transaction *const tx) override {
2707     DBUG_ASSERT(tx != nullptr);
2708 
2709     /* Calculate the duration the snapshot has existed */
2710     int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
2711     if (snapshot_timestamp != 0) {
2712       int64_t curr_time;
2713       rdb->GetEnv()->GetCurrentTime(&curr_time);
2714 
2715       THD *thd = tx->get_thd();
2716       char buffer[1024];
2717       thd_security_context(thd, buffer, sizeof buffer, 0);
2718       m_data += format_string("---SNAPSHOT, ACTIVE %lld sec\n"
2719                               "%s\n"
2720                               "lock count %llu, write count %llu\n",
2721                               curr_time - snapshot_timestamp, buffer,
2722                               tx->get_lock_count(), tx->get_write_count());
2723     }
2724   }
2725 };
2726 
2727 /**
2728  * @brief
2729  * walks through all non-replication transactions and copies
2730  * out relevant information for information_schema.rocksdb_trx
2731  */
2732 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
2733 private:
2734   std::vector<Rdb_trx_info> *m_trx_info;
2735 
2736 public:
  explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
2738       : m_trx_info(trx_info) {}
2739 
  void process_tran(const Rdb_transaction *const tx) override {
2741     static const std::map<int, std::string> state_map = {
2742         {rocksdb::Transaction::STARTED, "STARTED"},
2743         {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
2744         {rocksdb::Transaction::PREPARED, "PREPARED"},
2745         {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
2746         {rocksdb::Transaction::COMMITED, "COMMITED"},
2747         {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
2748         {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
2749     };
2750 
2751     DBUG_ASSERT(tx != nullptr);
2752 
2753     THD *const thd = tx->get_thd();
2754     ulong thread_id = thd->thread_id;
2755 
2756     if (tx->is_writebatch_trx()) {
2757       const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
2758       DBUG_ASSERT(wb_impl);
2759       m_trx_info->push_back(
2760           {"",                            /* name */
2761            0,                             /* trx_id */
2762            wb_impl->get_write_count(), 0, /* lock_count */
2763            0,                             /* timeout_sec */
2764            "",                            /* state */
2765            "",                            /* waiting_key */
2766            0,                             /* waiting_cf_id */
2767            1,                             /*is_replication */
2768            1,                             /* skip_trx_api */
2769            wb_impl->is_tx_read_only(), 0, /* deadlock detection */
2770            wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
2771     } else {
2772       const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
2773       DBUG_ASSERT(tx_impl);
2774       const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
2775 
2776       if (rdb_trx == nullptr) {
2777         return;
2778       }
2779 
2780       std::string query_str;
2781       LEX_STRING *const lex_str = thd_query_string(thd);
2782       if (lex_str != nullptr && lex_str->str != nullptr) {
2783         query_str = std::string(lex_str->str);
2784       }
2785 
2786       const auto state_it = state_map.find(rdb_trx->GetState());
2787       DBUG_ASSERT(state_it != state_map.end());
2788       const int is_replication = (thd->rli_slave != nullptr);
2789       uint32_t waiting_cf_id;
2790       std::string waiting_key;
      rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key);

      m_trx_info->push_back(
          {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
           tx_impl->get_lock_count(), tx_impl->get_timeout_sec(),
           state_it->second, waiting_key, waiting_cf_id, is_replication,
           0, /* skip_trx_api */
           tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
           tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
2800     }
2801   }
2802 };
2803 
2804 /*
2805   returns a vector of info for all non-replication threads
2806   for use by information_schema.rocksdb_trx
2807 */
std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
2809   std::vector<Rdb_trx_info> trx_info;
2810   Rdb_trx_info_aggregator trx_info_agg(&trx_info);
2811   Rdb_transaction::walk_tx_list(&trx_info_agg);
2812   return trx_info;
2813 }
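
/*
  Usage sketch (hypothetical consumer): because rdb_get_all_trx_info() returns
  copies, an INFORMATION_SCHEMA table implementation can iterate over the
  snapshot without holding any transaction-list locks:

    const std::vector<Rdb_trx_info> all_trx = rdb_get_all_trx_info();
    for (const auto &trx : all_trx) {
      // fill one information_schema.rocksdb_trx row from the copied fields
    }
*/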
2814 
2815 /* Generate the snapshot status table */
static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd,
2817                                          stat_print_fn *const stat_print) {
2818   Rdb_snapshot_status showStatus;
2819 
2820   Rdb_transaction::walk_tx_list(&showStatus);
2821 
2822   /* Send the result data back to MySQL */
2823   return print_stats(thd, "SNAPSHOTS", "rocksdb", showStatus.getResult(),
2824                      stat_print);
2825 }
2826 
/*
  This is called for SHOW ENGINE ROCKSDB STATUS|LOGS|etc.

  It produces per-database statistics, per-column-family statistics and an
  approximate breakdown of RocksDB memory usage.
*/
2833 
static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
2835                                 stat_print_fn *const stat_print,
2836                                 enum ha_stat_type stat_type) {
2837   bool res = false;
2838   if (stat_type == HA_ENGINE_STATUS) {
2839     std::string str;
2840 
2841     /* Per DB stats */
2842     if (rdb->GetProperty("rocksdb.dbstats", &str)) {
2843       res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
2844     }
2845 
2846     /* Per column family stats */
2847     for (const auto &cf_name : cf_manager.get_cf_names()) {
2848       rocksdb::ColumnFamilyHandle *cfh;
2849       bool is_automatic;
2850 
2851       /*
2852         Only the cf name is important. Whether it was generated automatically
2853         does not matter, so is_automatic is ignored.
2854       */
2855       cfh = cf_manager.get_cf(cf_name.c_str(), "", nullptr, &is_automatic);
2856       if (cfh == nullptr)
2857         continue;
2858 
2859       if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str))
2860         continue;
2861 
2862       res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
2863     }
2864 
2865     /* Memory Statistics */
2866     std::vector<rocksdb::DB *> dbs;
2867     std::unordered_set<const rocksdb::Cache *> cache_set;
2868     size_t internal_cache_count = 0;
    const size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
2870     char buf[100];
2871 
2872     dbs.push_back(rdb);
2873     cache_set.insert(rocksdb_tbl_options.block_cache.get());
2874     for (const auto &cf_handle : cf_manager.get_all_cf()) {
2875       rocksdb::ColumnFamilyDescriptor cf_desc;
2876       cf_handle->GetDescriptor(&cf_desc);
2877       auto *const table_factory = cf_desc.options.table_factory.get();
2878       if (table_factory != nullptr) {
2879         std::string tf_name = table_factory->Name();
2880         if (tf_name.find("BlockBasedTable") != std::string::npos) {
2881           const rocksdb::BlockBasedTableOptions *const bbt_opt =
2882               reinterpret_cast<rocksdb::BlockBasedTableOptions *>(
2883                   table_factory->GetOptions());
2884           if (bbt_opt != nullptr) {
2885             if (bbt_opt->block_cache.get() != nullptr) {
2886               cache_set.insert(bbt_opt->block_cache.get());
2887             } else {
2888               internal_cache_count++;
2889             }
2890             cache_set.insert(bbt_opt->block_cache_compressed.get());
2891           }
2892         }
2893       }
2894     }
2895 
2896     std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
2897     str.clear();
2898     rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
2899                                                          &temp_usage_by_type);
    snprintf(buf, sizeof(buf), "\nMemTable Total: %" PRIu64,
             temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
    str.append(buf);
    snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %" PRIu64,
             temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
    str.append(buf);
    snprintf(buf, sizeof(buf), "\nTable Readers Total: %" PRIu64,
             temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
    str.append(buf);
    snprintf(buf, sizeof(buf), "\nCache Total: %" PRIu64,
             temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
    str.append(buf);
    snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %zu",
             internal_cache_count * kDefaultInternalCacheSize);
    str.append(buf);
2915     res |= print_stats(thd, "Memory_Stats", "rocksdb", str, stat_print);
2916   }
2917 
2918   return res;
2919 }
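
/*
  Minimal sketch of the memory accounting performed above (illustrative only):
  collect each distinct rocksdb::Cache pointer once, then ask RocksDB for an
  approximate per-category breakdown:

    std::map<rocksdb::MemoryUtil::UsageType, uint64_t> usage;
    std::unordered_set<const rocksdb::Cache *> caches = {
        rocksdb_tbl_options.block_cache.get()};
    rocksdb::MemoryUtil::GetApproximateMemoryUsageByType({rdb}, caches, &usage);
    // usage[rocksdb::MemoryUtil::kMemTableTotal], kCacheTotal, etc. hold the
    // byte counts that SHOW ENGINE ROCKSDB STATUS reports.
*/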
2920 
static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
2922                                        Rdb_transaction *const tx) {
2923   DBUG_ASSERT(tx != nullptr);
2924 
2925   trans_register_ha(thd, FALSE, rocksdb_hton);
2926   if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
2927     tx->start_stmt();
2928     trans_register_ha(thd, TRUE, rocksdb_hton);
2929   }
2930 }
2931 
/*
  Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT

  START TRANSACTION WITH CONSISTENT SNAPSHOT takes both an InnoDB and a
  RocksDB snapshot, and both engines participate in the transaction. When
  COMMIT is executed, both InnoDB and RocksDB modifications are committed.
  Remember that XA is not supported yet, so mixing engines within one
  transaction is not recommended anyway.
*/
static int rocksdb_start_tx_and_assign_read_view(
2942     handlerton *const hton,          /*!< in: RocksDB handlerton */
2943     THD *const thd)                  /*!< in: MySQL thread handle of the
2944                                      user for whom the transaction should
2945                                      be committed */
2946 {
2947   Rdb_perf_context_guard guard(thd);
2948 
2949   ulong const tx_isolation = my_core::thd_tx_isolation(thd);
2950 
  Rdb_transaction *const tx = get_or_create_tx(thd);
2952   DBUG_ASSERT(!tx->has_snapshot());
2953   tx->set_tx_read_only(true);
2954   rocksdb_register_tx(hton, thd, tx);
2955 
2956   if (tx_isolation == ISO_REPEATABLE_READ) {
2957     tx->acquire_snapshot(true);
2958   } else {
2959     push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
2960                         "Only REPEATABLE READ isolation level is supported "
2961                         "for START TRANSACTION WITH CONSISTENT SNAPSHOT "
2962                         "in RocksDB Storage Engine. Snapshot has not been "
2963                         "taken.");
  }

  return HA_EXIT_SUCCESS;
2965 }
2966 
2967 /* Dummy SAVEPOINT support. This is needed for long running transactions
2968  * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
2969  * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
2970  * errors. This needs to be addressed in future versions (Issue#96).
2971  */
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
2973                              void *const savepoint) {
2974   return HA_EXIT_SUCCESS;
2975 }
2976 
static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
2978                                          void *const savepoint) {
2979   Rdb_transaction *&tx = get_tx_from_thd(thd);
2980   return tx->rollback_to_savepoint(savepoint);
2981 }
2982 
2983 static bool
rocksdb_rollback_to_savepoint_can_release_mdl(handlerton *const hton,
2985                                               THD *const thd) {
2986   return true;
2987 }
2988 
static rocksdb::Status check_rocksdb_options_compatibility(
2990   const char *const dbpath, const rocksdb::Options& main_opts,
2991   const std::vector<rocksdb::ColumnFamilyDescriptor>& cf_descr)
2992 {
2993   DBUG_ASSERT(rocksdb_datadir != nullptr);
2994 
2995   rocksdb::DBOptions loaded_db_opt;
2996   std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
2997   rocksdb::Status status = LoadLatestOptions(dbpath,
2998                                              rocksdb::Env::Default(),
2999                                              &loaded_db_opt, &loaded_cf_descs);
3000 
  // If we're starting from scratch, no options have been saved yet. That is a
  // valid case, and there is simply nothing to compare the current set of
  // options against.
3004   if (status.IsNotFound()) {
3005     return rocksdb::Status::OK();
3006   }
3007 
3008   if (!status.ok()) {
3009     return status;
3010   }
3011 
3012   if (loaded_cf_descs.size() != cf_descr.size()) {
3013     return rocksdb::Status::NotSupported("Mismatched size of column family "
3014                                          "descriptors.");
3015   }
3016 
3017   // Please see RocksDB documentation for more context about why we need to set
3018   // user-defined functions and pointer-typed options manually.
3019   for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
3020     loaded_cf_descs[i].options.compaction_filter =
3021         cf_descr[i].options.compaction_filter;
3022     loaded_cf_descs[i].options.compaction_filter_factory =
3023         cf_descr[i].options.compaction_filter_factory;
3024     loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
3025     loaded_cf_descs[i].options.memtable_factory =
3026         cf_descr[i].options.memtable_factory;
3027     loaded_cf_descs[i].options.merge_operator =
3028         cf_descr[i].options.merge_operator;
3029     loaded_cf_descs[i].options.prefix_extractor =
3030         cf_descr[i].options.prefix_extractor;
3031     loaded_cf_descs[i].options.table_factory =
3032         cf_descr[i].options.table_factory;
3033   }
3034 
3035   // This is the essence of the function - determine if it's safe to open the
3036   // database or not.
3037   status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
3038                                      loaded_cf_descs);
3039 
3040   return status;
3041 }
3042 
3043 /*
3044   Storage Engine initialization function, invoked when plugin is loaded.
3045 */
3046 
static int rocksdb_init_func(void *const p) {
3048   DBUG_ENTER_FUNC();
3049 
3050   // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
3051   static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");
3052 
3053 #ifdef HAVE_PSI_INTERFACE
3054   init_rocksdb_psi_keys();
3055 #endif
3056 
3057   rocksdb_hton = (handlerton *)p;
3058   mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &rdb_open_tables.m_mutex,
3059                    MY_MUTEX_INIT_FAST);
3060 #ifdef HAVE_PSI_INTERFACE
3061   rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
3062   rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
3063                            rdb_signal_drop_idx_psi_cond_key);
3064 #else
3065   rdb_bg_thread.init();
3066   rdb_drop_idx_thread.init();
3067 #endif
3068   mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
3069                    MY_MUTEX_INIT_FAST);
3070   mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
3071                    MY_MUTEX_INIT_FAST);
3072 
3073 #if defined(HAVE_PSI_INTERFACE)
3074   rdb_collation_exceptions = new Regex(key_rwlock_collation_exception_list);
3075 #else
3076   rdb_collation_exceptions = new Regex();
3077 #endif
3078 
3079   mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
3080                    MY_MUTEX_INIT_FAST);
3081   rdb_open_tables.init_hash();
3082   Rdb_transaction::init_mutex();
3083 
3084   rocksdb_hton->state = SHOW_OPTION_YES;
3085   rocksdb_hton->create = rocksdb_create_handler;
3086   rocksdb_hton->close_connection = rocksdb_close_connection;
3087   rocksdb_hton->prepare = rocksdb_prepare;
3088   rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
3089   rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
3090   rocksdb_hton->recover = rocksdb_recover;
3091   rocksdb_hton->commit = rocksdb_commit;
3092   rocksdb_hton->rollback = rocksdb_rollback;
3093   rocksdb_hton->db_type = DB_TYPE_ROCKSDB;
3094   rocksdb_hton->show_status = rocksdb_show_status;
3095   rocksdb_hton->start_consistent_snapshot =
3096       rocksdb_start_tx_and_assign_read_view;
3097   rocksdb_hton->savepoint_set = rocksdb_savepoint;
3098   rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
3099   rocksdb_hton->savepoint_rollback_can_release_mdl =
3100       rocksdb_rollback_to_savepoint_can_release_mdl;
3101   rocksdb_hton->flush_logs = rocksdb_flush_wal;
3102 
3103   rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
3104                         HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;
3105 
3106   DBUG_ASSERT(!mysqld_embedded);
3107 
3108   rocksdb_stats = rocksdb::CreateDBStatistics();
3109   rocksdb_db_options.statistics = rocksdb_stats;
3110 
3111   if (rocksdb_rate_limiter_bytes_per_sec != 0) {
3112     rocksdb_rate_limiter.reset(
3113         rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
3114     rocksdb_db_options.rate_limiter = rocksdb_rate_limiter;
3115   }
3116 
3117   std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
3118   rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
3119       rocksdb_datadir, rocksdb_db_options, &rocksdb_db_options.info_log);
3120   if (s.ok()) {
3121     myrocks_logger->SetRocksDBLogger(rocksdb_db_options.info_log);
3122   }
3123 
3124   rocksdb_db_options.info_log = myrocks_logger;
3125   myrocks_logger->SetInfoLogLevel(
3126       static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
3127   rocksdb_db_options.wal_dir = rocksdb_wal_dir;
3128 
3129   rocksdb_db_options.wal_recovery_mode =
3130       static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);
3131 
3132   rocksdb_db_options.access_hint_on_compaction_start =
3133       static_cast<rocksdb::Options::AccessHint>(
3134           rocksdb_access_hint_on_compaction_start);
3135 
3136   if (rocksdb_db_options.allow_mmap_reads &&
3137       rocksdb_db_options.use_direct_reads) {
3138     // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
3139     // mmap_reads and direct_reads are both on.   (NO_LINT_DEBUG)
3140     sql_print_error("RocksDB: Can't enable both use_direct_reads "
3141                     "and allow_mmap_reads\n");
3142     rdb_open_tables.free_hash();
3143     DBUG_RETURN(HA_EXIT_FAILURE);
3144   }
3145 
3146   if (rocksdb_db_options.allow_mmap_writes &&
3147       rocksdb_db_options.use_direct_writes) {
3148     // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
3149     sql_print_error("RocksDB: Can't enable both use_direct_writes "
3150                     "and allow_mmap_writes\n");
3151     rdb_open_tables.free_hash();
3152     DBUG_RETURN(HA_EXIT_FAILURE);
3153   }
3154 
3155   std::vector<std::string> cf_names;
3156   rocksdb::Status status;
3157   status = rocksdb::DB::ListColumnFamilies(rocksdb_db_options, rocksdb_datadir,
3158                                            &cf_names);
3159   if (!status.ok()) {
3160     /*
3161       When we start on an empty datadir, ListColumnFamilies returns IOError,
3162       and RocksDB doesn't provide any way to check what kind of error it was.
3163       Checking system errno happens to work right now.
3164     */
3165     if (status.IsIOError() && errno == ENOENT) {
3166       sql_print_information("RocksDB: Got ENOENT when listing column families");
3167       sql_print_information(
3168           "RocksDB:   assuming that we're creating a new database");
3169     } else {
3170       std::string err_text = status.ToString();
3171       sql_print_error("RocksDB: Error listing column families: %s",
3172                       err_text.c_str());
3173       rdb_open_tables.free_hash();
3174       DBUG_RETURN(HA_EXIT_FAILURE);
3175     }
3176   } else
3177     sql_print_information("RocksDB: %ld column families found",
3178                           cf_names.size());
3179 
3180   std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
3181   std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;
3182 
3183   rocksdb_tbl_options.index_type =
3184       (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;
3185 
3186   if (!rocksdb_tbl_options.no_block_cache) {
3187     rocksdb_tbl_options.block_cache =
3188         rocksdb::NewLRUCache(rocksdb_block_cache_size);
3189   }
3190   // Using newer BlockBasedTable format version for better compression
3191   // and better memory allocation.
3192   // See:
3193   // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
3194   rocksdb_tbl_options.format_version = 2;
3195 
3196   if (rocksdb_collect_sst_properties) {
3197     properties_collector_factory =
3198         std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);
3199 
3200     rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);
3201 
3202     mysql_mutex_lock(&rdb_sysvars_mutex);
3203 
3204     DBUG_ASSERT(rocksdb_table_stats_sampling_pct <=
3205                 RDB_TBL_STATS_SAMPLE_PCT_MAX);
3206     properties_collector_factory->SetTableStatsSamplingPct(
3207         rocksdb_table_stats_sampling_pct);
3208 
3209     mysql_mutex_unlock(&rdb_sysvars_mutex);
3210   }
3211 
3212   if (rocksdb_persistent_cache_size > 0) {
3213     std::shared_ptr<rocksdb::PersistentCache> pcache;
3214     rocksdb::NewPersistentCache(
3215         rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
3216         rocksdb_persistent_cache_size, myrocks_logger, true, &pcache);
3217     rocksdb_tbl_options.persistent_cache = pcache;
3218   } else if (strlen(rocksdb_persistent_cache_path)) {
3219     sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size");
3220     DBUG_RETURN(1);
3221   }
3222 
3223   if (!rocksdb_cf_options_map.init(
3224           rocksdb_tbl_options, properties_collector_factory,
3225           rocksdb_default_cf_options, rocksdb_override_cf_options)) {
3226     // NO_LINT_DEBUG
3227     sql_print_error("RocksDB: Failed to initialize CF options map.");
3228     rdb_open_tables.free_hash();
3229     DBUG_RETURN(HA_EXIT_FAILURE);
3230   }
3231 
3232   /*
3233     If there are no column families, we're creating the new database.
3234     Create one column family named "default".
3235   */
3236   if (cf_names.size() == 0)
3237     cf_names.push_back(DEFAULT_CF_NAME);
3238 
3239   std::vector<int> compaction_enabled_cf_indices;
3240   sql_print_information("RocksDB: Column Families at start:");
3241   for (size_t i = 0; i < cf_names.size(); ++i) {
3242     rocksdb::ColumnFamilyOptions opts;
3243     rocksdb_cf_options_map.get_cf_options(cf_names[i], &opts);
3244 
3245     sql_print_information("  cf=%s", cf_names[i].c_str());
3246     sql_print_information("    write_buffer_size=%ld", opts.write_buffer_size);
3247     sql_print_information("    target_file_size_base=%" PRIu64,
3248                           opts.target_file_size_base);
3249 
3250     /*
3251       Temporarily disable compactions to prevent a race condition where
3252       compaction starts before compaction filter is ready.
3253     */
3254     if (!opts.disable_auto_compactions) {
3255       compaction_enabled_cf_indices.push_back(i);
3256       opts.disable_auto_compactions = true;
3257     }
3258     cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
3259   }
3260 
3261   rocksdb::Options main_opts(rocksdb_db_options,
3262                              rocksdb_cf_options_map.get_defaults());
3263 
3264   main_opts.env->SetBackgroundThreads(main_opts.max_background_flushes,
3265                                       rocksdb::Env::Priority::HIGH);
3266   main_opts.env->SetBackgroundThreads(main_opts.max_background_compactions,
3267                                       rocksdb::Env::Priority::LOW);
3268   rocksdb::TransactionDBOptions tx_db_options;
3269   tx_db_options.transaction_lock_timeout = 2; // 2 seconds
3270   tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
3271 
3272   status =
3273       check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
3274 
3275   // We won't start if we'll determine that there's a chance of data corruption
3276   // because of incompatible options.
3277   if (!status.ok()) {
3278     // NO_LINT_DEBUG
3279     sql_print_error("RocksDB: compatibility check against existing database "
3280                     "options failed. %s",
3281                     status.ToString().c_str());
3282     rdb_open_tables.free_hash();
3283     DBUG_RETURN(HA_EXIT_FAILURE);
3284   }
3285 
3286   status = rocksdb::TransactionDB::Open(
3287       main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);
3288 
3289   if (!status.ok()) {
3290     std::string err_text = status.ToString();
3291     sql_print_error("RocksDB: Error opening instance: %s", err_text.c_str());
3292     rdb_open_tables.free_hash();
3293     DBUG_RETURN(HA_EXIT_FAILURE);
3294   }
3295   cf_manager.init(&rocksdb_cf_options_map, &cf_handles);
3296 
3297   if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) {
3298     // NO_LINT_DEBUG
3299     sql_print_error("RocksDB: Failed to initialize data dictionary.");
3300     rdb_open_tables.free_hash();
3301     DBUG_RETURN(HA_EXIT_FAILURE);
3302   }
3303 
3304   if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) {
3305     // NO_LINT_DEBUG
3306     sql_print_error("RocksDB: Failed to initialize DDL manager.");
3307     rdb_open_tables.free_hash();
3308     DBUG_RETURN(HA_EXIT_FAILURE);
3309   }
3310 
3311   for (const auto &cf_handle : cf_manager.get_all_cf()) {
3312     uint flags;
3313     if (!dict_manager.get_cf_flags(cf_handle->GetID(), &flags)) {
3314       const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
3315       rocksdb::WriteBatch *const batch = wb.get();
3316       dict_manager.add_cf_flags(batch, cf_handle->GetID(), 0);
3317       dict_manager.commit(batch);
3318     }
3319   }
3320 
3321 
3322   Rdb_sst_info::init(rdb);
3323 
3324   /*
3325     Enable auto compaction, things needed for compaction filter are finished
3326     initializing
3327   */
3328   std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
3329   compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
3330   for (const auto &index : compaction_enabled_cf_indices) {
3331     compaction_enabled_cf_handles.push_back(cf_handles[index]);
3332   }
3333 
3334   status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);
3335 
3336   if (!status.ok()) {
3337     const std::string err_text = status.ToString();
3338     // NO_LINT_DEBUG
3339     sql_print_error("RocksDB: Error enabling compaction: %s", err_text.c_str());
3340     rdb_open_tables.free_hash();
3341     DBUG_RETURN(HA_EXIT_FAILURE);
3342   }
3343 
3344   auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME
3345 #ifdef HAVE_PSI_INTERFACE
3346                                          ,
3347                                          rdb_background_psi_thread_key
3348 #endif
3349                                          );
3350   if (err != 0) {
3351     sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
3352                     err);
3353     rdb_open_tables.free_hash();
3354     DBUG_RETURN(HA_EXIT_FAILURE);
3355   }
3356 
3357   err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME
3358 #ifdef HAVE_PSI_INTERFACE
3359                                           ,
3360                                           rdb_drop_idx_psi_thread_key
3361 #endif
3362                                           );
3363   if (err != 0) {
3364     sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
3365                     err);
3366     rdb_open_tables.free_hash();
3367     DBUG_RETURN(HA_EXIT_FAILURE);
3368   }
3369 
3370   rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);
3371 
3372   if (rocksdb_pause_background_work) {
3373     rdb->PauseBackgroundWork();
3374   }
3375 
3376   sql_print_information("RocksDB instance opened");
3377   DBUG_RETURN(HA_EXIT_SUCCESS);
3378 }
3379 
3380 /*
3381   Storage Engine deinitialization function, invoked when plugin is unloaded.
3382 */
3383 
static int rocksdb_done_func(void *const p) {
3385   DBUG_ENTER_FUNC();
3386 
3387   int error = 0;
3388 
3389   // signal the drop index thread to stop
3390   rdb_drop_idx_thread.signal(true);
3391 
  // Flush all memtables so that data is not lost even if the WAL is disabled.
3393   rocksdb_flush_all_memtables();
3394 
3395   // Stop all rocksdb background work
3396   CancelAllBackgroundWork(rdb->GetBaseDB(), true);
3397 
3398   // Signal the background thread to stop and to persist all stats collected
3399   // from background flushes and compactions. This will add more keys to a new
3400   // memtable, but since the memtables were just flushed, it should not trigger
3401   // a flush that can stall due to background threads being stopped. As long
3402   // as these keys are stored in a WAL file, they can be retrieved on restart.
3403   rdb_bg_thread.signal(true);
3404 
3405   // Wait for the background thread to finish.
3406   auto err = rdb_bg_thread.join();
3407   if (err != 0) {
3408     // We'll log the message and continue because we're shutting down and
3409     // continuation is the optimal strategy.
3410     // NO_LINT_DEBUG
3411     sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
3412                     err);
3413   }
3414 
3415   // Wait for the drop index thread to finish.
3416   err = rdb_drop_idx_thread.join();
3417   if (err != 0) {
3418     // NO_LINT_DEBUG
3419     sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
3420   }
3421 
3422   if (rdb_open_tables.m_hash.records) {
3423     // Looks like we are getting unloaded and yet we have some open tables
3424     // left behind.
3425     error = 1;
3426   }
3427 
3428   rdb_open_tables.free_hash();
3429   mysql_mutex_destroy(&rdb_open_tables.m_mutex);
3430   mysql_mutex_destroy(&rdb_sysvars_mutex);
3431 
3432   delete rdb_collation_exceptions;
3433   mysql_mutex_destroy(&rdb_collation_data_mutex);
3434   mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);
3435 
3436   Rdb_transaction::term_mutex();
3437 
3438   for (auto &it : rdb_collation_data) {
3439     delete it;
3440     it = nullptr;
3441   }
3442 
3443   ddl_manager.cleanup();
3444   dict_manager.cleanup();
3445   cf_manager.cleanup();
3446 
3447   delete rdb;
3448   rdb = nullptr;
3449 
// Disown the cache data since we're shutting down.
// This results in memory leaks but it improves shutdown time.
// Don't disown when running under valgrind.
3453 #ifndef HAVE_purify
3454   if (rocksdb_tbl_options.block_cache) {
3455     rocksdb_tbl_options.block_cache->DisownData();
3456   }
3457 #endif /* HAVE_purify */
3458 
3459   DBUG_RETURN(error);
3460 }
3461 
/**
  @brief
  Look up (or create) the shared Rdb_table_handler for a table. The same
  table_handler is handed to every ha_rocksdb handler that opens the table;
  it holds the pieces used for table-level locking and per-table I/O
  statistics, so every open handler needs one.
*/
3469 
3470 Rdb_table_handler *
Rdb_open_tables_map::get_table_handler(const char *const table_name) {
3472   Rdb_table_handler *table_handler;
3473   uint length;
3474   char *tmp_name;
3475 
3476   DBUG_ASSERT(table_name != nullptr);
3477   length = (uint)strlen(table_name);
3478 
3479   // First, look up the table in the hash map.
3480   mysql_mutex_lock(&m_mutex);
3481   if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_hash_search(
3482             &m_hash, reinterpret_cast<const uchar *>(table_name), length)))) {
3483     // Since we did not find it in the hash map, attempt to create and add it
3484     // to the hash map.
3485     if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
3486               MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
3487               &tmp_name, length + 1, NullS)))) {
3488       // Allocating a new Rdb_table_handler and a new table name failed.
3489       mysql_mutex_unlock(&m_mutex);
3490       return nullptr;
3491     }
3492 
3493     table_handler->m_ref_count = 0;
3494     table_handler->m_table_name_length = length;
3495     table_handler->m_table_name = tmp_name;
3496     strmov(table_handler->m_table_name, table_name);
3497 
3498     if (my_hash_insert(&m_hash, reinterpret_cast<uchar *>(table_handler))) {
3499       // Inserting into the hash map failed.
3500       mysql_mutex_unlock(&m_mutex);
3501       my_free(table_handler);
3502       return nullptr;
3503     }
3504 
3505     thr_lock_init(&table_handler->m_thr_lock);
3506     table_handler->m_io_perf_read.init();
3507   }
3508   DBUG_ASSERT(table_handler->m_ref_count >= 0);
3509   table_handler->m_ref_count++;
3510 
3511   mysql_mutex_unlock(&m_mutex);
3512 
3513   return table_handler;
3514 }
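
/*
  Illustrative pairing of the reference-counted table_handler API (sketch, not
  engine code): every successful get_table_handler() call must eventually be
  matched by one release_table_handler(), typically from handler open()/close():

    Rdb_table_handler *th = rdb_open_tables.get_table_handler("./db/t1");
    if (th != nullptr) {
      // ... use th->m_thr_lock and th->m_io_perf_read while the table is open
      rdb_open_tables.release_table_handler(th);  // freed on the last release
    }
*/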
3515 
std::vector<std::string> rdb_get_open_table_names(void) {
3517   return rdb_open_tables.get_table_names();
3518 }
3519 
std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
3521   ulong i;
3522   const Rdb_table_handler *table_handler;
3523   std::vector<std::string> names;
3524 
3525   mysql_mutex_lock(&m_mutex);
3526   for (i = 0; (table_handler = reinterpret_cast<const Rdb_table_handler *>(
3527                    my_hash_const_element(&m_hash, i)));
3528        i++) {
3529     DBUG_ASSERT(table_handler != nullptr);
3530     names.push_back(table_handler->m_table_name);
3531   }
3532   DBUG_ASSERT(i == m_hash.records);
3533   mysql_mutex_unlock(&m_mutex);
3534 
3535   return names;
3536 }
3537 
void ha_rocksdb::load_auto_incr_value() {
3539   const int save_active_index = active_index;
3540   active_index = table->s->next_number_index;
3541   const uint8 save_table_status = table->status;
3542 
3543   /*
3544     load_auto_incr_value() may be called by statements that
3545     do not execute implicit commits (i.e. SHOW CREATE TABLE).
3546     index_last() creates a snapshot. When a snapshot is created
3547     here, it has to be released as well. (GitHub issue#189)
3548   */
3549   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
3550   const bool is_new_snapshot = !tx->has_snapshot();
3551 
3552   // Do a lookup. We only need index column, so it should be index-only.
3553   // (another reason to make it index-only is that table->read_set is
3554   //  not set appropriately and non-index-only lookup will not read the value)
3555   const bool save_keyread_only = m_keyread_only;
3556   m_keyread_only = true;
3557 
3558   if (!index_last(table->record[0]))
3559     update_auto_incr_val();
3560 
3561   m_keyread_only = save_keyread_only;
3562   if (is_new_snapshot) {
3563     tx->release_snapshot();
3564   }
3565 
3566   table->status = save_table_status;
3567   active_index = save_active_index;
3568 
3569   /*
3570     Do what ha_rocksdb::index_end() does.
3571     (Why don't we use index_init/index_end? class handler defines index_init
3572      as private, for some reason).
3573   */
3574   release_scan_iterator();
3575 }
3576 
3577 /* Get PK value from table->record[0]. */
3578 /*
3579   TODO(alexyang): No existing support for auto_increment on non-pk columns, see
3580   end of ha_rocksdb::create. Also see opened issue here:
3581   https://github.com/facebook/mysql-5.6/issues/153
3582 */
void ha_rocksdb::update_auto_incr_val() {
3584   Field *field;
3585   longlong new_val;
3586   field = table->key_info[table->s->next_number_index].key_part[0].field;
3587 
3588   my_bitmap_map *const old_map =
3589       dbug_tmp_use_all_columns(table, table->read_set);
3590   new_val = field->val_int() + 1;
3591   dbug_tmp_restore_column_map(table->read_set, old_map);
3592 
3593   longlong auto_incr_val = m_tbl_def->m_auto_incr_val;
3594   while (auto_incr_val < new_val &&
3595          !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val,
3596                                                            new_val)) {
3597     // Do nothing - just loop until auto_incr_val is >= new_val or
3598     // we successfully set it
3599   }
3600 }
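
/*
  The loop above is a lock-free "store maximum" on an atomic counter. A
  self-contained sketch of the same technique (assuming the member behaves
  like a std::atomic<longlong>, which the compare_exchange_weak() call
  implies):

    #include <atomic>

    static void store_max(std::atomic<longlong> &target, longlong new_val) {
      longlong cur = target.load();
      // compare_exchange_weak() reloads 'cur' on failure, so this retries
      // until either someone else stored a value >= new_val or our CAS wins.
      while (cur < new_val && !target.compare_exchange_weak(cur, new_val)) {
      }
    }
*/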
3601 
int ha_rocksdb::load_hidden_pk_value() {
3603   const int save_active_index = active_index;
3604   active_index = m_tbl_def->m_key_count - 1;
3605   const uint8 save_table_status = table->status;
3606 
3607   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
3608   const bool is_new_snapshot = !tx->has_snapshot();
3609 
3610   // Do a lookup.
3611   if (!index_last(table->record[0])) {
3612     /*
3613       Decode PK field from the key
3614     */
3615     longlong hidden_pk_id = 0;
3616     if (read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
3617       if (is_new_snapshot) {
3618         tx->release_snapshot();
3619       }
3620       return HA_ERR_INTERNAL_ERROR;
3621     }
3622 
3623     hidden_pk_id++;
3624     longlong old = m_tbl_def->m_hidden_pk_val;
3625     while (
3626         old < hidden_pk_id &&
3627         !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
3628     }
3629   }
3630 
3631   if (is_new_snapshot) {
3632     tx->release_snapshot();
3633   }
3634 
3635   table->status = save_table_status;
3636   active_index = save_active_index;
3637 
3638   release_scan_iterator();
3639 
3640   return HA_EXIT_SUCCESS;
3641 }
3642 
/* Get the next hidden PK value from the m_tbl_def->m_hidden_pk_val counter. */
longlong ha_rocksdb::update_hidden_pk_val() {
3645   DBUG_ASSERT(has_hidden_pk(table));
3646   const longlong new_val = m_tbl_def->m_hidden_pk_val++;
3647   return new_val;
3648 }
3649 
3650 /* Get the id of the hidden pk id from m_last_rowkey */
int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
3652   DBUG_ASSERT(hidden_pk_id != nullptr);
3653   DBUG_ASSERT(table != nullptr);
3654   DBUG_ASSERT(has_hidden_pk(table));
3655 
3656   rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
3657 
3658   // Get hidden primary key from old key slice
3659   Rdb_string_reader reader(&rowkey_slice);
3660   if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE)))
3661     return HA_EXIT_FAILURE;
3662 
3663   const int length = Field_longlong::PACK_LENGTH;
3664   const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
3665   if (from == nullptr) {
3666     return HA_EXIT_FAILURE; /* Mem-comparable image doesn't have enough bytes */
3667   }
3668 
3669   *hidden_pk_id = rdb_netbuf_read_uint64(&from);
3670   return HA_EXIT_SUCCESS;
3671 }
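
/*
  Key layout assumed by the reader above (sketch): a hidden-PK row key is the
  index number followed by the hidden PK id, both stored big-endian so that
  memcmp() ordering matches numeric ordering:

    [ index number : Rdb_key_def::INDEX_NUMBER_SIZE bytes ]
    [ hidden pk id : Field_longlong::PACK_LENGTH (8) bytes, big-endian ]

  Reading it boils down to:

    const uchar *p = reinterpret_cast<const uchar *>(rowkey_slice.data()) +
                     Rdb_key_def::INDEX_NUMBER_SIZE;
    const longlong id = rdb_netbuf_read_uint64(&p);  // also advances p by 8
*/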
3672 
3673 /**
3674   @brief
3675   Free lock controls. We call this whenever we close a table. If the table had
3676   the last reference to the table_handler, then we free the memory associated
3677   with it.
3678 */
3679 
void Rdb_open_tables_map::release_table_handler(
3681     Rdb_table_handler *const table_handler) {
3682   mysql_mutex_lock(&m_mutex);
3683 
3684   DBUG_ASSERT(table_handler != nullptr);
3685   DBUG_ASSERT(table_handler->m_ref_count > 0);
3686   if (!--table_handler->m_ref_count) {
    // Last reference was released. Tear down the hash entry.
3688     const auto ret __attribute__((__unused__)) =
3689         my_hash_delete(&m_hash, reinterpret_cast<uchar *>(table_handler));
3690     DBUG_ASSERT(!ret); // the hash entry must actually be found and deleted
3691     my_core::thr_lock_delete(&table_handler->m_thr_lock);
3692     my_free(table_handler);
3693   }
3694 
3695   mysql_mutex_unlock(&m_mutex);
3696 }
3697 
static handler *rocksdb_create_handler(my_core::handlerton *const hton,
3699                                        my_core::TABLE_SHARE *const table_arg,
3700                                        my_core::MEM_ROOT *const mem_root) {
3701   return new (mem_root) ha_rocksdb(hton, table_arg);
3702 }
3703 
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
3705                        my_core::TABLE_SHARE *const table_arg)
3706     : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr),
3707       m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr),
3708       m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr),
3709       m_pk_can_be_decoded(false), m_maybe_unpack_info(false),
3710       m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr),
3711       m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr),
3712       m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr),
3713       m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr),
3714       m_dup_sk_packed_tuple_old(nullptr), m_pack_buffer(nullptr),
3715       m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE),
3716       m_bulk_load_tx(nullptr), m_encoder_arr(nullptr),
3717       m_row_checksums_checked(0), m_in_rpl_delete_rows(false),
3718       m_in_rpl_update_rows(false) {
3719   // TODO(alexyang): create a valid PSI_mutex_key for this mutex
3720   mysql_mutex_init(0, &m_bulk_load_mutex, MY_MUTEX_INIT_FAST);
3721 }
3722 
3723 static const char *ha_rocksdb_exts[] = {NullS};
3724 
const char **ha_rocksdb::bas_ext() const {
3726   DBUG_ENTER_FUNC();
3727 
3728   DBUG_RETURN(ha_rocksdb_exts);
3729 }
3730 
bool ha_rocksdb::same_table(const ha_rocksdb &other) const {
3732   return m_tbl_def->base_tablename() == other.m_tbl_def->base_tablename();
3733 }
3734 
bool ha_rocksdb::init_with_fields() {
3736   DBUG_ENTER_FUNC();
3737 
3738   const uint pk = table_share->primary_key;
3739   if (pk != MAX_KEY) {
3740     const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
3741     check_keyread_allowed(pk /*PK*/, key_parts - 1, true);
3742   } else
3743     m_pk_can_be_decoded = false;
3744 
3745   cached_table_flags = table_flags();
3746 
3747   DBUG_RETURN(false); /* Ok */
3748 }
3749 
3750 /**
3751   Convert record from table->record[0] form into a form that can be written
3752   into rocksdb.
3753 
3754   @param pk_packed_slice      Packed PK tuple. We need it in order to compute
3755                               and store its CRC.
3756   @param packed_rec      OUT  Data slice with record data.
3757 */
3758 
void ha_rocksdb::convert_record_to_storage_format(
3760     const rocksdb::Slice &pk_packed_slice,
3761     Rdb_string_writer *const pk_unpack_info, rocksdb::Slice *const packed_rec) {
3762   m_storage_record.length(0);
3763 
3764   /* All NULL bits are initially 0 */
3765   m_storage_record.fill(m_null_bytes_in_rec, 0);
3766 
3767   // If a primary key may have non-empty unpack_info for certain values,
3768   // (m_maybe_unpack_info=TRUE), we write the unpack_info block. The block
3769   // itself was prepared in Rdb_key_def::pack_record.
3770   if (m_maybe_unpack_info) {
3771     m_storage_record.append(reinterpret_cast<char *>(pk_unpack_info->ptr()),
3772                             pk_unpack_info->get_current_pos());
3773   }
3774 
3775   for (uint i = 0; i < table->s->fields; i++) {
3776     /* Don't pack decodable PK key parts */
3777     if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
3778       continue;
3779     }
3780 
3781     Field *const field = table->field[i];
3782     if (m_encoder_arr[i].maybe_null()) {
3783       char *const data = (char *)m_storage_record.ptr();
3784       if (field->is_null()) {
3785         data[m_encoder_arr[i].m_null_offset] |= m_encoder_arr[i].m_null_mask;
3786         /* Don't write anything for NULL values */
3787         continue;
3788       }
3789     }
3790 
3791     if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_BLOB) {
3792       my_core::Field_blob *blob = (my_core::Field_blob *)field;
3793       /* Get the number of bytes needed to store length*/
3794       const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;
3795 
3796       /* Store the length of the value */
3797       m_storage_record.append(reinterpret_cast<char *>(blob->ptr),
3798                               length_bytes);
3799 
3800       /* Store the blob value itself */
3801       char *data_ptr;
3802       memcpy(&data_ptr, blob->ptr + length_bytes, sizeof(uchar **));
3803       m_storage_record.append(data_ptr, blob->get_length());
3804     } else if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_VARCHAR) {
3805       Field_varstring *const field_var = (Field_varstring *)field;
3806       uint data_len;
3807       /* field_var->length_bytes is 1 or 2 */
3808       if (field_var->length_bytes == 1) {
3809         data_len = field_var->ptr[0];
3810       } else {
3811         DBUG_ASSERT(field_var->length_bytes == 2);
3812         data_len = uint2korr(field_var->ptr);
3813       }
3814       m_storage_record.append(reinterpret_cast<char *>(field_var->ptr),
3815                               field_var->length_bytes + data_len);
3816     } else {
3817       /* Copy the field data */
3818       const uint len = field->pack_length_in_rec();
3819       m_storage_record.append(reinterpret_cast<char *>(field->ptr), len);
3820     }
3821   }
3822 
3823   if (should_store_row_debug_checksums()) {
3824     const uint32_t key_crc32 = my_core::crc32(
3825         0, rdb_slice_to_uchar_ptr(&pk_packed_slice), pk_packed_slice.size());
3826     const uint32_t val_crc32 =
3827         my_core::crc32(0, rdb_mysql_str_to_uchar_str(&m_storage_record),
3828                        m_storage_record.length());
3829     uchar key_crc_buf[RDB_CHECKSUM_SIZE];
3830     uchar val_crc_buf[RDB_CHECKSUM_SIZE];
3831     rdb_netbuf_store_uint32(key_crc_buf, key_crc32);
3832     rdb_netbuf_store_uint32(val_crc_buf, val_crc32);
3833     m_storage_record.append((const char *)&RDB_CHECKSUM_DATA_TAG, 1);
3834     m_storage_record.append((const char *)key_crc_buf, RDB_CHECKSUM_SIZE);
3835     m_storage_record.append((const char *)val_crc_buf, RDB_CHECKSUM_SIZE);
3836   }
3837 
3838   *packed_rec =
3839       rocksdb::Slice(m_storage_record.ptr(), m_storage_record.length());
3840 }
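
/*
  For reference, the value produced above has this layout (a sketch of the
  write path; the read path below walks it in the same order):

    [ null-byte bitmap   : m_null_bytes_in_rec bytes                        ]
    [ unpack_info block  : only when m_maybe_unpack_info is true            ]
    [ field images       : every STORE_ALL field in field order; BLOB and   ]
    [                      VARCHAR images carry their own length prefixes   ]
    [ RDB_CHECKSUM_DATA_TAG + key CRC32 + value CRC32                       ]
    [                      : only when row debug checksums are enabled      ]
*/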
3841 
3842 /*
3843   @brief
3844     Setup which fields will be unpacked when reading rows
3845 
3846   @detail
3847     Two special cases when we still unpack all fields:
3848     - When this table is being updated (m_lock_rows==RDB_LOCK_WRITE).
    - When @@rocksdb_verify_row_debug_checksums is ON. (In this mode, we need
      to read all fields to find whether there is a row checksum at the end.
      We could skip the fields instead of decoding them, but currently we do
      decoding.)
3853 
3854   @seealso
3855     ha_rocksdb::setup_field_converters()
3856     ha_rocksdb::convert_record_from_storage_format()
3857 */
void ha_rocksdb::setup_read_decoders() {
3859   m_decoders_vect.clear();
3860 
3861   int last_useful = 0;
3862   int skip_size = 0;
3863 
3864   for (uint i = 0; i < table->s->fields; i++) {
3865     // We only need the decoder if the whole record is stored.
3866     if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
3867       continue;
3868     }
3869 
3870     if (m_lock_rows == RDB_LOCK_WRITE || m_verify_row_debug_checksums ||
3871         bitmap_is_set(table->read_set, table->field[i]->field_index)) {
3872       // We will need to decode this field
3873       m_decoders_vect.push_back({&m_encoder_arr[i], true, skip_size});
3874       last_useful = m_decoders_vect.size();
3875       skip_size = 0;
3876     } else {
3877       if (m_encoder_arr[i].uses_variable_len_encoding() ||
3878           m_encoder_arr[i].maybe_null()) {
3879         // For variable-length field, we need to read the data and skip it
3880         m_decoders_vect.push_back({&m_encoder_arr[i], false, skip_size});
3881         skip_size = 0;
3882       } else {
3883         // Fixed-width field can be skipped without looking at it.
3884         // Add appropriate skip_size to the next field.
3885         skip_size += m_encoder_arr[i].m_pack_length_in_rec;
3886       }
3887     }
3888   }
3889 
  // The last few elements may be fields that are only skipped, never decoded.
  // They don't need to be read at all, so remove them from the vector.
3892   m_decoders_vect.erase(m_decoders_vect.begin() + last_useful,
3893                         m_decoders_vect.end());
3894 }
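
/*
  Worked example (hypothetical table): for t(a INT PRIMARY KEY, b VARCHAR(10),
  c INT) with only column c present in table->read_set, no write lock and no
  row checksums, the loop above typically produces:

    - a: no decoder at all (decodable PK part, storage type != STORE_ALL)
    - b: pushed with m_decode = false (variable length, must be read to skip)
    - c: pushed with m_decode = true, skip_size = 0

  If c were not needed either, b would become a trailing skip-only entry and
  the final erase() would drop the whole vector.
*/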
3895 
3896 #ifndef NDEBUG
void dbug_append_garbage_at_end(std::string &on_disk_rec) {
3898   on_disk_rec.append("abc");
3899 }
3900 
void dbug_truncate_record(std::string &on_disk_rec) { on_disk_rec.resize(0); }
3902 
void dbug_modify_rec_varchar12(std::string &on_disk_rec) {
3904   std::string res;
3905   // The record is NULL-byte followed by VARCHAR(10).
3906   // Put the NULL-byte
3907   res.append("\0", 1);
3908   // Then, add a valid VARCHAR(12) value.
3909   res.append("\xC", 1);
3910   res.append("123456789ab", 12);
3911 
3912   on_disk_rec.assign(res);
3913 }
3914 
void dbug_modify_key_varchar8(String &on_disk_rec) {
3916   std::string res;
3917   // The key starts with index number
3918   res.append(on_disk_rec.ptr(), Rdb_key_def::INDEX_NUMBER_SIZE);
3919 
3920   // Then, a mem-comparable form of a varchar(8) value.
3921   res.append("ABCDE\0\0\0\xFC", 9);
3922   on_disk_rec.length(0);
3923   on_disk_rec.append(res.data(), res.size());
3924 }
3925 
void dbug_create_err_inplace_alter() {
3927   my_printf_error(ER_UNKNOWN_ERROR,
3928                   "Intentional failure in inplace alter occurred.", MYF(0));
3929 }
3930 #endif
3931 
int ha_rocksdb::convert_record_from_storage_format(
3933     const rocksdb::Slice *const key, uchar *const buf) {
3934   DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
3935                   dbug_append_garbage_at_end(m_retrieved_record););
3936   DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
3937                   dbug_truncate_record(m_retrieved_record););
3938   DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
3939                   dbug_modify_rec_varchar12(m_retrieved_record););
3940 
3941   const rocksdb::Slice retrieved_rec_slice(&m_retrieved_record.front(),
3942                                            m_retrieved_record.size());
3943   return convert_record_from_storage_format(key, &retrieved_rec_slice, buf);
3944 }
3945 
3946 /*
3947   @brief
3948   Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
3949   storage format into buf (which can be table->record[0] or table->record[1]).
3950 
3951   @param  key   Table record's key in mem-comparable form.
3952   @param  buf   Store record in table->record[0] format here
3953 
3954   @detail
3955     If the table has blobs, the unpacked data in buf may keep pointers to the
3956     data in this->m_retrieved_record.
3957 
3958     The key is only needed to check its checksum value (the checksum is in
3959     m_retrieved_record).
3960 
3961   @seealso
3962     ha_rocksdb::setup_read_decoders()  Sets up data structures which tell which
3963     columns to decode.
3964 
3965   @return
3966     0      OK
3967     other  Error unpacking the data
3968 */
3969 
3970 int ha_rocksdb::convert_record_from_storage_format(
3971     const rocksdb::Slice *const key, const rocksdb::Slice *const value,
3972     uchar *const buf) {
3973   DBUG_ASSERT(key != nullptr);
3974   DBUG_ASSERT(buf != nullptr);
3975 
3976   Rdb_string_reader reader(value);
3977   const my_ptrdiff_t ptr_diff = buf - table->record[0];
3978 
3979   /*
3980     Decode PK fields from the key
3981   */
3982   DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_read1",
3983                   dbug_modify_key_varchar8(m_last_rowkey););
3984 
3985   const rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(),
3986                                     m_last_rowkey.length());
3987   const char *unpack_info = nullptr;
3988   uint16 unpack_info_len = 0;
3989   rocksdb::Slice unpack_slice;
3990 
3991   /* Other fields are decoded from the value */
3992   const char *null_bytes = nullptr;
3993   if (m_null_bytes_in_rec && !(null_bytes = reader.read(m_null_bytes_in_rec))) {
3994     return HA_ERR_INTERNAL_ERROR;
3995   }
3996 
3997   if (m_maybe_unpack_info) {
3998     unpack_info = reader.read(RDB_UNPACK_HEADER_SIZE);
3999 
4000     if (!unpack_info || unpack_info[0] != RDB_UNPACK_DATA_TAG) {
4001       return HA_ERR_INTERNAL_ERROR;
4002     }
4003 
4004     unpack_info_len =
4005         rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(unpack_info + 1));
4006     unpack_slice = rocksdb::Slice(unpack_info, unpack_info_len);
4007 
4008     reader.read(unpack_info_len - RDB_UNPACK_HEADER_SIZE);
4009   }
4010 
4011   if (m_pk_descr->unpack_record(table, buf, &rowkey_slice,
4012                                 unpack_info ? &unpack_slice : nullptr,
4013                                 false /* verify_checksum */)) {
4014     return HA_ERR_INTERNAL_ERROR;
4015   }
4016 
4017   for (auto it = m_decoders_vect.begin(); it != m_decoders_vect.end(); it++) {
4018     const Rdb_field_encoder *const field_dec = it->m_field_enc;
4019     const bool decode = it->m_decode;
4020     const bool isNull =
4021         field_dec->maybe_null() &&
4022         ((null_bytes[field_dec->m_null_offset] & field_dec->m_null_mask) != 0);
4023 
4024     Field *const field = table->field[field_dec->m_field_index];
4025 
4026     /* Skip the bytes we need to skip */
4027     if (it->m_skip && !reader.read(it->m_skip))
4028       return HA_ERR_INTERNAL_ERROR;
4029 
4030     if (isNull) {
4031       if (decode) {
4032         /* This sets the NULL-bit of this record */
4033         field->set_null(ptr_diff);
4034         /*
4035           Besides that, set the field value to default value. CHECKSUM TABLE
4036           depends on this.
4037         */
4038         uint field_offset = field->ptr - table->record[0];
4039         memcpy(buf + field_offset, table->s->default_values + field_offset,
4040                field->pack_length());
4041       }
4042       continue;
4043     } else {
4044       if (decode)
4045         field->set_notnull(ptr_diff);
4046     }
4047 
4048     if (field_dec->m_field_type == MYSQL_TYPE_BLOB) {
4049       my_core::Field_blob *const blob = (my_core::Field_blob *)field;
4050       /* Get the number of bytes needed to store length */
4051       const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;
4052 
4053       blob->move_field_offset(ptr_diff);
4054 
4055       const char *data_len_str;
4056       if (!(data_len_str = reader.read(length_bytes))) {
4057         blob->move_field_offset(-ptr_diff);
4058         return HA_ERR_INTERNAL_ERROR;
4059       }
4060 
4061       memcpy(blob->ptr, data_len_str, length_bytes);
4062 
4063       const uint32 data_len = blob->get_length(
4064           (uchar *)data_len_str, length_bytes, table->s->db_low_byte_first);
4065       const char *blob_ptr;
4066       if (!(blob_ptr = reader.read(data_len))) {
4067         blob->move_field_offset(-ptr_diff);
4068         return HA_ERR_INTERNAL_ERROR;
4069       }
4070 
4071       if (decode) {
4072         // set 8-byte pointer to 0, like innodb does (relevant for 32-bit
4073         // platforms)
4074         memset(blob->ptr + length_bytes, 0, 8);
4075         memcpy(blob->ptr + length_bytes, &blob_ptr, sizeof(uchar **));
4076         blob->move_field_offset(-ptr_diff);
4077       }
4078     } else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR) {
4079       Field_varstring *const field_var = (Field_varstring *)field;
4080       const char *data_len_str;
4081       if (!(data_len_str = reader.read(field_var->length_bytes)))
4082         return HA_ERR_INTERNAL_ERROR;
4083 
4084       uint data_len;
4085       /* field_var->length_bytes is 1 or 2 */
4086       if (field_var->length_bytes == 1) {
4087         data_len = (uchar)data_len_str[0];
4088       } else {
4089         DBUG_ASSERT(field_var->length_bytes == 2);
4090         data_len = uint2korr(data_len_str);
4091       }
4092       if (data_len > field->field_length) {
4093         /* The data on disk is longer than table DDL allows? */
4094         return HA_ERR_INTERNAL_ERROR;
4095       }
4096       if (!reader.read(data_len))
4097         return HA_ERR_INTERNAL_ERROR;
4098 
4099       if (decode) {
4100         memcpy(field_var->ptr + ptr_diff, data_len_str,
4101                field_var->length_bytes + data_len);
4102       }
4103     } else {
4104       const char *data_bytes;
4105       const uint len = field_dec->m_pack_length_in_rec;
4106       if (len > 0) {
4107         if ((data_bytes = reader.read(len)) == nullptr) {
4108           return HA_ERR_INTERNAL_ERROR;
4109         }
4110         if (decode)
4111           memcpy(field->ptr + ptr_diff, data_bytes, len);
4112       }
4113     }
4114   }
4115 
4116   if (m_verify_row_debug_checksums) {
4117     if (reader.remaining_bytes() == RDB_CHECKSUM_CHUNK_SIZE &&
4118         reader.read(1)[0] == RDB_CHECKSUM_DATA_TAG) {
4119       uint32_t stored_key_chksum =
4120           rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));
4121       uint32_t stored_val_chksum =
4122           rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));
4123 
4124       const uint32_t computed_key_chksum =
4125           my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size());
4126       const uint32_t computed_val_chksum =
4127           my_core::crc32(0, rdb_slice_to_uchar_ptr(value),
4128                          value->size() - RDB_CHECKSUM_CHUNK_SIZE);
4129 
4130       DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1",
4131                       stored_key_chksum++;);
4132 
4133       if (stored_key_chksum != computed_key_chksum) {
4134         m_pk_descr->report_checksum_mismatch(true, key->data(), key->size());
4135         return HA_ERR_INTERNAL_ERROR;
4136       }
4137 
4138       DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum2",
4139                       stored_val_chksum++;);
4140       if (stored_val_chksum != computed_val_chksum) {
4141         m_pk_descr->report_checksum_mismatch(false, value->data(),
4142                                              value->size());
4143         return HA_ERR_INTERNAL_ERROR;
4144       }
4145 
4146       m_row_checksums_checked++;
4147     }
4148     if (reader.remaining_bytes())
4149       return HA_ERR_INTERNAL_ERROR;
4150   }
4151 
4152   return HA_EXIT_SUCCESS;
4153 }
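
/*
  For reference, the record value layout that the function above walks,
  reconstructed from its reads (fields appear in table definition order, as
  set up by setup_read_decoders()):

    [NULL-flag bytes, m_null_bytes_in_rec of them]
    [optional unpack info: RDB_UNPACK_DATA_TAG, uint16 total length, payload]
    [per field:  BLOB    -> length_bytes, then data
                 VARCHAR -> 1 or 2 length bytes, then data
                 other   -> m_pack_length_in_rec bytes]
    [optional checksum chunk: RDB_CHECKSUM_DATA_TAG, key crc32, value crc32]
*/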
4154 
4155 void ha_rocksdb::get_storage_type(Rdb_field_encoder *const encoder,
4156                                   const uint &kp) {
4157   // STORE_SOME uses unpack_info.
4158   if (m_pk_descr->has_unpack_info(kp)) {
4159     DBUG_ASSERT(m_pk_descr->can_unpack(kp));
4160     encoder->m_storage_type = Rdb_field_encoder::STORE_SOME;
4161     m_maybe_unpack_info = true;
4162   } else if (m_pk_descr->can_unpack(kp)) {
4163     encoder->m_storage_type = Rdb_field_encoder::STORE_NONE;
4164   }
4165 }
4166 
4167 /*
4168   Setup data needed to convert table->record[] to and from record storage
4169   format.
4170 
4171   @seealso
4172      ha_rocksdb::convert_record_to_storage_format,
4173      ha_rocksdb::convert_record_from_storage_format
4174 */
4175 
4176 void ha_rocksdb::setup_field_converters() {
4177   uint i;
4178   uint null_bytes = 0;
4179   uchar cur_null_mask = 0x1;
4180 
4181   DBUG_ASSERT(m_encoder_arr == nullptr);
4182   m_encoder_arr = static_cast<Rdb_field_encoder *>(
4183       my_malloc(table->s->fields * sizeof(Rdb_field_encoder), MYF(0)));
4184   if (m_encoder_arr == nullptr) {
4185     return;
4186   }
4187 
4188   for (i = 0; i < table->s->fields; i++) {
4189     Field *const field = table->field[i];
4190     m_encoder_arr[i].m_storage_type = Rdb_field_encoder::STORE_ALL;
4191 
4192     /*
4193       Check if this field is
4194       - a part of primary key, and
4195       - it can be decoded back from its key image.
4196       If both hold, we don't need to store this field in the value part of
4197       RocksDB's key-value pair.
4198 
4199       If hidden pk exists, we skip this check since the field will never be
4200       part of the hidden pk.
4201     */
4202     if (!has_hidden_pk(table) &&
4203         field->part_of_key.is_set(table->s->primary_key)) {
4204       KEY *const pk_info = &table->key_info[table->s->primary_key];
4205       for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
4206         /* key_part->fieldnr is counted from 1 */
4207         if (field->field_index + 1 == pk_info->key_part[kp].fieldnr) {
4208           get_storage_type(&m_encoder_arr[i], kp);
4209           break;
4210         }
4211       }
4212     }
4213 
4214     m_encoder_arr[i].m_field_type = field->real_type();
4215     m_encoder_arr[i].m_field_index = i;
4216     m_encoder_arr[i].m_pack_length_in_rec = field->pack_length_in_rec();
4217 
4218     if (field->real_maybe_null()) {
4219       m_encoder_arr[i].m_null_mask = cur_null_mask;
4220       m_encoder_arr[i].m_null_offset = null_bytes;
4221       if (cur_null_mask == 0x80) {
4222         cur_null_mask = 0x1;
4223         null_bytes++;
4224       } else
4225         cur_null_mask = cur_null_mask << 1;
4226     } else {
4227       m_encoder_arr[i].m_null_mask = 0;
4228     }
4229   }
4230 
4231   /* Count the last, unfinished NULL-bits byte */
4232   if (cur_null_mask != 0x1)
4233     null_bytes++;
4234 
4235   m_null_bytes_in_rec = null_bytes;
4236 }
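
/*
  Worked example of the NULL-mask assignment above: nullable fields get masks
  0x01, 0x02, ... 0x80 within one byte before moving to the next byte, so a
  table with 10 nullable columns ends up with m_null_bytes_in_rec == 2
  (8 flags in byte 0, 2 flags in byte 1; the partially filled byte is counted
  by the "unfinished NULL-bits byte" check).
*/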
4237 
4238 int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
4239                                   const Rdb_tbl_def *const tbl_def_arg,
4240                                   bool alloc_alter_buffers) {
4241   DBUG_ENTER_FUNC();
4242 
4243   DBUG_ASSERT(m_pk_tuple == nullptr);
4244   DBUG_ASSERT(tbl_def_arg != nullptr);
4245 
4246   std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;
4247 
4248   uint key_len = 0;
4249   uint max_packed_sk_len = 0;
4250   uint pack_key_len = 0;
4251 
4252   m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
4253   if (has_hidden_pk(table_arg)) {
4254     m_pk_key_parts = 1;
4255   } else {
4256     m_pk_key_parts =
4257         table->key_info[table->s->primary_key].user_defined_key_parts;
4258     key_len = table->key_info[table->s->primary_key].key_length;
4259   }
4260 
4261   // TODO: move this into get_table_handler() ??
4262   m_pk_descr->setup(table_arg, tbl_def_arg);
4263 
4264   m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0)));
4265   if (m_pk_tuple == nullptr) {
4266     goto error;
4267   }
4268 
4269   pack_key_len = m_pk_descr->max_storage_fmt_length();
4270   m_pk_packed_tuple =
4271       reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0)));
4272   if (m_pk_packed_tuple == nullptr) {
4273     goto error;
4274   }
4275 
4276   /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
4277   max_packed_sk_len = pack_key_len;
4278   for (uint i = 0; i < table_arg->s->keys; i++) {
4279     if (i == table_arg->s->primary_key) /* Primary key was processed above */
4280       continue;
4281 
4282     // TODO: move this into get_table_handler() ??
4283     kd_arr[i]->setup(table_arg, tbl_def_arg);
4284 
4285     const uint packed_len = kd_arr[i]->max_storage_fmt_length();
4286     if (packed_len > max_packed_sk_len) {
4287       max_packed_sk_len = packed_len;
4288     }
4289   }
4290 
4291   if (!(m_sk_packed_tuple =
4292             reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4293       !(m_sk_match_prefix_buf =
4294             reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4295       !(m_sk_packed_tuple_old =
4296             reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4297       !(m_end_key_packed_tuple =
4298             reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4299       !((m_pack_buffer = reinterpret_cast<uchar *>(
4300              my_malloc(max_packed_sk_len, MYF(0)))))) {
4301     goto error;
4302   }
4303 
4304   /*
4305     If inplace alter is happening, allocate special buffers for unique
4306     secondary index duplicate checking.
4307   */
4308   if (alloc_alter_buffers &&
4309       (!(m_dup_sk_packed_tuple =
4310              reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4311        !(m_dup_sk_packed_tuple_old = reinterpret_cast<uchar *>(
4312              my_malloc(max_packed_sk_len, MYF(0)))))) {
4313     goto error;
4314   }
4315 
4316   DBUG_RETURN(HA_EXIT_SUCCESS);
4317 
4318 error:
4319   // We get here only if one of the allocations above failed. Free whatever
4320   // has been allocated so far to avoid leaking resources, then return the
4321   // error code.
4322   free_key_buffers();
4323 
4324   DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4325 }
4326 
4327 void ha_rocksdb::free_key_buffers() {
4328   my_free(m_pk_tuple);
4329   m_pk_tuple = nullptr;
4330 
4331   my_free(m_pk_packed_tuple);
4332   m_pk_packed_tuple = nullptr;
4333 
4334   my_free(m_sk_packed_tuple);
4335   m_sk_packed_tuple = nullptr;
4336 
4337   my_free(m_sk_match_prefix_buf);
4338   m_sk_match_prefix_buf = nullptr;
4339 
4340   my_free(m_sk_packed_tuple_old);
4341   m_sk_packed_tuple_old = nullptr;
4342 
4343   my_free(m_end_key_packed_tuple);
4344   m_end_key_packed_tuple = nullptr;
4345 
4346   my_free(m_pack_buffer);
4347   m_pack_buffer = nullptr;
4348 
4349   my_free(m_dup_sk_packed_tuple);
4350   m_dup_sk_packed_tuple = nullptr;
4351 
4352   my_free(m_dup_sk_packed_tuple_old);
4353   m_dup_sk_packed_tuple_old = nullptr;
4354 }
4355 
4356 void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) {
4357   DBUG_ASSERT(whitelist != nullptr);
4358 
4359 #if defined(HAVE_PSI_INTERFACE)
4360   Regex regex_handler(key_rwlock_read_free_rpl_tables);
4361 #else
4362   Regex regex_handler;
4363 #endif
4364 
4365   int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
4366   if (lower_case_table_names)
4367     flags |= MY_REG_ICASE;
4368 
4369   if (!regex_handler.compile(whitelist,
4370                              flags,
4371                              table_alias_charset)) {
4372       warn_about_bad_patterns(regex_handler, "read_free_rpl_tables");
4373   }
4374 
4375   m_use_read_free_rpl = regex_handler.match(m_tbl_def->base_tablename());
4376 }
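
/*
  Configuration sketch (the pattern is an example only): with the whitelist
  set to something like

    SET GLOBAL rocksdb_read_free_rpl_tables = 't[0-9]+|linktable.*';

  a table whose base name is "t17" or "linktable_shard3" matches the compiled
  regex and gets m_use_read_free_rpl = true, while "users" does not. Matching
  is case-insensitive when lower_case_table_names is set.
*/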
4377 
4378 int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
4379   DBUG_ENTER_FUNC();
4380 
4381   if (close()) {
4382     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4383   }
4384 
4385   m_table_handler = rdb_open_tables.get_table_handler(name);
4386 
4387   if (m_table_handler == nullptr) {
4388     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4389   }
4390 
4391   my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
4392                               nullptr);
4393   m_io_perf.init(&m_table_handler->m_table_perf_context,
4394                  &m_table_handler->m_io_perf_read, &stats);
4395 
4396   std::string fullname;
4397   if (rdb_normalize_tablename(name, &fullname)) {
4398     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4399   }
4400 
4401   m_tbl_def = ddl_manager.find(fullname);
4402   if (m_tbl_def == nullptr) {
4403     my_error(ER_INTERNAL_ERROR, MYF(0),
4404              "Attempt to open a table that is not present in RocksDB-SE data "
4405              "dictionary");
4406     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4407   }
4408   m_lock_rows = RDB_LOCK_NONE;
4409 
4410   m_key_descr_arr = m_tbl_def->m_key_descr_arr;
4411 
4412   /*
4413     Full table scan actually uses primary key
4414     (UPDATE needs to know this, otherwise it will go into infinite loop on
4415     queries like "UPDATE tbl SET pk=pk+100")
4416   */
4417   key_used_on_scan = table->s->primary_key;
4418 
4419   // close() above has already called free_key_buffers(). No need to do it here.
4420   int err = alloc_key_buffers(table, m_tbl_def);
4421 
4422   if (err) {
4423     DBUG_RETURN(err);
4424   }
4425 
4426   /*
4427     init_with_fields() is used to initialize table flags based on the field
4428     definitions in table->field[].
4429     It is called by open_binary_frm(), but that function calls the method for
4430     a temporary ha_rocksdb object which is later destroyed.
4431 
4432     If we are here in ::open(), then init_with_fields() has not been called
4433     for this object. Call it ourselves, we want all member variables to be
4434     properly initialized.
4435   */
4436   init_with_fields();
4437 
4438   setup_field_converters();
4439 
4440   info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
4441 
4442   /*
4443     The following load_XXX code calls row decode functions, and they do
4444     that without having done ::external_lock() or index_init()/rnd_init().
4445     (Note: this also means we're doing a read when there was no
4446     setup_field_converters() call)
4447 
4448     Initialize the necessary variables for them:
4449   */
4450   m_verify_row_debug_checksums = false;
4451 
4452   /* TODO: move the following to where TABLE_SHARE is opened: */
4453   if (table->found_next_number_field)
4454     load_auto_incr_value();
4455 
4456   if (has_hidden_pk(table) && load_hidden_pk_value()) {
4457     free_key_buffers();
4458     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4459   }
4460 
4461   /* Index block size in MyRocks: used by MySQL in query optimization */
4462   stats.block_size = rocksdb_tbl_options.block_size;
4463 
4464   /* Determine at open whether we can use Read Free Replication or not */
4465   set_use_read_free_rpl(THDVAR(ha_thd(), read_free_rpl_tables));
4466 
4467   DBUG_RETURN(HA_EXIT_SUCCESS);
4468 }
4469 
4470 int ha_rocksdb::close(void) {
4471   DBUG_ENTER_FUNC();
4472 
4473   m_pk_descr = nullptr;
4474   m_key_descr_arr = nullptr;
4475 
4476   free_key_buffers();
4477 
4478   my_free(m_encoder_arr);
4479   m_encoder_arr = nullptr;
4480 
4481   if (m_table_handler != nullptr) {
4482     rdb_open_tables.release_table_handler(m_table_handler);
4483     m_table_handler = nullptr;
4484   }
4485 
4486   DBUG_RETURN(HA_EXIT_SUCCESS);
4487 }
4488 
4489 bool ha_rocksdb::get_error_message(const int error, String *const buf) {
4490   DBUG_ENTER_FUNC();
4491 
4492   static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
4493                 "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
4494   static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
4495                 "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");
4496 
4497   DBUG_ASSERT(error > 0);
4498   DBUG_ASSERT(error <= HA_ERR_ROCKSDB_LAST);
4499   DBUG_ASSERT(buf != nullptr);
4500 
4501   switch (error) {
4502   case HA_ERR_ROCKSDB_PK_REQUIRED:
4503     buf->append("Table must have a PRIMARY KEY.");
4504     break;
4505   case HA_ERR_ROCKSDB_UNIQUE_NOT_SUPPORTED:
4506     buf->append("Unique indexes are not supported.");
4507     break;
4508   case HA_ERR_ROCKSDB_TOO_MANY_LOCKS:
4509     buf->append("Number of locks held reached @@rocksdb_max_row_locks.");
4510     break;
4511   default:
4512     // We can be called with values that are < HA_ERR_FIRST because most
4513     // MySQL internal functions will just return HA_EXIT_FAILURE in case of
4514     // an error.
4515     break;
4516   }
4517 
4518   DBUG_RETURN(false);
4519 }
4520 
4521 /* MyRocks supports only the following collations for indexed columns */
4522 static const std::set<const my_core::CHARSET_INFO *> RDB_INDEX_COLLATIONS = {
4523     &my_charset_bin, &my_charset_utf8_bin, &my_charset_latin1_bin};
4524 
4525 static bool
4526 rdb_is_index_collation_supported(const my_core::Field *const field) {
4527   const my_core::enum_field_types type = field->real_type();
4528   /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
4529   if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
4530       type == MYSQL_TYPE_BLOB) {
4531     return RDB_INDEX_COLLATIONS.find(field->charset()) !=
4532            RDB_INDEX_COLLATIONS.end();
4533   }
4534   return true;
4535 }
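
/*
  Examples of what the check above allows on indexed string columns (assuming
  rocksdb_strict_collation_check is ON and the table is not in the exception
  list):

    KEY (c), c VARCHAR(10) COLLATE utf8_bin        -- supported
    KEY (c), c VARBINARY(10)                       -- supported (binary)
    KEY (c), c VARCHAR(10) COLLATE utf8_general_ci -- rejected by create_cfs()

  Non-string indexed columns (INT, DATETIME, ...) always pass.
*/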
4536 
4537 /*
4538   Create structures needed for storing data in rocksdb. This is called when the
4539   table is created. The structures will be shared by all TABLE* objects.
4540 
4541   @param
4542     table_arg        Table with definition
4543     db_table         "dbname.tablename"
4544     len              strlen of the above
4545     tbl_def_arg      tbl_def whose key_descr is being created/populated
4546     old_tbl_def_arg  tbl_def from which keys are being copied over
4547                      (for use during inplace alter)
4548 
4549   @return
4550     0      - Ok
4551     other  - error, either given table ddl is not supported by rocksdb or OOM.
4552 */
4553 int ha_rocksdb::create_key_defs(
4554     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
4555     const TABLE *const old_table_arg /* = nullptr */,
4556     const Rdb_tbl_def *const old_tbl_def_arg
4557     /* = nullptr */) const {
4558   DBUG_ENTER_FUNC();
4559 
4560   DBUG_ASSERT(table_arg != nullptr);
4561   DBUG_ASSERT(table_arg->s != nullptr);
4562 
4563   uint i;
4564 
4565   /*
4566     This array needs one more slot than MAX_INDEXES, since the user can
4567     create MAX_INDEXES secondary keys and no primary key, in which case we
4568     generate a hidden one.
4569   */
4570   std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;
4571 
4572   /*
4573     NOTE: All new column families must be created before new index numbers are
4574     allocated to each key definition. See below for more details.
4575     http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
4576   */
4577   if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
4578     DBUG_RETURN(HA_EXIT_FAILURE);
4579   };
4580 
4581   if (!old_tbl_def_arg) {
4582     /*
4583       old_tbl_def doesn't exist. This means we are in the process of creating
4584       a new table.
4585 
4586       Get the index numbers (this will update the next_index_number)
4587       and create Rdb_key_def structures.
4588     */
4589     for (i = 0; i < tbl_def_arg->m_key_count; i++) {
4590       if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i],
4591                          cfs[i])) {
4592         DBUG_RETURN(HA_EXIT_FAILURE);
4593       }
4594     }
4595   } else {
4596     /*
4597       old_tbl_def exists.  This means we are creating a new tbl_def as part of
4598       in-place alter table.  Copy over existing keys from the old_tbl_def and
4599       generate the necessary new key definitions if any.
4600     */
4601     if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
4602                                 old_tbl_def_arg, cfs)) {
4603       DBUG_RETURN(HA_EXIT_FAILURE);
4604     }
4605   }
4606 
4607   DBUG_RETURN(HA_EXIT_SUCCESS);
4608 }
4609 
4610 /*
4611   Checks index parameters and creates column families needed for storing data
4612   in rocksdb if necessary.
4613 
4614   @param in
4615     table_arg     Table with definition
4616     db_table      Table name
4617     tbl_def_arg   Table def structure being populated
4618 
4619   @param out
4620     cfs           CF info for each key definition in 'key_info' order
4621 
4622   @return
4623     0      - Ok
4624     other  - error
4625 */
4626 int ha_rocksdb::create_cfs(
4627     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
4628     std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
4629   DBUG_ENTER_FUNC();
4630 
4631   DBUG_ASSERT(table_arg != nullptr);
4632   DBUG_ASSERT(table_arg->s != nullptr);
4633 
4634   char tablename_sys[NAME_LEN + 1];
4635 
4636   my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
4637                                  tablename_sys, sizeof(tablename_sys));
4638 
4639   /*
4640     The first loop checks the index parameters and creates
4641     column families if necessary.
4642   */
4643   for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
4644     rocksdb::ColumnFamilyHandle *cf_handle;
4645 
4646     if (rocksdb_strict_collation_check &&
4647         !is_hidden_pk(i, table_arg, tbl_def_arg) &&
4648         tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
4649       for (uint part = 0; part < table_arg->key_info[i].actual_key_parts;
4650            part++) {
4651         if (!rdb_is_index_collation_supported(
4652                 table_arg->key_info[i].key_part[part].field) &&
4653             !rdb_collation_exceptions->match(tablename_sys)) {
4654           std::string collation_err;
4655           for (const auto &coll : RDB_INDEX_COLLATIONS) {
4656             if (collation_err != "") {
4657               collation_err += ", ";
4658             }
4659             collation_err += coll->name;
4660           }
4661           my_printf_error(
4662               ER_UNKNOWN_ERROR, "Unsupported collation on string indexed "
4663                                 "column %s.%s Use binary collation (%s).",
4664               MYF(0), tbl_def_arg->full_tablename().c_str(),
4665               table_arg->key_info[i].key_part[part].field->field_name,
4666               collation_err.c_str());
4667           DBUG_RETURN(HA_EXIT_FAILURE);
4668         }
4669       }
4670     }
4671 
4672     /*
4673       The index comment contains the column family name. If there is no
4674       comment, we get NULL, which means "use the default column family".
4675     */
4676     const char *const comment = get_key_comment(i, table_arg, tbl_def_arg);
4677     const char *const key_name = get_key_name(i, table_arg, tbl_def_arg);
4678 
4679     if (looks_like_per_index_cf_typo(comment)) {
4680       my_error(ER_NOT_SUPPORTED_YET, MYF(0),
4681                "column family name looks like a typo of $per_index_cf");
4682       DBUG_RETURN(HA_EXIT_FAILURE);
4683     }
4684     /* Prevent create from using the system column family */
4685     if (comment && strcmp(DEFAULT_SYSTEM_CF_NAME, comment) == 0) {
4686       my_error(ER_WRONG_ARGUMENTS, MYF(0),
4687                "column family not valid for storing index data");
4688       DBUG_RETURN(HA_EXIT_FAILURE);
4689     }
4690     bool is_auto_cf_flag;
4691     cf_handle =
4692         cf_manager.get_or_create_cf(rdb, comment, tbl_def_arg->full_tablename(),
4693                                     key_name, &is_auto_cf_flag);
4694     if (!cf_handle)
4695       DBUG_RETURN(HA_EXIT_FAILURE);
4696 
4697     auto &cf = (*cfs)[i];
4698     cf.cf_handle = cf_handle;
4699     cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(comment);
4700     cf.is_auto_cf = is_auto_cf_flag;
4701   }
4702 
4703   DBUG_RETURN(HA_EXIT_SUCCESS);
4704 }
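
/*
  Usage sketch for the column-family-per-index convention handled above
  (names are examples; the "rev:" prefix for reverse column families is an
  assumption based on Rdb_cf_manager::is_cf_name_reverse()):

    CREATE TABLE t1 (
      id INT,
      a  INT,
      PRIMARY KEY (id) COMMENT 'cf_primary',
      KEY ka (a) COMMENT 'rev:cf_secondary'
    ) ENGINE=ROCKSDB;

  An empty index comment means the default column family; the system column
  family name is rejected outright.
*/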
4705 
4706 /*
4707   Create key definition needed for storing data in rocksdb during ADD index
4708   inplace operations.
4709 
4710   @param in
4711     table_arg         Table with definition
4712     tbl_def_arg       New table def structure being populated
4713     old_tbl_def_arg   Old(current) table def structure
4714     cfs               Struct array which contains column family information
4715 
4716   @return
4717     0      - Ok
4718     other  - error, either given table ddl is not supported by rocksdb or OOM.
4719 */
4720 int ha_rocksdb::create_inplace_key_defs(
4721     const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
4722     const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
4723     const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const {
4724   DBUG_ENTER_FUNC();
4725 
4726   DBUG_ASSERT(table_arg != nullptr);
4727   DBUG_ASSERT(tbl_def_arg != nullptr);
4728   DBUG_ASSERT(old_tbl_def_arg != nullptr);
4729 
4730   std::shared_ptr<Rdb_key_def> *const old_key_descr =
4731       old_tbl_def_arg->m_key_descr_arr;
4732   std::shared_ptr<Rdb_key_def> *const new_key_descr =
4733       tbl_def_arg->m_key_descr_arr;
4734   const std::unordered_map<std::string, uint> old_key_pos =
4735       get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
4736                             old_tbl_def_arg);
4737 
4738   uint i;
4739   for (i = 0; i < tbl_def_arg->m_key_count; i++) {
4740     const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));
4741     if (it != old_key_pos.end()) {
4742       /*
4743         Found matching index in old table definition, so copy it over to the
4744         new one created.
4745       */
4746       const Rdb_key_def &okd = *old_key_descr[it->second];
4747 
4748       uint16 index_dict_version = 0;
4749       uchar index_type = 0;
4750       uint16 kv_version = 0;
4751       const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
4752       if (!dict_manager.get_index_info(gl_index_id, &index_dict_version,
4753                                        &index_type, &kv_version)) {
4754         // NO_LINT_DEBUG
4755         sql_print_error("RocksDB: Could not get index information "
4756                         "for Index Number (%u,%u), table %s",
4757                         gl_index_id.cf_id, gl_index_id.index_id,
4758                         old_tbl_def_arg->full_tablename().c_str());
4759         DBUG_RETURN(HA_EXIT_FAILURE);
4760       }
4761 
4762       /*
4763         We can't use the copy constructor because we need to update the
4764         keynr within the pack_info for each field and the keyno of the keydef
4765         itself.
4766       */
4767       new_key_descr[i] = std::make_shared<Rdb_key_def>(
4768           okd.get_index_number(), i, okd.get_cf(), index_dict_version,
4769           index_type, kv_version, okd.m_is_reverse_cf, okd.m_is_auto_cf,
4770           okd.m_name.c_str(), dict_manager.get_stats(gl_index_id));
4771     } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
4772                               cfs[i])) {
4773       DBUG_RETURN(HA_EXIT_FAILURE);
4774     }
4775 
4776     DBUG_ASSERT(new_key_descr[i] != nullptr);
4777     new_key_descr[i]->setup(table_arg, tbl_def_arg);
4778   }
4779 
4780   DBUG_RETURN(HA_EXIT_SUCCESS);
4781 }
4782 
4783 std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
4784     const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
4785     const TABLE *const old_table_arg,
4786     const Rdb_tbl_def *const old_tbl_def_arg) const {
4787   DBUG_ENTER_FUNC();
4788 
4789   DBUG_ASSERT(table_arg != nullptr);
4790   DBUG_ASSERT(old_table_arg != nullptr);
4791   DBUG_ASSERT(tbl_def_arg != nullptr);
4792   DBUG_ASSERT(old_tbl_def_arg != nullptr);
4793 
4794   std::shared_ptr<Rdb_key_def> *const old_key_descr =
4795       old_tbl_def_arg->m_key_descr_arr;
4796   std::unordered_map<std::string, uint> old_key_pos;
4797   std::unordered_map<std::string, uint> new_key_pos;
4798   uint i;
4799 
4800   for (i = 0; i < tbl_def_arg->m_key_count; i++) {
4801     new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
4802   }
4803 
4804   for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
4805     if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
4806       old_key_pos[old_key_descr[i]->m_name] = i;
4807       continue;
4808     }
4809 
4810     /*
4811       In case of matching key name, need to check key parts of keys as well,
4812       in case a simultaneous drop + add is performed, where the key name is the
4813       same but the key parts are different.
4814 
4815       Example:
4816       CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
4817       ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
4818     */
4819     const KEY *const old_key = &old_table_arg->key_info[i];
4820     const auto &it = new_key_pos.find(old_key->name);
4821     if (it == new_key_pos.end()) {
4822       continue;
4823     }
4824 
4825     KEY *const new_key = &table_arg->key_info[it->second];
4826 
4827     if (!compare_key_parts(old_key, new_key)) {
4828       old_key_pos[old_key->name] = i;
4829     }
4830   }
4831 
4832   DBUG_RETURN(old_key_pos);
4833 }
4834 
4835 /* Check two keys to ensure that key parts within keys match */
4836 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
4837                                   const KEY *const new_key) const {
4838   DBUG_ENTER_FUNC();
4839 
4840   DBUG_ASSERT(old_key != nullptr);
4841   DBUG_ASSERT(new_key != nullptr);
4842 
4843   /* Skip if key parts do not match, as it is a different key */
4844   if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
4845     DBUG_RETURN(HA_EXIT_FAILURE);
4846   }
4847 
4848   /* Check to see that key parts themselves match */
4849   for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
4850     if (strcmp(old_key->key_part[i].field->field_name,
4851                new_key->key_part[i].field->field_name) != 0) {
4852       DBUG_RETURN(HA_EXIT_FAILURE);
4853     }
4854   }
4855 
4856   DBUG_RETURN(HA_EXIT_SUCCESS);
4857 }
4858 
4859 /*
4860   Create key definition needed for storing data in rocksdb.
4861   This can be called either during CREATE table or doing ADD index operations.
4862 
4863   @param in
4864     table_arg     Table with definition
4865     i             Position of index being created inside table_arg->key_info
4866     tbl_def_arg   Table def structure being populated
4867     cf_info       Struct which contains column family information
4868 
4869   @param out
4870     new_key_def  Newly created index definition.
4871 
4872   @return
4873     0      - Ok
4874     other  - error, either given table ddl is not supported by rocksdb or OOM.
4875 */
4876 int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i,
4877                                const Rdb_tbl_def *const tbl_def_arg,
4878                                std::shared_ptr<Rdb_key_def> *const new_key_def,
4879                                const struct key_def_cf_info &cf_info) const {
4880   DBUG_ENTER_FUNC();
4881 
4882   DBUG_ASSERT(new_key_def != nullptr);
4883   DBUG_ASSERT(*new_key_def == nullptr);
4884 
4885   const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
4886   const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
4887   uchar index_type;
4888   uint16_t kv_version;
4889 
4890   if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
4891     index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
4892     kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
4893   } else if (i == table_arg->s->primary_key) {
4894     index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
4895     uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
4896     kv_version = pk_latest_version;
4897   } else {
4898     index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
4899     uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
4900     kv_version = sk_latest_version;
4901   }
4902 
4903   const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
4904   *new_key_def = std::make_shared<Rdb_key_def>(
4905       index_id, i, cf_info.cf_handle, index_dict_version, index_type,
4906       kv_version, cf_info.is_reverse_cf, cf_info.is_auto_cf, key_name);
4907 
4908   DBUG_RETURN(HA_EXIT_SUCCESS);
4909 }
4910 
4911 int rdb_normalize_tablename(const std::string &tablename,
4912                             std::string *const strbuf) {
4913   DBUG_ASSERT(strbuf != nullptr);
4914 
4915   if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/') {
4916     DBUG_ASSERT(0); // We were not passed table name?
4917     return HA_ERR_INTERNAL_ERROR;
4918   }
4919 
4920   size_t pos = tablename.find_first_of('/', 2);
4921   if (pos == std::string::npos) {
4922     DBUG_ASSERT(0); // We were not passed table name?
4923     return HA_ERR_INTERNAL_ERROR;
4924   }
4925 
4926   *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
4927 
4928   return HA_EXIT_SUCCESS;
4929 }
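
/*
  Worked example: rdb_normalize_tablename("./test/t14#P#p0", &s) strips the
  leading "./", turns the remaining '/' into '.', and leaves
  s == "test.t14#P#p0". Anything not of the "./db/table" form triggers the
  assert and returns HA_ERR_INTERNAL_ERROR.
*/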
4930 
4931 /*
4932   Check to see if the user's original statement includes foreign key
4933   references
4934 */
4935 bool ha_rocksdb::contains_foreign_key(THD *const thd) {
4936   bool success;
4937   const char *str = thd_query_string(thd)->str;
4938 
4939   DBUG_ASSERT(str != nullptr);
4940 
4941   while (*str != '\0') {
4942     // Scan from our current pos looking for 'FOREIGN'
4943     str = rdb_find_in_string(str, "FOREIGN", &success);
4944     if (!success) {
4945       return false;
4946     }
4947 
4948     // Skip past the found "FOREIGN"
4949     str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
4950     DBUG_ASSERT(success);
4951 
4952     if (!my_isspace(&my_charset_bin, *str)) {
4953       return false;
4954     }
4955 
4956     // See if the next token is 'KEY'
4957     str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
4958     if (!success) {
4959       continue;
4960     }
4961 
4962     // See if the next token is '('
4963     str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
4964     if (!success) {
4965       // There is an optional index id after 'FOREIGN KEY', skip it
4966       str = rdb_skip_id(&my_charset_bin, str);
4967 
4968       // Now check for '(' again
4969       str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
4970     }
4971 
4972     // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
4973     // a foreign key clause.
4974     return success;
4975   }
4976 
4977   // We never found a valid foreign key clause
4978   return false;
4979 }
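
/*
  Examples of how the scan above classifies statements:

    "... FOREIGN KEY (parent_id) REFERENCES parent(id)"     -> true
    "... FOREIGN KEY fk_parent (parent_id) REFERENCES ..."  -> true
        (the optional index name is skipped before checking for '(')
    "CREATE TABLE foreign_key_names (a INT)"                -> false
        (the FOREIGN token is not followed by whitespace)
*/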
4980 
4981 /**
4982   @brief
4983   splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
4984   the <dbname>, <tablename> and <part_no> components.
4985 
4986   @param dbbuf returns database name/table_schema
4987   @param tablebuf returns tablename
4988   @param partitionbuf returns partition suffix if there is one
4989   @return HA_EXIT_SUCCESS on success, non-zero on failure to split
4990 */
4991 int rdb_split_normalized_tablename(const std::string &fullname,
4992                                    std::string *const db,
4993                                    std::string *const table,
4994                                    std::string *const partition) {
4995   DBUG_ASSERT(!fullname.empty());
4996 
4997 #define RDB_PARTITION_STR "#P#"
4998 
4999   /* Normalize returns dbname.tablename. */
5000   size_t dotpos = fullname.find('.');
5001 
5002   /* Invalid table name? */
5003   if (dotpos == std::string::npos) {
5004     return HA_ERR_INTERNAL_ERROR;
5005   }
5006 
5007   // Table must have a database name associated with it.
5008   DBUG_ASSERT(dotpos > 0);
5009 
5010   if (db != nullptr) {
5011     *db = fullname.substr(0, dotpos);
5012   }
5013 
5014   dotpos++;
5015 
5016   const size_t partpos =
5017       fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
5018 
5019   if (partpos != std::string::npos) {
5020     DBUG_ASSERT(partpos >= dotpos);
5021 
5022     if (table != nullptr) {
5023       *table = fullname.substr(dotpos, partpos - dotpos);
5024     }
5025 
5026     if (partition != nullptr) {
5027       *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
5028     }
5029   } else if (table != nullptr) {
5030     *table = fullname.substr(dotpos);
5031   }
5032 
5033   return HA_EXIT_SUCCESS;
5034 }
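
/*
  Worked example: rdb_split_normalized_tablename("test.t14#P#p0", &db, &tbl,
  &part) yields db == "test", tbl == "t14", part == "p0". Without the
  partition suffix, "test.t14" yields db == "test", tbl == "t14" and leaves
  *partition untouched.
*/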
5035 
5036 /**
5037   @brief
5038   create() is called to create a table. The variable name will have the name
5039   of the table.
5040 
5041   @details
5042   When create() is called you do not need to worry about
5043   opening the table. Also, the .frm file will have already been
5044   created so adjusting create_info is not necessary. You can overwrite
5045   the .frm file at this point if you wish to change the table
5046   definition, but there are no methods currently provided for doing
5047   so.
5048 
5049   Called from handler.cc by ha_create_table().
5050 
5051   @see
5052   ha_create_table() in handler.cc
5053 */
5054 
5055 int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
5056                        HA_CREATE_INFO *const create_info) {
5057   DBUG_ENTER_FUNC();
5058 
5059   int res;
5060   std::string str;
5061 
5062   THD *const thd = my_core::thd_get_current_thd();
5063 
5064   if (get_table_if_exists(name)) {
5065     if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
5066       if (delete_table(name)) {
5067         DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5068       }
5069     } else {
5070       if (rdb_normalize_tablename(name, &str)) {
5071         DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5072       }
5073       my_printf_error(
5074           ER_UNKNOWN_ERROR,
5075           "Table '%s' does not exist, but metadata information exists inside "
5076           "MyRocks. This is a sign of data inconsistency. Please check if "
5077           "'%s.frm' exists, and try to restore it if it does not exist.",
5078           MYF(0), str.c_str(), name);
5079       DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5080     }
5081   }
5082 
5083   /*
5084     Construct dbname.tablename ourselves, because partitioning
5085     passes strings like "./test/t14#P#p0" for individual partitions,
5086     while table_arg->s->table_name has none of that.
5087   */
5088   if (rdb_normalize_tablename(name, &str)) {
5089     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5090   }
5091 
5092   if (contains_foreign_key(thd)) {
5093     my_printf_error(ER_NOT_SUPPORTED_YET, "MyRocks does not currently support "
5094                                           "foreign key constraints",
5095                     MYF(0));
5096     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5097   }
5098 
5099   /*
5100     TODO(alexyang): Temporarily disable unique indexes support when there is no
5101     Primary Key
5102   */
5103   for (uint keyno = 0; keyno < table_arg->s->keys; keyno++) {
5104     if ((table_arg->key_info[keyno].flags & HA_NOSAME) &&
5105         has_hidden_pk(table_arg)) {
5106       my_printf_error(ER_UNKNOWN_ERROR, "Unique index support is disabled when "
5107                                         "the table has no primary key. ",
5108                       MYF(0));
5109       DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5110     }
5111   }
5112 
5113   bool write_err = false;
5114   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
5115   rocksdb::WriteBatch *const batch = wb.get();
5116 
5117   /* Create table/key descriptions and put them into the data dictionary */
5118   m_tbl_def = new Rdb_tbl_def(str);
5119 
5120   uint n_keys = table_arg->s->keys;
5121 
5122   /*
5123     If no primary key found, create a hidden PK and place it inside table
5124     definition
5125   */
5126   if (has_hidden_pk(table_arg)) {
5127     n_keys += 1;
5128   }
5129 
5130   m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
5131   m_tbl_def->m_key_count = n_keys;
5132   m_tbl_def->m_key_descr_arr = m_key_descr_arr;
5133 
5134   if ((res = create_key_defs(table_arg, m_tbl_def))) {
5135     goto error;
5136   }
5137 
5138   m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
5139 
5140   dict_manager.lock();
5141   write_err =
5142       ddl_manager.put_and_write(m_tbl_def, batch) || dict_manager.commit(batch);
5143   dict_manager.unlock();
5144 
5145   if (write_err) {
5146     goto error;
5147   }
5148 
5149   if (create_info->auto_increment_value)
5150     m_tbl_def->m_auto_incr_val = create_info->auto_increment_value;
5151 
5152   /*
5153     We only support auto_increment at start of the PRIMARY KEY.
5154   */
5155   // Field *field;
5156   // if ((field= table_arg->next_number_field))
5157   /* TODO mdcallag: disable this for now to let UNIQUE indexes kind of work
5158   if ((field= table_arg->found_next_number_field))
5159   {
5160     int pk= table_arg->s->primary_key;
5161     Field *pk_field= table_arg->key_info[pk].key_part[0].field;
5162     if (field->field_index !=  pk_field->field_index)
5163       DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5164   }
5165   */
5166   DBUG_RETURN(HA_EXIT_SUCCESS);
5167 
5168 error:
5169   /* Delete what we have allocated so far */
5170   delete m_tbl_def;
5171   m_tbl_def = nullptr;
5172   m_key_descr_arr = nullptr;
5173 
5174   DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
5175 }
5176 
5177 /**
5178   @note
5179   This function is used only when the table has not yet been opened, and
5180   keyread_allowed bitmap doesn't have the correct values yet.
5181 
5182   See comment in ha_rocksdb::index_flags() for details.
5183 */
5184 
5185 bool ha_rocksdb::check_keyread_allowed(uint inx, uint part,
5186                                        bool all_parts) const {
5187   bool res = true;
5188   KEY *const key_info = &table_share->key_info[inx];
5189 
5190   Rdb_field_packing dummy1;
5191   res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
5192                      key_info->key_part[part].length);
5193 
5194   if (res && all_parts) {
5195     for (uint i = 0; i < part; i++) {
5196       Field *field;
5197       if ((field = key_info->key_part[i].field)) {
5198         Rdb_field_packing dummy;
5199         if (!dummy.setup(nullptr, field, inx, i,
5200                          key_info->key_part[i].length)) {
5201           /* Cannot do index-only reads for this column */
5202           res = false;
5203           break;
5204         }
5205       }
5206     }
5207   }
5208 
5209   const uint pk = table_share->primary_key;
5210   if (inx == pk && all_parts &&
5211       part + 1 == table_share->key_info[pk].user_defined_key_parts) {
5212     m_pk_can_be_decoded = res;
5213   }
5214 
5215   return res;
5216 }
5217 
5218 int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
5219                                rocksdb::Iterator *const iter,
5220                                const bool &full_key_match,
5221                                const rocksdb::Slice &key_slice) const {
5222   DBUG_ASSERT(iter != nullptr);
5223 
5224   /*
5225     We are looking for the first record such that
5226       index_tuple= lookup_tuple.
5227     lookup_tuple may be a prefix of the index.
5228   */
5229   if (kd.m_is_reverse_cf) {
5230     if (!full_key_match) {
5231       if (!iter->Valid())
5232         iter->SeekToLast();
5233       else
5234         iter->Prev();
5235     }
5236   }
5237 
5238   if (!iter->Valid() || !kd.value_matches_prefix(iter->key(), key_slice)) {
5239     /*
5240       Got a record that is not equal to the lookup value, or even a record
5241       from another table.index.
5242     */
5243     return HA_ERR_KEY_NOT_FOUND;
5244   }
5245   return HA_EXIT_SUCCESS;
5246 }
5247 
5248 int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
5249                                 const bool &full_key_match,
5250                                 const rocksdb::Slice &key_slice) {
5251   /*
5252     We are looking for record with the biggest t.key such that
5253     t.key < lookup_tuple.
5254   */
5255   if (kd.m_is_reverse_cf) {
5256     if (m_scan_it->Valid() && full_key_match &&
5257         kd.value_matches_prefix(m_scan_it->key(), key_slice)) {
5258       /* We are using full key and we've hit an exact match */
5259       m_scan_it->Next();
5260     }
5261   } else {
5262     if (m_scan_it->Valid())
5263       m_scan_it->Prev();
5264     else
5265       m_scan_it->SeekToLast();
5266   }
5267 
5268   return m_scan_it->Valid() ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
5269 }
5270 
5271 int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
5272                                const bool &full_key_match,
5273                                const rocksdb::Slice &key_slice) {
5274   /*
5275     We are looking for the first record such that
5276 
5277       index_tuple $GT lookup_tuple
5278 
5279     with HA_READ_AFTER_KEY, $GT = '>',
5280     with HA_READ_KEY_OR_NEXT, $GT = '>='
5281   */
5282   if (kd.m_is_reverse_cf) {
5283     if (!m_scan_it->Valid()) {
5284       m_scan_it->SeekToLast();
5285     } else {
5286       /*
5287         We should step back
5288          - when not using full extended key
5289          - when using full extended key and when we've got an exact match
5290       */
5291       if (!full_key_match ||
5292           !kd.value_matches_prefix(m_scan_it->key(), key_slice)) {
5293         m_scan_it->Prev();
5294       }
5295     }
5296   }
5297 
5298   return m_scan_it->Valid() ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
5299 }
5300 
5301 int ha_rocksdb::position_to_correct_key(const Rdb_key_def &kd,
5302                                         const enum ha_rkey_function &find_flag,
5303                                         const bool &full_key_match,
5304                                         const uchar *const key,
5305                                         const key_part_map &keypart_map,
5306                                         const rocksdb::Slice &key_slice,
5307                                         bool *const move_forward) {
5308   int rc = 0;
5309 
5310   *move_forward = true;
5311 
5312   switch (find_flag) {
5313   case HA_READ_KEY_EXACT:
5314     rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice);
5315     break;
5316   case HA_READ_BEFORE_KEY:
5317     *move_forward = false;
5318     rc = read_before_key(kd, full_key_match, key_slice);
5319     if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
5320       /* The record we've got is not from this index */
5321       rc = HA_ERR_KEY_NOT_FOUND;
5322     }
5323     break;
5324   case HA_READ_AFTER_KEY:
5325   case HA_READ_KEY_OR_NEXT:
5326     rc = read_after_key(kd, full_key_match, key_slice);
5327     if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
5328       /* The record we've got is not from this index */
5329       rc = HA_ERR_KEY_NOT_FOUND;
5330     }
5331     break;
5332   case HA_READ_KEY_OR_PREV:
5333   case HA_READ_PREFIX:
5334     /* This flag is not used by the SQL layer, so we don't support it yet. */
5335     rc = HA_ERR_UNSUPPORTED;
5336     break;
5337   case HA_READ_PREFIX_LAST:
5338   case HA_READ_PREFIX_LAST_OR_PREV:
5339     *move_forward = false;
5340     /*
5341       Find the last record with the specified index prefix lookup.
5342       - HA_READ_PREFIX_LAST requires that the record has the
5343         prefix=lookup (if there are no such records,
5344         HA_ERR_KEY_NOT_FOUND should be returned).
5345       - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
5346         records with prefix=lookup, we should return the last record
5347         before that.
5348     */
5349     rc = read_before_key(kd, full_key_match, key_slice);
5350     if (rc == 0) {
5351       const rocksdb::Slice &rkey = m_scan_it->key();
5352       if (!kd.covers_key(rkey)) {
5353         /* The record we've got is not from this index */
5354         rc = HA_ERR_KEY_NOT_FOUND;
5355       } else if (find_flag == HA_READ_PREFIX_LAST) {
5356         uint size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
5357                                         key, keypart_map);
5358         rocksdb::Slice lookup_tuple(reinterpret_cast<char *>(m_sk_packed_tuple),
5359                                     size);
5360 
5361         // We need to compare the key we've got with the original search prefix.
5362         if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
5363           rc = HA_ERR_KEY_NOT_FOUND;
5364         }
5365       }
5366     }
5367     break;
5368   default:
5369     DBUG_ASSERT(0);
5370     break;
5371   }
5372 
5373   return rc;
5374 }
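
/*
  Rough mapping from SQL-layer requests to the find_flag values handled above
  (typical cases only, not an exhaustive or authoritative list):

    WHERE key = const                       -> HA_READ_KEY_EXACT
    WHERE key > const                       -> HA_READ_AFTER_KEY
    WHERE key >= const                      -> HA_READ_KEY_OR_NEXT
    WHERE key < const (reading backwards)   -> HA_READ_BEFORE_KEY
    MAX(key), last row of an index prefix   -> HA_READ_PREFIX_LAST(_OR_PREV)
*/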
5375 
5376 int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
5377                                  const enum ha_rkey_function &find_flag,
5378                                  const rocksdb::Slice &slice,
5379                                  const int &bytes_changed_by_succ,
5380                                  const key_range *const end_key,
5381                                  uint *const end_key_packed_size) {
5382   if (find_flag == HA_READ_KEY_EXACT)
5383     return slice.size();
5384 
5385   if (find_flag == HA_READ_PREFIX_LAST) {
5386     /*
5387       We have made the kd.successor(m_sk_packed_tuple) call above.
5388 
5389       The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
5390     */
5391     return slice.size() - bytes_changed_by_succ;
5392   }
5393 
5394   if (end_key) {
5395     *end_key_packed_size =
5396         kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
5397                             end_key->key, end_key->keypart_map);
5398 
5399     /*
5400       Calculate the length of the equal conditions here. The 4-byte index id
5401       is included.
5402       Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
5403        WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
5404        WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
5405       Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
5406        WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
5407     */
5408     rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
5409                              *end_key_packed_size);
5410     return slice.difference_offset(end_slice);
5411   }
5412 
5413   /*
5414     On a range scan without any end key condition, there is no equal cond,
5415     and the equal condition length is just the index_id size (4 bytes).
5416     Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
5417      WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
5418   */
5419   return Rdb_key_def::INDEX_NUMBER_SIZE;
5420 }
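/*
  Illustrative sketch (not part of the engine; the helper name below is
  hypothetical): in the range-scan branch above, the "equal condition length"
  is the number of leading bytes shared by the packed start and end keys,
  which is what rocksdb::Slice::difference_offset() reports. A minimal
  stand-alone version of that computation:
*/
static size_t rdb_example_shared_prefix_len(const char *start_key,
                                            size_t start_len,
                                            const char *end_key,
                                            size_t end_len) {
  const size_t limit = std::min(start_len, end_len);
  size_t offset = 0;
  // Walk both packed keys until the first differing byte; everything before
  // that point (including the 4-byte index id) is an equality condition.
  while (offset < limit && start_key[offset] == end_key[offset])
    offset++;
  return offset;
}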
5421 
5422 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
5423   DBUG_ASSERT(buf != nullptr);
5424 
5425   int rc;
5426   const rocksdb::Slice &rkey = m_scan_it->key();
5427   const uint pk_size = rkey.size();
5428   const char *pk_data = rkey.data();
5429 
5430   memcpy(m_pk_packed_tuple, pk_data, pk_size);
5431   m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
5432 
5433   if (m_lock_rows != RDB_LOCK_NONE) {
5434     /* We need to put a lock and re-read */
5435     rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
5436   } else {
5437     /* Unpack from the row we've read */
5438     const rocksdb::Slice &value = m_scan_it->value();
5439     rc = convert_record_from_storage_format(&rkey, &value, buf);
5440   }
5441 
5442   return rc;
5443 }
5444 
5445 int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
5446                                             const Rdb_key_def &kd,
5447                                             bool move_forward) {
5448   DBUG_ASSERT(buf != nullptr);
5449 
5450   int rc = 0;
5451   uint pk_size;
5452 
5453   if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE && !has_hidden_pk(table)) {
5454     /* Get the key columns and primary key value */
5455     const rocksdb::Slice &rkey = m_scan_it->key();
5456     pk_size =
5457         kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
5458     const rocksdb::Slice &value = m_scan_it->value();
5459     if (pk_size == RDB_INVALID_KEY_LEN ||
5460         kd.unpack_record(table, buf, &rkey, &value,
5461                          m_verify_row_debug_checksums)) {
5462       rc = HA_ERR_INTERNAL_ERROR;
5463     }
5464   } else {
5465     if (kd.m_is_reverse_cf)
5466       move_forward = !move_forward;
5467 
5468     rc = find_icp_matching_index_rec(move_forward, buf);
5469     if (!rc) {
5470       const rocksdb::Slice &rkey = m_scan_it->key();
5471       pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
5472                                          m_pk_packed_tuple);
5473       if (pk_size == RDB_INVALID_KEY_LEN) {
5474         rc = HA_ERR_INTERNAL_ERROR;
5475       } else {
5476         rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
5477       }
5478     }
5479   }
5480 
5481   if (!rc) {
5482     m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
5483                        &my_charset_bin);
5484   }
5485 
5486   return rc;
5487 }
5488 
5489 /**
5490   @note
5491     The problem with this function is that the SQL layer calls it when
5492      - the table has not yet been opened (no ::open() call done)
5493      - this->table_share already exists, but it is in the process of being
5494        filled, so some of its fields are still NULL.
5495      - In particular, table_share->key_info[inx].key_part[] is filled only up
5496        to part #part. Subsequent key parts are not yet filled.
5497 
5498     To complicate things further, SQL layer will call index_flags() with
5499     all_parts=TRUE. Essentially, we're asked to provide flags for reading
5500     keyparts whose datatype is not yet known.
5501 
5502     We work around this problem by using check_keyread_allowed(), which uses
5503     the table_share object and is careful not to step on uninitialized data.
5504 
5505     When we get a call with all_parts=TRUE, we try to analyze all parts but
5506     ignore those that have key_part->field==nullptr (these are not initialized
5507     yet).
5508 */
5509 
5510 ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
5511   DBUG_ENTER_FUNC();
5512 
5513   ulong base_flags = HA_READ_NEXT | // doesn't seem to be used
5514                      HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
5515 
5516   if (check_keyread_allowed(inx, part, all_parts))
5517     base_flags |= HA_KEYREAD_ONLY;
5518 
5519   if (inx == table_share->primary_key) {
5520     /*
5521       Index-only reads on the primary key are the same as a table scan for us.
5522       Still, we need to explicitly "allow" them, otherwise the SQL layer will
5523       miss some plans.
5524     */
5525     base_flags |= HA_KEYREAD_ONLY;
5526   } else {
5527     /*
5528       We can use Index Condition Pushdown on any key except the primary key.
5529       With the primary key, we get the (pk, record) pair immediately, so there
5530       is no place to put the ICP check.
5531     */
5532     base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
5533   }
5534 
5535   DBUG_RETURN(base_flags);
5536 }
5537 
5538 /**
5539   @brief
5540   Read next index tuple through the secondary index.
5541 
5542   @details
5543   m_scan_it points at the index key-value pair that we should read the (pk,row)
5544   pair for.
5545 */
5546 int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
5547   DBUG_ASSERT(buf != nullptr);
5548   DBUG_ASSERT(table != nullptr);
5549 
5550   /* Use STATUS_NOT_FOUND when record not found or some error occurred */
5551   table->status = STATUS_NOT_FOUND;
5552 
5553   if (m_scan_it->Valid()) {
5554     rocksdb::Slice key = m_scan_it->key();
5555 
5556     /* Check if we've run out of records of this index */
5557     if (m_key_descr_arr[keyno]->covers_key(key)) {
5558       int rc = 0;
5559 
5560       // TODO: We could check here if we have run out of the range we're scanning
5561       const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
5562           table, *m_pk_descr, &key, m_pk_packed_tuple);
5563       if (size == RDB_INVALID_KEY_LEN) {
5564         return HA_ERR_INTERNAL_ERROR;
5565       }
5566 
5567       m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
5568                          &my_charset_bin);
5569 
5570       if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE &&
5571           !has_hidden_pk(table)) {
5572         rocksdb::Slice value = m_scan_it->value();
5573         if (m_key_descr_arr[keyno]->unpack_record(
5574                 table, buf, &key, &value, m_verify_row_debug_checksums)) {
5575           return HA_ERR_INTERNAL_ERROR;
5576         }
5577       } else {
5578         rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
5579       }
5580 
5581       if (!rc) {
5582         table->status = 0;
5583         update_row_stats(ROWS_READ);
5584       }
5585       return rc;
5586     }
5587   }
5588   return HA_ERR_END_OF_FILE;
5589 }
5590 
5591 /*
5592   ha_rocksdb::read_range_first overrides handler::read_range_first.
5593   The only difference from handler::read_range_first is that
5594   ha_rocksdb::read_range_first passes end_key to the
5595   ha_rocksdb::index_read_map_impl() function.
5596 */
5597 int ha_rocksdb::read_range_first(const key_range *const start_key,
5598                                  const key_range *const end_key,
5599                                  bool eq_range_arg, bool sorted) {
5600   DBUG_ENTER_FUNC();
5601 
5602   int result;
5603 
5604   eq_range = eq_range_arg;
5605   set_end_range(end_key, RANGE_SCAN_ASC);
5606 
5607   range_key_part = table->key_info[active_index].key_part;
5608 
5609   if (!start_key) // Read first record
5610     result = ha_index_first(table->record[0]);
5611   else {
5612     if (is_using_prohibited_gap_locks(
5613             table,
5614             is_using_full_unique_key(active_index, start_key->keypart_map,
5615                                      start_key->flag))) {
5616       DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
5617     }
5618 
5619     MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
5620       result =
5621           index_read_map_impl(table->record[0], start_key->key,
5622                               start_key->keypart_map, start_key->flag, end_key);
5623     })
5624   }
5625   if (result)
5626     DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
5627 
5628   if (compare_key(end_range) <= 0) {
5629     DBUG_RETURN(HA_EXIT_SUCCESS);
5630   } else {
5631     /*
5632       The last row read does not fall within the range, so request that the
5633       storage engine release the row lock if possible.
5634     */
5635     unlock_row();
5636     DBUG_RETURN(HA_ERR_END_OF_FILE);
5637   }
5638 }
5639 
5640 int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
5641                                key_part_map keypart_map,
5642                                enum ha_rkey_function find_flag) {
5643   DBUG_ENTER_FUNC();
5644 
5645   DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
5646 }
5647 
5648 /*
5649    See storage/rocksdb/rocksdb-range-access.txt for a description of how MySQL
5650    index navigation commands are converted into RocksDB lookup commands.
5651
5652    This function takes end_key as an argument, and it is set on range scans.
5653    MyRocks needs to decide whether the prefix bloom filter can be used.
5654    To make that decision, the length of the equal condition must be
5655    calculated.
5656    On equality lookups (find_flag == HA_READ_KEY_EXACT), the equal condition
5657    length is simply rocksdb::Slice.size() of the start key. On range scans,
5658    it is the length of the prefix shared by the packed start_key and end_key
5659    (see calc_eq_cond_len()).
5660 */
5661 int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
5662                                     key_part_map keypart_map,
5663                                     enum ha_rkey_function find_flag,
5664                                     const key_range *end_key) {
5665   DBUG_ENTER_FUNC();
5666 
5667   int rc = 0;
5668 
5669   ha_statistic_increment(&SSV::ha_read_key_count);
5670   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
5671   const uint actual_key_parts = kd.get_key_parts();
5672   bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);
5673 
5674   if (!end_key)
5675     end_key = end_range;
5676 
5677   /* By default, we don't need the retrieved records to match the prefix */
5678   m_sk_match_prefix = nullptr;
5679 
5680   if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
5681       using_full_key) {
5682     /*
5683       Equality lookup over primary key, using full tuple.
5684       This is a special case, use DB::Get.
5685     */
5686     const uint size = kd.pack_index_tuple(table, m_pack_buffer,
5687                                           m_pk_packed_tuple, key, keypart_map);
5688     rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
5689     if (!rc) {
5690       update_row_stats(ROWS_READ);
5691     }
5692     DBUG_RETURN(rc);
5693   }
5694 
5695   /*
5696     A unique secondary index performs lookups without the extended key fields.
5697   */
5698   uint packed_size;
5699   if (active_index != table->s->primary_key &&
5700       table->key_info[active_index].flags & HA_NOSAME &&
5701       find_flag == HA_READ_KEY_EXACT && using_full_key) {
5702     key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
5703                                                    .user_defined_key_parts) -
5704                            1;
5705     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
5706                                       key, tmp_map);
5707     if (table->key_info[active_index].user_defined_key_parts !=
5708         kd.get_key_parts())
5709       using_full_key = false;
5710   } else {
5711     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
5712                                       key, keypart_map);
5713   }
5714 
5715   if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
5716       (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
5717     /*
5718       We are doing a point index lookup, and ICP is enabled. It is possible
5719       that this call will be followed by a ha_rocksdb->index_next_same() call.
5720
5721       Do what InnoDB does: save the lookup tuple now. We will need it in
5722       index_next_same/find_icp_matching_index_rec in order to stop scanning
5723       as soon as the index record doesn't match the lookup tuple.
5724
5725       When not using ICP, handler::index_next_same() will make sure that rows
5726       that don't match the lookup prefix are not returned, so no saved prefix
5727       is needed in that case.
5728     */
5729     m_sk_match_prefix = m_sk_match_prefix_buf;
5730     m_sk_match_length = packed_size;
5731     memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
5732   }
5733 
5734   int bytes_changed_by_succ = 0;
5735   if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
5736       find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
5737     /* See below */
5738     bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
5739   }
5740 
5741   rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
5742                        packed_size);
5743 
5744   uint end_key_packed_size = 0;
5745   const uint eq_cond_len =
5746       calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, end_key,
5747                        &end_key_packed_size);
5748 
5749   bool use_all_keys = false;
5750   if (find_flag == HA_READ_KEY_EXACT &&
5751       my_count_bits(keypart_map) == kd.get_key_parts())
5752     use_all_keys = true;
5753 
5754   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
5755   const bool is_new_snapshot = !tx->has_snapshot();
5756   // Loop as long as we get a deadlock error AND we end up creating the
5757   // snapshot here (i.e. it did not exist prior to this)
5758   for (;;) {
5759     /*
5760       This will open the iterator and position it at a record that's equal or
5761       greater than the lookup tuple.
5762     */
5763     setup_scan_iterator(kd, &slice, use_all_keys, is_ascending(kd, find_flag),
5764                         eq_cond_len);
5765 
5766     /*
5767       Once we are positioned by the Seek() call above, move to the position
5768       we really want: see storage/rocksdb/rocksdb-range-access.txt
5769     */
5770     bool move_forward;
5771     rc = position_to_correct_key(kd, find_flag, using_full_key, key,
5772                                  keypart_map, slice, &move_forward);
5773 
5774     if (rc) {
5775       /* This status is returned on any error */
5776       table->status = STATUS_NOT_FOUND;
5777       DBUG_RETURN(rc);
5778     }
5779 
5780     m_skip_scan_it_next_call = false;
5781 
5782     /*
5783       Now get the data for the row into 'buf'.  If we were using the primary
5784       key then we already have all the data we need.  For a secondary key we
5785       now need to look up the row by its primary key.
5786     */
5787     if (active_index == table->s->primary_key)
5788       rc = read_row_from_primary_key(buf);
5789     else
5790       rc = read_row_from_secondary_key(buf, kd, move_forward);
5791 
5792     if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
5793       break; /* Exit the loop */
5794 
5795     // release the snapshot and iterator so they will be regenerated
5796     tx->release_snapshot();
5797     release_scan_iterator();
5798   }
5799 
5800   if (rc) {
5801     /* the only possible error condition is record-not-found */
5802     table->status = STATUS_NOT_FOUND;
5803   } else {
5804     table->status = 0;
5805     update_row_stats(ROWS_READ);
5806   }
5807 
5808   DBUG_RETURN(rc);
5809 }
5810 
5811 /*
5812   @brief
5813   Scan the secondary index until we find an index record that satisfies ICP
5814 
5815   @param move_forward   TRUE  <=> move m_scan_it forward
5816                         FALSE <=> move m_scan_it backward
5817   @param buf            Record buffer (must be the same buffer that
5818                         pushed index condition points to, in practice
5819                         it is table->record[0])
5820 
5821   @details
5822   Move the current iterator m_scan_it until we get an index tuple that
5823   satisfies the pushed Index Condition.
5824   (if there is no pushed index condition, return right away)
5825 
5826   @return
5827     0     - Index tuple satisfies ICP, can do index read.
5828     other - error code
5829 */
5830 
5831 int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward,
5832                                             uchar *const buf) {
5833   DBUG_ASSERT(buf != nullptr);
5834 
5835   if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
5836     const Rdb_key_def &kd = *m_key_descr_arr[active_index];
5837 
5838     while (1) {
5839       if (!m_scan_it->Valid()) {
5840         table->status = STATUS_NOT_FOUND;
5841         return HA_ERR_END_OF_FILE;
5842       }
5843       const rocksdb::Slice rkey = m_scan_it->key();
5844 
5845       if (!kd.covers_key(rkey)) {
5846         table->status = STATUS_NOT_FOUND;
5847         return HA_ERR_END_OF_FILE;
5848       }
5849 
5850       if (m_sk_match_prefix) {
5851         const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
5852                                     m_sk_match_length);
5853         if (!kd.value_matches_prefix(rkey, prefix)) {
5854           table->status = STATUS_NOT_FOUND;
5855           return HA_ERR_END_OF_FILE;
5856         }
5857       }
5858 
5859       const rocksdb::Slice value = m_scan_it->value();
5860       if (kd.unpack_record(table, buf, &rkey, &value,
5861                            m_verify_row_debug_checksums)) {
5862         return HA_ERR_INTERNAL_ERROR;
5863       }
5864 
5865       const enum icp_result icp_status = check_index_cond();
5866       if (icp_status == ICP_NO_MATCH) {
5867         if (move_forward)
5868           m_scan_it->Next();
5869         else
5870           m_scan_it->Prev();
5871         continue; /* Get the next (or prev) index tuple */
5872       } else if (icp_status == ICP_OUT_OF_RANGE) {
5873         /* We have walked out of range we are scanning */
5874         table->status = STATUS_NOT_FOUND;
5875         return HA_ERR_END_OF_FILE;
5876       } else /* icp_status == ICP_MATCH */
5877       {
5878         /* Index Condition is satisfied. We have rc==0, proceed to fetch the
5879          * row. */
5880         break;
5881       }
5882       /*
5883         TODO: should we have this here, or does RocksDB handle this internally?
5884         if (my_core::thd_killed(current_thd))
5885         {
5886           rc= HA_ERR_INTERNAL_ERROR; // doesn't matter
5887           break;
5888         }
5889       */
5890     }
5891   }
5892   return HA_EXIT_SUCCESS;
5893 }
5894 
5895 int ha_rocksdb::index_read_last_map(uchar *const buf, const uchar *const key,
5896                                     key_part_map keypart_map) {
5897   DBUG_ENTER_FUNC();
5898 
5899   DBUG_RETURN(index_read_map(buf, key, keypart_map, HA_READ_PREFIX_LAST));
5900 }
5901 
5902 int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
5903   DBUG_ENTER_FUNC();
5904 
5905   DBUG_ASSERT(thd != nullptr);
5906   DBUG_ASSERT(check_opt != nullptr);
5907 
5908   const uint pk = pk_index(table, m_tbl_def);
5909   String rowkey_copy;
5910   String sec_key_copy;
5911   const char *const table_name = table->s->table_name.str;
5912 
5913   // Only when debugging: don't use snapshot when reading
5914   // Rdb_transaction *tx= get_or_create_tx(table->in_use);
5915   // tx->snapshot= nullptr;
5916 
5917   const bool save_verify_row_debug_checksums = m_verify_row_debug_checksums;
5918   m_verify_row_debug_checksums = true;
5919 
5920   /* For each secondary index, check that we can get a PK value from it */
5921   // NO_LINT_DEBUG
5922   sql_print_information("CHECKTABLE %s: Checking table %s", table_name,
5923                         table_name);
5924   ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true
5925   ha_rows row_checksums = ha_rows(-1);
5926   bool first_index = true;
5927 
5928   for (uint keyno = 0; keyno < table->s->keys; keyno++) {
5929     if (keyno != pk) {
5930       extra(HA_EXTRA_KEYREAD);
5931       ha_index_init(keyno, true);
5932       ha_rows rows = 0;
5933       ha_rows checksums = 0;
5934       if (first_index)
5935         row_checksums_at_start = m_row_checksums_checked;
5936       int res;
5937       // NO_LINT_DEBUG
5938       sql_print_information("CHECKTABLE %s:   Checking index %s", table_name,
5939                             table->key_info[keyno].name);
5940       while (1) {
5941         if (!rows)
5942           res = index_first(table->record[0]);
5943         else
5944           res = index_next(table->record[0]);
5945 
5946         if (res == HA_ERR_END_OF_FILE)
5947           break;
5948         if (res) {
5949           // error
5950           // NO_LINT_DEBUG
5951           sql_print_error("CHECKTABLE %s:   .. row %lld: index scan error %d",
5952                           table_name, rows, res);
5953           goto error;
5954         }
5955         rocksdb::Slice key = m_scan_it->key();
5956         sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
5957         rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
5958                          &my_charset_bin);
5959 
5960         if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
5961                 m_scan_it->value())) {
5962           checksums++;
5963         }
5964 
5965         if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
5966                                     rowkey_copy.length()))) {
5967           // NO_LINT_DEBUG
5968           sql_print_error("CHECKTABLE %s:   .. row %lld: "
5969                           "failed to fetch row by rowid",
5970                           table_name, rows);
5971           goto error;
5972         }
5973 
5974         longlong hidden_pk_id = 0;
5975         if (has_hidden_pk(table) &&
5976             read_hidden_pk_id_from_rowkey(&hidden_pk_id))
5977           goto error;
5978 
5979         /* Check if we get the same PK value */
5980         uint packed_size = m_pk_descr->pack_record(
5981             table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
5982             false, hidden_pk_id);
5983         if (packed_size != rowkey_copy.length() ||
5984             memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
5985           // NO_LINT_DEBUG
5986           sql_print_error("CHECKTABLE %s:   .. row %lld: PK value mismatch",
5987                           table_name, rows);
5988           goto print_and_error;
5989         }
5990 
5991         /* Check if we get the same secondary key value */
5992         packed_size = m_key_descr_arr[keyno]->pack_record(
5993             table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
5994             &m_sk_tails, false, hidden_pk_id);
5995         if (packed_size != sec_key_copy.length() ||
5996             memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
5997           // NO_LINT_DEBUG
5998           sql_print_error("CHECKTABLE %s:   .. row %lld: "
5999                           "secondary index value mismatch",
6000                           table_name, rows);
6001           goto print_and_error;
6002         }
6003         rows++;
6004         continue;
6005 
6006       print_and_error : {
6007         std::string buf;
6008         buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
6009                           RDB_MAX_HEXDUMP_LEN);
6010         // NO_LINT_DEBUG
6011         sql_print_error("CHECKTABLE %s:   rowkey: %s", table_name, buf.c_str());
6012 
6013         buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
6014                           RDB_MAX_HEXDUMP_LEN);
6015         // NO_LINT_DEBUG
6016         sql_print_error("CHECKTABLE %s:   record: %s", table_name, buf.c_str());
6017 
6018         buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
6019                           RDB_MAX_HEXDUMP_LEN);
6020         // NO_LINT_DEBUG
6021         sql_print_error("CHECKTABLE %s:   index: %s", table_name, buf.c_str());
6022 
6023         goto error;
6024       }
6025       }
6026       // NO_LINT_DEBUG
6027       sql_print_information("CHECKTABLE %s:   ... %lld index entries checked "
6028                             "(%lld had checksums)",
6029                             table_name, rows, checksums);
6030 
6031       if (first_index) {
6032         row_checksums = m_row_checksums_checked - row_checksums_at_start;
6033         first_index = false;
6034       }
6035       ha_index_end();
6036     }
6037   }
6038   if (row_checksums != ha_rows(-1)) {
6039     // NO_LINT_DEBUG
6040     sql_print_information("CHECKTABLE %s:   %lld table records had checksums",
6041                           table_name, row_checksums);
6042   }
6043   extra(HA_EXTRA_NO_KEYREAD);
6044 
6045   m_verify_row_debug_checksums = save_verify_row_debug_checksums;
6046   /*
6047     TODO: we should also check for PK records that are missing from the
6048     secondary indexes.
6049     For that, we need to walk through the PK and check that every PK record
6050     has a proper counterpart in each secondary index.
6051   */
6052   DBUG_RETURN(HA_ADMIN_OK);
6053 error:
6054   m_verify_row_debug_checksums = save_verify_row_debug_checksums;
6055   ha_index_or_rnd_end();
6056   extra(HA_EXTRA_NO_KEYREAD);
6057 
6058   DBUG_RETURN(HA_ADMIN_CORRUPT);
6059 }
6060 
6061 static void dbug_dump_str(FILE *const out, const char *const str, int len) {
6062   fprintf(out, "\"");
6063   for (int i = 0; i < len; i++) {
6064     if (str[i] > 32)
6065       fprintf(out, "%c", str[i]);
6066     else
6067       fprintf(out, "\\%d", str[i]);
6068   }
6069   fprintf(out, "\"");
6070 }
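/*
  Illustrative note (not part of the engine): with the escaping above, bytes
  greater than 32 are written as characters and everything else as a
  backslash followed by its decimal value, so the bytes {0x01, 'a', 'b'}
  come out as "\1ab".
*/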
6071 
6072 /*
6073   Debugging help: dump the whole database into a human-readable file.
6074   Usage:
6075     dbug_dump_database(rdb);
6076 */
6077 
6078 void dbug_dump_database(rocksdb::DB *const db) {
6079   FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
6080   if (!out)
6081     return;
6082 
6083   rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
6084   for (it->SeekToFirst(); it->Valid(); it->Next()) {
6085     rocksdb::Slice key = it->key();
6086     rocksdb::Slice val = it->value();
6087     dbug_dump_str(out, key.data(), key.size());
6088     fprintf(out, " -> ");
6089     dbug_dump_str(out, val.data(), val.size());
6090     fprintf(out, "\n");
6091   }
6092 
6093   delete it;
6094   fclose(out);
6095 }
6096 
6097 rocksdb::Status ha_rocksdb::get_for_update(
6098     Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family,
6099     const rocksdb::Slice &key, std::string *const value) const {
6100   DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
6101   const bool exclusive = m_lock_rows != RDB_LOCK_READ;
6102 
6103   rocksdb::Status s = tx->get_for_update(column_family, key, value, exclusive);
6104 
6105   // If we have a lock conflict and we are running in READ COMMITTED mode,
6106   // release and reacquire the snapshot and then retry the get_for_update().
6107   if (s.IsBusy() && !s.IsDeadlock() &&
6108       my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
6109     tx->release_snapshot();
6110     tx->acquire_snapshot(false);
6111 
6112     s = tx->get_for_update(column_family, key, value, exclusive);
6113   }
6114 
6115   return s;
6116 }
6117 
6118 /*
6119   Given a rowid (i.e. packed PK) as a parameter, get the record.
6120 
6121   @return
6122     0      - Ok
6123     other  - HA_ERR error code.
6124 */
6125 
6126 int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
6127                                  const uint rowid_size) {
6128   DBUG_ENTER_FUNC();
6129 
6130   DBUG_ASSERT(buf != nullptr);
6131   DBUG_ASSERT(rowid != nullptr);
6132   DBUG_ASSERT(table != nullptr);
6133 
6134   int rc;
6135 
6136   rocksdb::Slice key_slice(rowid, rowid_size);
6137 
6138   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
6139   DBUG_ASSERT(tx != nullptr);
6140 
6141   DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
6142   DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
6143     THD *thd = ha_thd();
6144     const char act[] = "now signal Reached "
6145                        "wait_for signal.rocksdb.get_row_by_rowid_let_running";
6146     DBUG_ASSERT(opt_debug_sync_timeout > 0);
6147     DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
6148   };);
6149 
6150   bool found;
6151   rocksdb::Status s;
6152 
6153   if (m_lock_rows == RDB_LOCK_NONE) {
6154     tx->acquire_snapshot(true);
6155     s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
6156   } else {
6157     s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
6158                        &m_retrieved_record);
6159   }
6160 
6161   if (!s.IsNotFound() && !s.ok()) {
6162     DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
6163   }
6164   found = !s.IsNotFound();
6165 
6166   table->status = STATUS_NOT_FOUND;
6167   if (found) {
6168     m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
6169     rc = convert_record_from_storage_format(&key_slice, buf);
6170     if (!rc)
6171       table->status = 0;
6172   } else {
6173     /*
6174       Note: we don't need to unlock the row. It is intentional that we keep
6175       locks on rows that don't exist.
6176     */
6177     rc = HA_ERR_KEY_NOT_FOUND;
6178   }
6179 
6180   DBUG_RETURN(rc);
6181 }
6182 
6183 int ha_rocksdb::index_next(uchar *const buf) {
6184   DBUG_ENTER_FUNC();
6185 
6186   bool moves_forward = true;
6187   ha_statistic_increment(&SSV::ha_read_next_count);
6188   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
6189     moves_forward = false;
6190   }
6191 
6192   int rc = index_next_with_direction(buf, moves_forward);
6193   if (rc == HA_ERR_KEY_NOT_FOUND)
6194     rc = HA_ERR_END_OF_FILE;
6195 
6196   DBUG_RETURN(rc);
6197 }
6198 
6199 int ha_rocksdb::index_prev(uchar *const buf) {
6200   DBUG_ENTER_FUNC();
6201 
6202   bool moves_forward = false;
6203   ha_statistic_increment(&SSV::ha_read_prev_count);
6204   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
6205     moves_forward = true;
6206   }
6207 
6208   int rc = index_next_with_direction(buf, moves_forward);
6209   if (rc == HA_ERR_KEY_NOT_FOUND)
6210     rc = HA_ERR_END_OF_FILE;
6211 
6212   DBUG_RETURN(rc);
6213 }
6214 
6215 int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
6216   DBUG_ENTER_FUNC();
6217 
6218   int rc;
6219 
6220   if (active_index == pk_index(table, m_tbl_def)) {
6221     rc = rnd_next_with_direction(buf, move_forward);
6222   } else {
6223     if (m_skip_scan_it_next_call) {
6224       m_skip_scan_it_next_call = false;
6225     } else {
6226       if (move_forward)
6227         m_scan_it->Next(); /* this call cannot fail */
6228       else
6229         m_scan_it->Prev();
6230     }
6231     rc = find_icp_matching_index_rec(move_forward, buf);
6232     if (!rc)
6233       rc = secondary_index_read(active_index, buf);
6234   }
6235 
6236   DBUG_RETURN(rc);
6237 }
6238 
6239 int ha_rocksdb::index_first(uchar *const buf) {
6240   DBUG_ENTER_FUNC();
6241 
6242   m_sk_match_prefix = nullptr;
6243   ha_statistic_increment(&SSV::ha_read_first_count);
6244   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
6245                ? index_last_intern(buf)
6246                : index_first_intern(buf);
6247   if (rc == HA_ERR_KEY_NOT_FOUND)
6248     rc = HA_ERR_END_OF_FILE;
6249 
6250   DBUG_RETURN(rc);
6251 }
6252 
6253 int ha_rocksdb::index_last(uchar *const buf) {
6254   DBUG_ENTER_FUNC();
6255 
6256   m_sk_match_prefix = nullptr;
6257   ha_statistic_increment(&SSV::ha_read_last_count);
6258   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
6259                ? index_first_intern(buf)
6260                : index_last_intern(buf);
6261   if (rc == HA_ERR_KEY_NOT_FOUND)
6262     rc = HA_ERR_END_OF_FILE;
6263 
6264   DBUG_RETURN(rc);
6265 }
6266 
6267 /*
6268   Start scanning from the "first" value.
6269 
6270   The 'first' here means "the first from the start of the key space".
6271   For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
6272 
6273   A picture of a forward-ordered keyspace (remember, the keys have the form
6274   'indexnr-keyval'; suppose the index we are at has number n):
6275 
6276       (n-1) - ...
6277       ( n )          <--- 1. (n) doesn't exist in the db but it would be here.
6278       ( n ) - aaa       <--- 2. Seek("n") will put us here on the first index
6279       ( n ) - bbb               record.
6280       ( n ) - cc
6281 
6282   So, need to do: Seek(n);
6283 
6284   A backward-ordered keyspace:
6285 
6286       (n+1) - bbb
6287       (n+1) - aaa
6288       (n+1)        <--- (n+1) doesn't exist in the db but would be here.
6289       ( n ) - ccc       <--- 1. We need to be here.
6290       ( n ) - bbb
6291       ( n ) - aaa
6292       ( n )
6293 
6294   So, need to: Seek(n+1);
6295 
6296 */
6297 
6298 int ha_rocksdb::index_first_intern(uchar *const buf) {
6299   DBUG_ENTER_FUNC();
6300 
6301   DBUG_ASSERT(buf != nullptr);
6302 
6303   uchar *key;
6304   uint key_size;
6305   int rc;
6306 
6307   if (is_pk(active_index, table, m_tbl_def)) {
6308     key = m_pk_packed_tuple;
6309   } else {
6310     key = m_sk_packed_tuple;
6311   }
6312 
6313   DBUG_ASSERT(key != nullptr);
6314 
6315   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
6316   if (kd.m_is_reverse_cf) {
6317     kd.get_supremum_key(key, &key_size);
6318   } else {
6319     kd.get_infimum_key(key, &key_size);
6320   }
6321 
6322   rocksdb::Slice index_key((const char *)key, key_size);
6323 
6324   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
6325   DBUG_ASSERT(tx != nullptr);
6326 
6327   const bool is_new_snapshot = !tx->has_snapshot();
6328   // Loop as long as we get a deadlock error AND we end up creating the
6329   // snapshot here (i.e. it did not exist prior to this)
6330   for (;;) {
6331     setup_scan_iterator(kd, &index_key, false, !kd.m_is_reverse_cf,
6332                         Rdb_key_def::INDEX_NUMBER_SIZE);
6333     m_skip_scan_it_next_call = true;
6334 
6335     rc = index_next_with_direction(buf, true);
6336     if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
6337       break; // exit the loop
6338 
6339     // release the snapshot and iterator so they will be regenerated
6340     tx->release_snapshot();
6341     release_scan_iterator();
6342   }
6343 
6344   DBUG_RETURN(rc);
6345 }
6346 
6347 /**
6348   @details
6349   Start scanning from the "last" value
6350 
6351   The 'last' here means "the last from the start of the key space".
6352   For reverse-ordered key spaces, we will actually read the smallest value.
6353 
6354   A picture of a forward-ordered keyspace (remember, the keys have the form
6355   'indexnr-keyval'; suppose we are at an index that has number n):
6356 
6357      (n-1)-something
6358      ( n )-aaa
6359      ( n )-bbb
6360      ( n )-ccc            <----------- Need to seek to here.
6361      (n+1)      <---- Doesn't exist, but would be here.
6362      (n+1)-smth, or no value at all
6363 
6364    RocksDB's Iterator::Seek($val) positions the iterator at $val or at the
6365    first value that's greater. We can't seek to "(n)-ccc" directly, because
6366    we don't know what the value of 'ccc' (the biggest record with prefix (n))
6367    is. Instead, we seek to "(n+1)", which is the least possible value that's
6368    greater than any value in index #n. Then we step one record back.
6369 
6370    So, need to:  it->Seek(n+1) || it->SeekToLast(); it->Prev();
6371 
6372    A backward-ordered keyspace:
6373 
6374       (n+1)-something
6375       ( n ) - ccc
6376       ( n ) - bbb
6377       ( n ) - aaa       <---------------- (*) Need to seek here.
6378       ( n ) <--- Doesn't exist, but would be here.
6379       (n-1)-smth, or no value at all
6380 
6381    So, need to:  it->Seek(n) || it->SeekToLast(); it->Prev();
6382 */
6383 
6384 int ha_rocksdb::index_last_intern(uchar *const buf) {
6385   DBUG_ENTER_FUNC();
6386 
6387   DBUG_ASSERT(buf != nullptr);
6388 
6389   uchar *key;
6390   uint key_size;
6391   int rc;
6392 
6393   if (is_pk(active_index, table, m_tbl_def)) {
6394     key = m_pk_packed_tuple;
6395   } else {
6396     key = m_sk_packed_tuple;
6397   }
6398 
6399   DBUG_ASSERT(key != nullptr);
6400 
6401   const Rdb_key_def &kd = *m_key_descr_arr[active_index];
6402   if (kd.m_is_reverse_cf) {
6403     kd.get_infimum_key(key, &key_size);
6404   } else {
6405     kd.get_supremum_key(key, &key_size);
6406   }
6407 
6408   rocksdb::Slice index_key((const char *)key, key_size);
6409 
6410   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
6411   DBUG_ASSERT(tx != nullptr);
6412 
6413   const bool is_new_snapshot = !tx->has_snapshot();
6414   // Loop as long as we get a deadlock error AND we end up creating the
6415   // snapshot here (i.e. it did not exist prior to this)
6416   for (;;) {
6417     setup_scan_iterator(kd, &index_key, false, kd.m_is_reverse_cf,
6418                         Rdb_key_def::INDEX_NUMBER_SIZE);
6419 
6420     if (!m_scan_it->Valid()) {
6421       /*
6422         The RocksDB keyspace has no records that follow our index; ours could
6423         be the last index in the keyspace. Navigate to the last record in the
6424         keyspace.
6425       */
6426       m_scan_it->SeekToLast();
6427     } else {
6428       /*
6429         Iterator is pointing at some record right after the end of the index.
6430         Move one record back.
6431       */
6432       m_scan_it->Prev();
6433     }
6434     m_skip_scan_it_next_call = false;
6435 
6436     if (is_pk(active_index, table, m_tbl_def)) {
6437       m_skip_scan_it_next_call = true;
6438       rc = rnd_next_with_direction(buf, false);
6439     } else {
6440       rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
6441       if (!rc)
6442         rc = secondary_index_read(active_index, buf);
6443     }
6444 
6445     if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
6446       break; /* exit the loop */
6447 
6448     // release the snapshot and iterator so they will be regenerated
6449     tx->release_snapshot();
6450     release_scan_iterator();
6451   }
6452 
6453   DBUG_RETURN(rc);
6454 }
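/*
  Illustrative sketch (not part of the engine; helper names are hypothetical):
  the Seek() positioning described in the comments above index_first_intern()
  and index_last_intern() can be pictured with an ordered std::set of
  "index_number + key_value" strings, where Seek() corresponds to
  lower_bound(). The sketch assumes the index number string is non-empty and
  that incrementing its last byte does not overflow.
*/
static const std::string *rdb_example_first_in_index(
    const std::set<std::string> &keyspace, const std::string &index_no) {
  // Seek(n): position at the first key that is >= "n".
  const auto it = keyspace.lower_bound(index_no);
  if (it == keyspace.end() || it->compare(0, index_no.size(), index_no) != 0)
    return nullptr;  // there are no records with this index number
  return &*it;
}

static const std::string *rdb_example_last_in_index(
    const std::set<std::string> &keyspace, const std::string &index_no) {
  // Seek(n+1) || SeekToLast(); Prev(): step back from the first key that
  // sorts after every key with prefix "n".
  std::string successor = index_no;
  successor.back()++;
  auto it = keyspace.lower_bound(successor);
  if (it == keyspace.begin())
    return nullptr;  // the keyspace has nothing at or before index "n"
  --it;
  if (it->compare(0, index_no.size(), index_no) != 0)
    return nullptr;  // we stepped back into a different index
  return &*it;
}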
6455 
6456 void ha_rocksdb::unlock_row() {
6457   DBUG_ENTER_FUNC();
6458 
6459   if (m_lock_rows != RDB_LOCK_NONE) {
6460     Rdb_transaction *const tx = get_or_create_tx(table->in_use);
6461     tx->release_lock(m_pk_descr->get_cf(),
6462                      std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
6463   }
6464 
6465   DBUG_VOID_RETURN;
6466 }
6467 
6468 /*
6469   Returns true if SingleDelete can be used.
6470   - Secondary indexes can always use SingleDelete.
6471   - If the index is the PRIMARY KEY and all of the columns of the table
6472     are covered by the PRIMARY KEY, SingleDelete can be used.
6473 */
6474 bool ha_rocksdb::can_use_single_delete(const uint &index) const {
6475   return (index != pk_index(table, m_tbl_def) ||
6476           (!has_hidden_pk(table) &&
6477            table->key_info[index].actual_key_parts == table->s->fields));
6478 }
6479 
6480 bool ha_rocksdb::skip_unique_check() const {
6481   /*
6482     We want to skip unique checks if:
6483       1) bulk_load is on
6484       2) the user has set the unique_checks option to 0
6485   */
6486   return THDVAR(table->in_use, bulk_load) ||
6487          my_core::thd_test_options(table->in_use, OPTION_RELAXED_UNIQUE_CHECKS);
6488 }
6489 
6490 bool ha_rocksdb::commit_in_the_middle() {
6491   return THDVAR(table->in_use, bulk_load) ||
6492          THDVAR(table->in_use, commit_in_the_middle);
6493 }
6494 
6495 /*
6496   Executes an intermediate ("bulk") commit if one is due.
6497   @retval true if the bulk commit failed
6498   @retval false if the bulk commit was skipped or succeeded
6499 */
6500 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
6501   DBUG_ASSERT(tx != nullptr);
6502   return commit_in_the_middle() &&
6503          tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
6504          tx->flush_batch();
6505 }
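/*
  Illustrative sketch (not part of the engine; the type below is
  hypothetical): "commit in the middle" simply flushes the accumulated writes
  every time their count reaches a threshold (analogous to the bulk_load_size
  session variable), so a large bulk load never accumulates one huge
  transaction. The flush callback is assumed to return true on failure,
  mirroring the convention of do_bulk_commit().
*/
struct Rdb_example_batched_writer {
  explicit Rdb_example_batched_writer(size_t threshold_arg)
      : threshold(threshold_arg) {}

  // Returns true if an intermediate flush was attempted and failed.
  bool count_write_and_maybe_flush(bool (*flush_batch)()) {
    if (++write_count < threshold)
      return false;  // keep accumulating
    write_count = 0;
    return flush_batch();  // commit what we have, keep the transaction open
  }

  size_t write_count = 0;
  size_t threshold;
};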
6506 
6507 /*
6508   If the table was created without a primary key, the SQL layer represents
6509   the primary key number as MAX_INDEXES.  Hence, this function returns true
6510   if the table does not contain a primary key. (In which case we generate a
6511   hidden 'auto-incremented' pk.)
6512 */
6513 bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
6514   DBUG_ASSERT(table != nullptr);
6515   return Rdb_key_def::table_has_hidden_pk(table);
6516 }
6517 
6518 /*
6519   Returns true if given index number is a hidden_pk.
6520   - This is used when a table is created with no primary key.
6521 */
6522 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
6523                               const Rdb_tbl_def *const tbl_def_arg) {
6524   DBUG_ASSERT(table_arg != nullptr);
6525   DBUG_ASSERT(table_arg->s != nullptr);
6526   DBUG_ASSERT(tbl_def_arg != nullptr);
6527 
6528   return (table_arg->s->primary_key == MAX_INDEXES &&
6529           index == tbl_def_arg->m_key_count - 1);
6530 }
6531 
6532 /* Returns index of primary key */
6533 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
6534                           const Rdb_tbl_def *const tbl_def_arg) {
6535   DBUG_ASSERT(table_arg != nullptr);
6536   DBUG_ASSERT(table_arg->s != nullptr);
6537   DBUG_ASSERT(tbl_def_arg != nullptr);
6538 
6539   return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
6540                                                   : table_arg->s->primary_key;
6541 }
6542 
6543 /* Returns true if given index number is a primary key */
6544 bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
6545                        const Rdb_tbl_def *const tbl_def_arg) {
6546   DBUG_ASSERT(table_arg != nullptr);
6547   DBUG_ASSERT(table_arg->s != nullptr);
6548   DBUG_ASSERT(tbl_def_arg != nullptr);
6549 
6550   return index == table_arg->s->primary_key ||
6551          is_hidden_pk(index, table_arg, tbl_def_arg);
6552 }
6553 
6554 const char *ha_rocksdb::get_key_name(const uint index,
6555                                      const TABLE *const table_arg,
6556                                      const Rdb_tbl_def *const tbl_def_arg) {
6557   DBUG_ASSERT(table_arg != nullptr);
6558 
6559   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
6560     return HIDDEN_PK_NAME;
6561   }
6562 
6563   return table_arg->key_info[index].name;
6564 }
6565 
6566 const char *ha_rocksdb::get_key_comment(const uint index,
6567                                         const TABLE *const table_arg,
6568                                         const Rdb_tbl_def *const tbl_def_arg) {
6569   DBUG_ASSERT(table_arg != nullptr);
6570 
6571   if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
6572     return nullptr;
6573   }
6574 
6575   return table_arg->key_info[index].comment.str;
6576 }
6577 
6578 int ha_rocksdb::write_row(uchar *const buf) {
6579   DBUG_ENTER_FUNC();
6580 
6581   DBUG_ASSERT(buf != nullptr);
6582   DBUG_ASSERT(buf == table->record[0]);
6583   DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
6584 
6585   ha_statistic_increment(&SSV::ha_write_count);
6586   /*
6587     Note: "buf == table->record[0]" is copied from innodb. I am not aware of
6588     any use cases where this condition is not true.
6589   */
6590   if (table->next_number_field && buf == table->record[0]) {
6591     int err;
6592     if ((err = update_auto_increment())) {
6593       DBUG_RETURN(err);
6594     }
6595   }
6596 
6597   const int rv = update_write_row(nullptr, buf, skip_unique_check());
6598 
6599   if (rv == 0) {
6600     update_row_stats(ROWS_INSERTED);
6601   }
6602 
6603   DBUG_RETURN(rv);
6604 }
6605 
6606 /**
6607   Constructs m_last_rowkey (the MyRocks key expression) from the
6608   before_update|delete image (the MySQL row expression).
6609   m_last_rowkey is normally set during lookup phase, such as
6610   rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
6611   these read functions are skipped and update_rows(), delete_rows() are
6612   called without setting m_last_rowkey. This function sets m_last_rowkey
6613   for Read Free Replication.
6614 */
6615 void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
6616   if (old_data && use_read_free_rpl()) {
6617     const int old_pk_size = m_pk_descr->pack_record(
6618         table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
6619     m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
6620                        &my_charset_bin);
6621   }
6622 }
6623 
6624 int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
6625   int size;
6626 
6627   /*
6628     Get new row key for any insert, and any update where the pk is not hidden.
6629     Row key for updates with hidden pk is handled below.
6630   */
6631   if (!has_hidden_pk(table)) {
6632     row_info->hidden_pk_id = 0;
6633 
6634     row_info->new_pk_unpack_info = &m_pk_unpack_info;
6635 
6636     size = m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data,
6637                                    m_pk_packed_tuple,
6638                                    row_info->new_pk_unpack_info, false);
6639   } else if (row_info->old_data == nullptr) {
6640     row_info->hidden_pk_id = update_hidden_pk_val();
6641     size =
6642         m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
6643   } else {
6644     /*
6645       With a hidden primary key, the rowkey for the new record will always be
6646       the same as before.
6647     */
6648     size = row_info->old_pk_slice.size();
6649     memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
6650     if (read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id)) {
6651       return HA_ERR_INTERNAL_ERROR;
6652     }
6653   }
6654 
6655   row_info->new_pk_slice =
6656       rocksdb::Slice((const char *)m_pk_packed_tuple, size);
6657 
6658   return HA_EXIT_SUCCESS;
6659 }
6660 
6661 int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id,
6662                                          const struct update_row_info &row_info,
6663                                          bool *const found,
6664                                          bool *const pk_changed) {
6665   DBUG_ASSERT(found != nullptr);
6666   DBUG_ASSERT(pk_changed != nullptr);
6667 
6668   *pk_changed = false;
6669 
6670   /*
6671     For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
6672     always require locking.
6673   */
6674   if (row_info.old_pk_slice.size() > 0) {
6675     /*
6676       If the keys are the same, then no lock is needed
6677     */
6678     if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice,
6679                                              row_info.old_pk_slice)) {
6680       *found = false;
6681       return HA_EXIT_SUCCESS;
6682     }
6683 
6684     *pk_changed = true;
6685   }
6686 
6687   /*
6688     Perform a read to determine if a duplicate entry exists. For primary
6689     keys, a point lookup will be sufficient.
6690 
6691     note: we intentionally don't set options.snapshot here. We want to read
6692     the latest committed data.
6693   */
6694 
6695   /*
6696     To prevent race conditions like the ones below, it is necessary to
6697     take a lock on the target row. get_for_update() holds a gap lock if the
6698     target key does not exist, so the scenarios below should never
6699     happen.
6700 
6701     1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
6702        -> T2 Put(overwrite) -> T2 commit
6703     2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
6704        -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
6705   */
6706   const rocksdb::Status s =
6707       get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
6708                      &m_retrieved_record);
6709   if (!s.ok() && !s.IsNotFound()) {
6710     return row_info.tx->set_status_error(table->in_use, s,
6711                                          *m_key_descr_arr[key_id], m_tbl_def);
6712   }
6713 
6714   *found = !s.IsNotFound();
6715   return HA_EXIT_SUCCESS;
6716 }
6717 
6718 int ha_rocksdb::check_and_lock_sk(const uint &key_id,
6719                                   const struct update_row_info &row_info,
6720                                   bool *const found) const {
6721   DBUG_ASSERT(found != nullptr);
6722   *found = false;
6723 
6724   /*
6725     Can skip checking this key if none of the key fields have changed.
6726   */
6727   if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
6728     return HA_EXIT_SUCCESS;
6729   }
6730 
6731   KEY *key_info = nullptr;
6732   uint n_null_fields = 0;
6733   uint user_defined_key_parts = 1;
6734 
6735   key_info = &table->key_info[key_id];
6736   user_defined_key_parts = key_info->user_defined_key_parts;
6737   /*
6738     If there are no uniqueness requirements, there's no need to obtain a
6739     lock for this key.
6740   */
6741   if (!(key_info->flags & HA_NOSAME)) {
6742     return HA_EXIT_SUCCESS;
6743   }
6744 
6745   const Rdb_key_def &kd = *m_key_descr_arr[key_id];
6746 
6747   /*
6748     Calculate the new key for obtaining the lock
6749 
6750     For unique secondary indexes, the key used for locking does not
6751     include the extended fields.
6752   */
6753   int size =
6754       kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
6755                      nullptr, false, 0, user_defined_key_parts, &n_null_fields);
6756   if (n_null_fields > 0) {
6757     /*
6758       If any fields are marked as NULL this will never match another row as
6759       to NULL never matches anything else including another NULL.
6760      */
6761     return HA_EXIT_SUCCESS;
6762   }
6763 
6764   const rocksdb::Slice new_slice =
6765       rocksdb::Slice((const char *)m_sk_packed_tuple, size);
6766 
6767   /*
6768     For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
6769     always require locking.
6770   */
6771   if (row_info.old_data != nullptr) {
6772     size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
6773                           m_sk_packed_tuple_old, nullptr, false,
6774                           row_info.hidden_pk_id, user_defined_key_parts);
6775     const rocksdb::Slice old_slice =
6776         rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);
6777 
6778     /*
6779       For updates, if the keys are the same, then no lock is needed
6780 
6781       Also check to see if the key has any fields set to NULL. If it does, then
6782       this key is unique, since NULL never equals anything, so no lock is
6783       needed.
6784     */
6785     if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) {
6786       return HA_EXIT_SUCCESS;
6787     }
6788   }
6789 
6790   /*
6791     Perform a read to determine if a duplicate entry exists - since this is
6792     a secondary index, a range scan is needed.
6793 
6794     note: we intentionally don't set options.snapshot here. We want to read
6795     the latest committed data.
6796   */
6797 
6798   const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());
6799 
6800   /*
6801     This iterator seems expensive since we need to allocate and free
6802     memory for each unique index.
6803 
6804     If this needs to be optimized, for keys without NULL fields, the
6805     extended primary key fields can be migrated to the value portion of the
6806     key. This enables using Get() instead of Seek() as in the primary key
6807     case.
6808 
6809     The bloom filter may need to be disabled for this lookup.
6810   */
6811   const bool total_order_seek = !can_use_bloom_filter(
6812       ha_thd(), kd, new_slice, all_parts_used,
6813       is_ascending(*m_key_descr_arr[key_id], HA_READ_KEY_EXACT));
6814   const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
6815 
6816   /*
6817     psergey-todo: we just need to take lock, lookups not needed:
6818   */
6819   std::string dummy_value;
6820   const rocksdb::Status s =
6821       get_for_update(row_info.tx, kd.get_cf(), new_slice, &dummy_value);
6822   if (!s.ok() && !s.IsNotFound()) {
6823     return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
6824   }
6825 
6826   rocksdb::Iterator *const iter = row_info.tx->get_iterator(
6827       kd.get_cf(), total_order_seek, fill_cache, true /* read current data */,
6828       false /* acquire snapshot */);
6829   /*
6830     Need to scan the transaction to see if there is a duplicate key.
6831     Also need to scan RocksDB and verify the key has not been deleted
6832     in the transaction.
6833   */
6834   iter->Seek(new_slice);
6835   *found = !read_key_exact(kd, iter, all_parts_used, new_slice);
6836   delete iter;
6837 
6838   return HA_EXIT_SUCCESS;
6839 }
6840 
6841 int ha_rocksdb::check_uniqueness_and_lock(
6842     const struct update_row_info &row_info, bool *const pk_changed) {
6843   /*
6844     Go through each index and determine if the index has uniqueness
6845     requirements. If it does, then try to obtain a row lock on the new values.
6846     Once all locks have been obtained, then perform the changes needed to
6847     update/insert the row.
6848   */
6849   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
6850     bool found;
6851     int rc;
6852 
6853     if (is_pk(key_id, table, m_tbl_def)) {
6854       rc = check_and_lock_unique_pk(key_id, row_info, &found, pk_changed);
6855     } else {
6856       rc = check_and_lock_sk(key_id, row_info, &found);
6857     }
6858 
6859     if (rc != 0) {
6860       return rc;
6861     }
6862 
6863     if (found) {
6864       /* There is a row with this key already, so error out. */
6865       errkey = key_id;
6866       m_dupp_errkey = errkey;
6867       return HA_ERR_FOUND_DUPP_KEY;
6868     }
6869   }
6870 
6871   return HA_EXIT_SUCCESS;
6872 }
6873 
6874 int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
6875                                    const Rdb_key_def &index,
6876                                    const rocksdb::Slice *key,
6877                                    struct unique_sk_buf_info *sk_info) {
6878   uint n_null_fields = 0;
6879   const rocksdb::Comparator *index_comp = index.get_cf()->GetComparator();
6880 
6881   /* Get proper SK buffer. */
6882   uchar *sk_buf = sk_info->swap_and_get_sk_buf();
6883 
6884   /* Get memcmp form of sk without extended pk tail */
6885   uint sk_memcmp_size =
6886       index.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
6887 
6888   sk_info->sk_memcmp_key =
6889       rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
6890 
6891   if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
6892       index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
6893           0) {
6894     return 1;
6895   }
6896 
6897   sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
6898   return 0;
6899 }
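
/*
  A hedged sketch of how check_duplicate_sk() above is typically driven (the
  loop below is illustrative pseudocode, not code from this file): when
  secondary-key entries are produced in index order, comparing each key's
  mem-comparable prefix with the previous one is enough to spot duplicates:

    struct unique_sk_buf_info sk_info;   // holds current/previous memcmp keys
    for each packed SK slice 'sk' in sorted order {
      if (check_duplicate_sk(table, kd, &sk, &sk_info))
        return HA_ERR_FOUND_DUPP_KEY;    // two consecutive equal keys
    }

  Keys containing NULL fields are never reported as duplicates (see the
  n_null_fields check), matching SQL unique-index semantics.
*/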
6900 
6901 int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
6902                               const rocksdb::Slice &key,
6903                               const rocksdb::Slice &value) {
6904   rocksdb::ColumnFamilyHandle *const cf = kd.get_cf();
6905   DBUG_ASSERT(cf != nullptr);
6906 
6907   if (m_sst_info == nullptr) {
6908     m_sst_info = std::make_shared<Rdb_sst_info>(
6909         rdb, m_table_handler->m_table_name, kd.get_name(), cf,
6910         rocksdb_db_options, THDVAR(ha_thd(), trace_sst_api));
6911     tx->start_bulk_load(this);
6912     m_bulk_load_tx = tx;
6913   }
6914 
6915   DBUG_ASSERT(m_sst_info != nullptr);
6916 
6917   int rc = m_sst_info->put(key, value);
6918   if (rc != 0) {
6919     my_printf_error(ER_UNKNOWN_ERROR,
6920                     "Failed to add a key to sst file writer(%s)", MYF(0),
6921                     m_sst_info->error_message().c_str());
6922     rc = HA_ERR_INTERNAL_ERROR;
6923   }
6924 
6925   return rc;
6926 }
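
/*
  For context, a minimal usage sketch (assuming the THDVAR above maps to a
  session variable named rocksdb_bulk_load and that the build has
  rocksdb_enable_bulk_load_api on; these names are assumptions, not verified
  here):

    SET SESSION rocksdb_bulk_load = 1;
    LOAD DATA INFILE 'data.txt' INTO TABLE t1;  -- PK rows go through bulk_load_key()
    SET SESSION rocksdb_bulk_load = 0;          -- flushes via finalize_bulk_load()

  While bulk load is active, primary-key data is written to SST files through
  Rdb_sst_info instead of the transaction's write batch.
*/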
6927 
6928 int ha_rocksdb::finalize_bulk_load() {
6929   int rc = 0;
6930 
6931   /* Skip if there are no possible ongoing bulk loads */
6932   if (m_sst_info == nullptr && m_bulk_load_tx == nullptr) {
6933     return rc;
6934   }
6935 
6936   mysql_mutex_lock(&m_bulk_load_mutex);
6937 
6938   /*
6939     We need this check because it's possible that m_sst_info has been
6940     flushed and cleared by another thread by the time the mutex has been
6941     acquired.
6942   */
6943   if (m_sst_info != nullptr) {
6944     rc = m_sst_info->commit();
6945     if (rc != 0) {
6946       my_printf_error(ER_UNKNOWN_ERROR,
6947                       "Failed to commit bulk loaded sst file to the "
6948                       "data store (%s)",
6949                       MYF(0), m_sst_info->error_message().c_str());
6950       rc = HA_ERR_INTERNAL_ERROR;
6951     }
6952 
6953     m_sst_info = nullptr;
6954     m_bulk_load_tx->end_bulk_load(this);
6955     m_bulk_load_tx = nullptr;
6956   }
6957 
6958   mysql_mutex_unlock(&m_bulk_load_mutex);
6959   return rc;
6960 }
6961 
6962 int ha_rocksdb::update_pk(const Rdb_key_def &kd,
6963                           const struct update_row_info &row_info,
6964                           const bool &pk_changed) {
6965   const uint key_id = kd.get_keyno();
6966   const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
6967   if (!hidden_pk && pk_changed) {
6968     /*
6969       The old key needs to be deleted.
6970     */
6971     const rocksdb::Status s = delete_or_singledelete(
6972         key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
6973     if (!s.ok()) {
6974       return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
6975     }
6976   }
6977 
6978   if (table->next_number_field) {
6979     update_auto_incr_val();
6980   }
6981 
6982   rocksdb::Slice value_slice;
6983   convert_record_to_storage_format(row_info.new_pk_slice,
6984                                    row_info.new_pk_unpack_info, &value_slice);
6985 
6986   int rc = 0;
6987   const auto cf = m_pk_descr->get_cf();
6988   if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
6989       !hidden_pk) {
6990     /*
6991       Write the primary key directly to an SST file using an SstFileWriter
6992      */
6993     rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice);
6994   } else if (row_info.skip_unique_check) {
6995     /*
6996       It is the responsibility of the user to make sure that the data being
6997       inserted doesn't violate any unique keys.
6998     */
6999     row_info.tx->get_blind_write_batch()->Put(cf, row_info.new_pk_slice,
7000                                               value_slice);
7001   } else if (row_info.tx->m_ddl_transaction) {
7002     /*
7003       DDL statement must check for unique key conflicts. For example:
7004       ALTER TABLE tbl DROP PRIMARY KEY, ADD PRIMARY KEY(non_unique_column)
7005     */
7006     row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
7007                                                 value_slice);
7008   } else {
7009     const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice);
7010     if (!s.ok()) {
7011       if (s.IsBusy()) {
7012         errkey = table->s->primary_key;
7013         m_dupp_errkey = errkey;
7014         rc = HA_ERR_FOUND_DUPP_KEY;
7015       } else {
7016         rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
7017                                            m_tbl_def);
7018       }
7019     }
7020   }
7021 
7022   return rc;
7023 }
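
/*
  Summary of the write paths chosen in update_pk() above (a restatement of the
  branches, added for readability):

    bulk load enabled, real PK   -> SST file via bulk_load_key()
    skip_unique_check            -> blind WriteBatch (no read-your-own-writes)
    DDL transaction              -> indexed WriteBatchWithIndex
    otherwise                    -> tx->put(), where an IsBusy() status is
                                    mapped to HA_ERR_FOUND_DUPP_KEY
*/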
7024 
7025 int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
7026                           const struct update_row_info &row_info) {
7027   int new_packed_size;
7028   int old_packed_size;
7029 
7030   rocksdb::Slice new_key_slice;
7031   rocksdb::Slice new_value_slice;
7032   rocksdb::Slice old_key_slice;
7033 
7034   const uint key_id = kd.get_keyno();
7035   /*
7036     Can skip updating this key if none of the key fields have changed.
7037   */
7038   if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
7039     return HA_EXIT_SUCCESS;
7040   }
7041 
7042   const bool store_row_debug_checksums = should_store_row_debug_checksums();
7043 
7044   new_packed_size = kd.pack_record(
7045       table_arg, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
7046       &m_sk_tails, store_row_debug_checksums, row_info.hidden_pk_id);
7047 
7048   if (row_info.old_data != nullptr) {
7049     // The old value
7050     old_packed_size = kd.pack_record(
7051         table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
7052         &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id);
7053 
7054     /*
7055       Check if we are going to write the same value. This can happen when
7056       one does
7057         UPDATE tbl SET col='foo'
7058       and we are looking at the row that already has col='foo'.
7059 
7060       We also need to compare the unpack info. Suppose the collation is
7061       case-insensitive, and the unpack info records whether each letter was
7062       uppercase or lowercase.  Then both 'foo' and 'FOO' will have the same
7063       key value, but different data in unpack_info.
7064 
7065       (note: anyone changing bytewise_compare should take this code into
7066       account)
7067     */
7068     if (old_packed_size == new_packed_size &&
7069         m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
7070         memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
7071             0 &&
7072         memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
7073                m_sk_tails.get_current_pos()) == 0) {
7074       return HA_EXIT_SUCCESS;
7075     }
7076 
7077     /*
7078       Deleting entries from the secondary index should skip locking, but
7079       should still be visible to the transaction.
7080       (also note that DDL statements do not delete rows, so this is not a DDL
7081        statement)
7082     */
7083     old_key_slice = rocksdb::Slice(
7084         reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
7085 
7086     row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
7087                                                          old_key_slice);
7088   }
7089 
7090   /*
7091     We're writing a new entry for the secondary key. We can skip locking;
7092     we should write to
7093     - the WriteBatchWithIndex normally (so that the transaction sees the new row)
7094     - the non-indexed WriteBatch, when we don't need to see the new row:
7095        = when doing a DDL operation and writing to a non-unique index, or
7096        = when doing a bulk load
7097   */
7098   rocksdb::WriteBatchBase *write_batch;
7099   if ((row_info.tx->m_ddl_transaction &&
7100        !(table_arg->key_info[key_id].flags & HA_NOSAME)) ||
7101       row_info.skip_unique_check) {
7102     write_batch = row_info.tx->get_blind_write_batch();
7103   } else {
7104     write_batch = row_info.tx->get_indexed_write_batch();
7105   }
7106 
7107   new_key_slice = rocksdb::Slice(
7108       reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
7109   new_value_slice =
7110       rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
7111                      m_sk_tails.get_current_pos());
7112 
7113   write_batch->Put(kd.get_cf(), new_key_slice, new_value_slice);
7114 
7115   return HA_EXIT_SUCCESS;
7116 }
7117 
7118 int ha_rocksdb::update_indexes(const struct update_row_info &row_info,
7119                                const bool &pk_changed) {
7120   int rc;
7121 
7122   for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
7123     const Rdb_key_def &kd = *m_key_descr_arr[key_id];
7124     if (is_pk(key_id, table, m_tbl_def)) {
7125       rc = update_pk(kd, row_info, pk_changed);
7126     } else {
7127       rc = update_sk(table, kd, row_info);
7128     }
7129 
7130     if (rc != 0) {
7131       return rc;
7132     }
7133   }
7134 
7135   return HA_EXIT_SUCCESS;
7136 }
7137 
7138 int ha_rocksdb::update_write_row(const uchar *const old_data,
7139                                  const uchar *const new_data,
7140                                  const bool skip_unique_check) {
7141   DBUG_ENTER_FUNC();
7142 
7143   bool pk_changed = false;
7144   struct update_row_info row_info;
7145 
7146   row_info.old_data = old_data;
7147   row_info.new_data = new_data;
7148   row_info.skip_unique_check = skip_unique_check;
7149   row_info.new_pk_unpack_info = nullptr;
7150 
7151   set_last_rowkey(old_data);
7152 
7153   row_info.tx = get_or_create_tx(table->in_use);
7154 
7155   if (old_data != nullptr) {
7156     row_info.old_pk_slice =
7157         rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());
7158 
7159     /* Determine which indexes need updating. */
7160     calc_updated_indexes();
7161   }
7162 
7163   /*
7164     Get the new row key into row_info.new_pk_slice
7165    */
7166   int rc = get_pk_for_update(&row_info);
7167   if (rc != 0) {
7168     DBUG_RETURN(rc);
7169   }
7170 
7171   if (!skip_unique_check) {
7172     /*
7173       Check to see if we are going to have failures because of unique
7174       keys.  Also lock the appropriate key values.
7175     */
7176     rc = check_uniqueness_and_lock(row_info, &pk_changed);
7177     if (rc != 0) {
7178       DBUG_RETURN(rc);
7179     }
7180   }
7181 
7182   DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");
7183 
7184   /*
7185     At this point, all locks have been obtained, and all checks for duplicate
7186     keys have been performed. No further errors can be allowed to occur from
7187     here because updates to the transaction will be made and those updates
7188     cannot be easily removed without rolling back the entire transaction.
7189   */
7190   rc = update_indexes(row_info, pk_changed);
7191   if (rc != 0) {
7192     DBUG_RETURN(rc);
7193   }
7194 
7195   if (do_bulk_commit(row_info.tx)) {
7196     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7197   }
7198 
7199   DBUG_RETURN(HA_EXIT_SUCCESS);
7200 }
7201 
7202 /*
7203   Open a cursor and position it at the passed record
7204 */
7205 
7206 void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
7207                                      rocksdb::Slice *const slice,
7208                                      const bool use_all_keys,
7209                                      const bool is_ascending,
7210                                      const uint eq_cond_len) {
7211   DBUG_ASSERT(slice != nullptr);
7212   DBUG_ASSERT(slice->size() >= eq_cond_len);
7213 
7214   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7215 
7216   bool skip_bloom = true;
7217 
7218   const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
7219   if (can_use_bloom_filter(ha_thd(), kd, eq_cond, use_all_keys, is_ascending)) {
7220     skip_bloom = false;
7221   }
7222 
7223   /*
7224     In some cases, setup_scan_iterator() is called multiple times from
7225     the same query, but the bloom filter cannot always be used.
7226     Consider the following example, where id2 is VARCHAR(30) and the
7227     PRIMARY KEY is (id1, id2):
7228      select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
7229     '100');
7230     In this case, setup_scan_iterator() is called twice: the first time for
7231     (id1, id2)=(100, '00000000000000000000') and the second time for
7232     (100, '100').
7233     If the prefix bloom filter length is 24 bytes, the prefix bloom filter
7234     can be used for the first condition but not for the second one.
7235     If the bloom filter condition changes, it is currently necessary to
7236     destroy and re-create the Iterator.
7237   */
7240   if (m_scan_it_skips_bloom != skip_bloom) {
7241     release_scan_iterator();
7242   }
7243 
7244   /*
7245     SQL layer can call rnd_init() multiple times in a row.
7246     In that case, re-use the iterator, but re-position it at the table start.
7247   */
7248   if (!m_scan_it) {
7249     const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
7250     if (commit_in_the_middle()) {
7251       DBUG_ASSERT(m_scan_it_snapshot == nullptr);
7252       m_scan_it_snapshot = rdb->GetSnapshot();
7253 
7254       auto read_opts = rocksdb::ReadOptions();
7255       read_opts.total_order_seek = true; // TODO: set based on WHERE conditions
7256       read_opts.snapshot = m_scan_it_snapshot;
7257       m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
7258     } else {
7259       m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache);
7260     }
7261     m_scan_it_skips_bloom = skip_bloom;
7262   }
7263   /*
7264     Seek() will "Position at the first key in the source that is at or past
7265     target".
7266     The operation cannot fail.
7267   */
7268   m_scan_it->Seek(*slice);
7269 }
7270 
7271 void ha_rocksdb::release_scan_iterator() {
7272   delete m_scan_it;
7273   m_scan_it = nullptr;
7274 
7275   if (m_scan_it_snapshot) {
7276     rdb->ReleaseSnapshot(m_scan_it_snapshot);
7277     m_scan_it_snapshot = nullptr;
7278   }
7279 }
7280 
7281 void ha_rocksdb::setup_iterator_for_rnd_scan() {
7282   uint key_size;
7283 
7284   if (m_pk_descr->m_is_reverse_cf)
7285     m_pk_descr->get_supremum_key(m_pk_packed_tuple, &key_size);
7286   else
7287     m_pk_descr->get_infimum_key(m_pk_packed_tuple, &key_size);
7288 
7289   rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
7290 
7291   setup_scan_iterator(*m_pk_descr, &table_key);
7292   m_skip_scan_it_next_call = true;
7293 }
7294 
7295 int ha_rocksdb::rnd_init(bool scan) {
7296   DBUG_ENTER_FUNC();
7297 
7298   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7299   setup_read_decoders();
7300 
7301   if (scan) {
7302     m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
7303     setup_iterator_for_rnd_scan();
7304   } else {
7305     /* We don't need any preparations for rnd_pos() calls. */
7306   }
7307 
7308   // If m_lock_rows is on then we will be doing a get_for_update when accessing
7309   // the index, so don't acquire the snapshot right away.  Otherwise acquire
7310   // the snapshot immediately.
7311   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
7312 
7313   DBUG_RETURN(HA_EXIT_SUCCESS);
7314 }
7315 
7316 int ha_rocksdb::rnd_next(uchar *const buf) {
7317   DBUG_ENTER_FUNC();
7318 
7319   int rc;
7320   ha_statistic_increment(&SSV::ha_read_rnd_next_count);
7321   for (;;) {
7322     rc = rnd_next_with_direction(buf, true);
7323     if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot)
7324       break; /* exit the loop */
7325 
7326     // release the snapshot and iterator and then regenerate them
7327     Rdb_transaction *tx = get_or_create_tx(table->in_use);
7328     tx->release_snapshot();
7329     release_scan_iterator();
7330     setup_iterator_for_rnd_scan();
7331   }
7332 
7333   m_rnd_scan_is_new_snapshot = false;
7334 
7335   if (rc == HA_ERR_KEY_NOT_FOUND)
7336     rc = HA_ERR_END_OF_FILE;
7337 
7338   DBUG_RETURN(rc);
7339 }
7340 
7341 /*
7342   See also secondary_index_read().
7343 */
7344 int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
7345   DBUG_ENTER_FUNC();
7346 
7347   int rc;
7348 
7349   table->status = STATUS_NOT_FOUND;
7350 
7351   if (!m_scan_it || !m_scan_it->Valid()) {
7352     /*
7353       We can get here when SQL layer has called
7354 
7355         h->index_init(PRIMARY);
7356         h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
7357 
7358       In this case, we should return EOF.
7359     */
7360     DBUG_RETURN(HA_ERR_END_OF_FILE);
7361   }
7362 
7363   for (;;) {
7364     if (m_skip_scan_it_next_call) {
7365       m_skip_scan_it_next_call = false;
7366     } else {
7367       if (move_forward)
7368         m_scan_it->Next(); /* this call cannot fail */
7369       else
7370         m_scan_it->Prev(); /* this call cannot fail */
7371     }
7372 
7373     if (!m_scan_it->Valid()) {
7374       rc = HA_ERR_END_OF_FILE;
7375       break;
7376     }
7377 
7378     /* check if we're out of this table */
7379     const rocksdb::Slice key = m_scan_it->key();
7380     if (!m_pk_descr->covers_key(key)) {
7381       rc = HA_ERR_END_OF_FILE;
7382       break;
7383     }
7384 
7385     if (m_lock_rows != RDB_LOCK_NONE) {
7386       /*
7387         Lock the row we've just read.
7388 
7389         Now we call get_for_update(), which will 1) take a lock and 2) fail
7390         if the row was deleted since the snapshot was taken.
7391       */
7392       Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7393       DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
7394       const rocksdb::Status s =
7395           get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
7396       if (s.IsNotFound() &&
7397           my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
7398         // This occurs if we accessed a row, tried to lock it, failed,
7399         // released and reacquired the snapshot (because of READ COMMITTED
7400         // mode) and the row was deleted by someone else in the meantime.
7401         // If so, we just want to move on to the next row.
7402         continue;
7403       }
7404 
7405       if (!s.ok()) {
7406         DBUG_RETURN(
7407             tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
7408       }
7409 
7410       // If we called get_for_update(), use the value from that call, not from
7411       // the iterator, as the iterator's value may be stale since we don't have
7412       // a snapshot when m_lock_rows is not RDB_LOCK_NONE.
7413       m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
7414       rc = convert_record_from_storage_format(&key, buf);
7415     } else {
7416       // Use the value from the iterator
7417       rocksdb::Slice value = m_scan_it->value();
7418       m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
7419       rc = convert_record_from_storage_format(&key, &value, buf);
7420     }
7421 
7422     table->status = 0;
7423     break;
7424   }
7425 
7426   if (!rc) {
7427     update_row_stats(ROWS_READ);
7428   }
7429 
7430   DBUG_RETURN(rc);
7431 }
7432 
7433 int ha_rocksdb::rnd_end() {
7434   DBUG_ENTER_FUNC();
7435 
7436   release_scan_iterator();
7437 
7438   DBUG_RETURN(HA_EXIT_SUCCESS);
7439 }
7440 
7441 int ha_rocksdb::index_init(uint idx, bool sorted) {
7442   DBUG_ENTER_FUNC();
7443 
7444   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7445   DBUG_ASSERT(tx != nullptr);
7446 
7447   setup_read_decoders();
7448 
7449   // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
7450   // when accessing the index, so don't acquire the snapshot right away.
7451   // Otherwise acquire the snapshot immediately.
7452   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
7453 
7454   active_index = idx;
7455 
7456   DBUG_RETURN(HA_EXIT_SUCCESS);
7457 }
7458 
7459 int ha_rocksdb::index_end() {
7460   DBUG_ENTER_FUNC();
7461 
7462   release_scan_iterator();
7463 
7464   active_index = MAX_KEY;
7465   in_range_check_pushed_down = FALSE;
7466 
7467   DBUG_RETURN(HA_EXIT_SUCCESS);
7468 }
7469 
7470 int ha_rocksdb::truncate() {
7471   DBUG_ENTER_FUNC();
7472 
7473   DBUG_ASSERT(m_tbl_def != nullptr);
7474 
7475   m_tbl_def->m_auto_incr_val = 1;
7476   m_tbl_def->m_hidden_pk_val = 1;
7477 
7478   remove_rows(m_tbl_def);
7479 
7480   DBUG_RETURN(HA_EXIT_SUCCESS);
7481 }
7482 
7483 /*
7484   Delete the row we've last read. The row is also passed as a parameter.
7485 
7486   @detail
7487     The caller guarantees table buf points to the row that was just read.
7488     The row is either table->record[0] or table->record[1].
7489     (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)".)
7490 */
7491 int ha_rocksdb::delete_row(const uchar *const buf) {
7492   DBUG_ENTER_FUNC();
7493 
7494   DBUG_ASSERT(buf != nullptr);
7495 
7496   ha_statistic_increment(&SSV::ha_delete_count);
7497   set_last_rowkey(buf);
7498 
7499   rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
7500   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7501 
7502   const uint index = pk_index(table, m_tbl_def);
7503   rocksdb::Status s =
7504       delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
7505   if (!s.ok()) {
7506     DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
7507   }
7508 
7509   longlong hidden_pk_id = 0;
7510   if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table) &&
7511       read_hidden_pk_id_from_rowkey(&hidden_pk_id))
7512     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7513 
7514   // Delete the record for every secondary index
7515   for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
7516     if (!is_pk(i, table, m_tbl_def)) {
7517       int packed_size;
7518       const Rdb_key_def &kd = *m_key_descr_arr[i];
7519       packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
7520                                    nullptr, false, hidden_pk_id);
7521       rocksdb::Slice secondary_key_slice(
7522           reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
7523       /* Deleting on secondary key doesn't need any locks: */
7524       tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
7525                                                   secondary_key_slice);
7526     }
7527   }
7528 
7529   if (do_bulk_commit(tx)) {
7530     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7531   }
7532   update_row_stats(ROWS_DELETED);
7533 
7534   DBUG_RETURN(HA_EXIT_SUCCESS);
7535 }
7536 
7537 rocksdb::Status ha_rocksdb::delete_or_singledelete(
7538     uint index, Rdb_transaction *const tx,
7539     rocksdb::ColumnFamilyHandle *const column_family,
7540     const rocksdb::Slice &key) {
7541   if (can_use_single_delete(index))
7542     return tx->single_delete(column_family, key);
7543   return tx->delete_key(column_family, key);
7544 }
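
/*
  Note on the choice above (general RocksDB behavior, stated as an assumption
  rather than a guarantee of this codebase): SingleDelete() is only safe when
  the key was Put() exactly once and not overwritten or merged since, which is
  why can_use_single_delete() gates it per index. Rough intended pattern:

    tx->put(cf, key, value);           // written once
    ...
    tx->single_delete(cf, key);        // ok: removes that single version

  If a key can be written multiple times, the regular delete_key() path is
  used instead.
*/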
7545 
7546 void ha_rocksdb::update_stats(void) {
7547   DBUG_ENTER_FUNC();
7548 
7549   stats.records = 0;
7550   stats.index_file_length = 0ul;
7551   stats.data_file_length = 0ul;
7552   stats.mean_rec_length = 0;
7553 
7554   for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
7555     if (is_pk(i, table, m_tbl_def)) {
7556       stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
7557       stats.records = m_pk_descr->m_stats.m_rows;
7558     } else {
7559       stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
7560     }
7561   }
7562 
7563   DBUG_VOID_RETURN;
7564 }
7565 
7566 int ha_rocksdb::info(uint flag) {
7567   DBUG_ENTER_FUNC();
7568 
7569   if (!table)
7570     return HA_EXIT_FAILURE;
7571 
7572   if (flag & HA_STATUS_VARIABLE) {
7573     /*
7574       Test only to simulate corrupted stats
7575     */
7576     DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
7577                     m_pk_descr->m_stats.m_actual_disk_size =
7578                         -m_pk_descr->m_stats.m_actual_disk_size;);
7579 
7580     update_stats();
7581 
7582     /*
7583       If any stats are negative due to bad cached stats, re-run analyze table
7584       and re-retrieve the stats.
7585     */
7586     if (static_cast<longlong>(stats.data_file_length) < 0 ||
7587         static_cast<longlong>(stats.index_file_length) < 0 ||
7588         static_cast<longlong>(stats.records) < 0) {
7589       if (analyze(nullptr, nullptr)) {
7590         DBUG_RETURN(HA_EXIT_FAILURE);
7591       }
7592 
7593       update_stats();
7594     }
7595 
7596     if (stats.records == 0) {
7597       // Most likely, the table data is still in the memtable;
7598       // try to deduce the size from GetApproximateSizes().
7599       uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
7600       auto r = get_range(pk_index(table, m_tbl_def), buf);
7601       uint64_t sz = 0;
7602 
7603 #pragma GCC diagnostic push
7604 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
7605       rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz, true);
7606 #pragma GCC diagnostic pop
7607 
7608       stats.records = sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
7609       stats.data_file_length = sz;
7610 
7611       if (rocksdb_debug_optimizer_n_rows > 0)
7612         stats.records = rocksdb_debug_optimizer_n_rows;
7613     }
7614 
7615     if (stats.records != 0)
7616       stats.mean_rec_length = stats.data_file_length / stats.records;
7617   }
7618   if (flag & HA_STATUS_CONST) {
7619     ref_length = m_pk_descr->max_storage_fmt_length();
7620 
7621     // TODO: needs to be reimplemented once real index statistics are available
7622     for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
7623       if (is_hidden_pk(i, table, m_tbl_def)) {
7624         continue;
7625       }
7626       KEY *const k = &table->key_info[i];
7627       for (uint j = 0; j < k->actual_key_parts; j++) {
7628         const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
7629         uint x = k_stats.m_distinct_keys_per_prefix.size() > j &&
7630                          k_stats.m_distinct_keys_per_prefix[j] > 0
7631                      ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j]
7632                      : 0;
7633         if (x > stats.records)
7634           x = stats.records;
7635         if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
7636             rocksdb_debug_optimizer_n_rows > 0) {
7637           // Fake cardinality implementation. For example, an index on
7638           // (idx1, idx2, idx3) will have rec_per_key of (idx1)=4,
7639           // (idx1,idx2)=2, and (idx1,idx2,idx3)=1.
7640           // rec_per_key for the whole index is 1, multiplied by 2^n if
7641           // n suffix columns of the index are not used.
7642           x = 1 << (k->actual_key_parts - j - 1);
7643         }
7644         k->rec_per_key[j] = x;
7645       }
7646     }
7647   }
7648 
7649   if (flag & HA_STATUS_ERRKEY) {
7650     /*
7651       Currently we support only primary keys, so we know which key had a
7652       uniqueness violation.
7653     */
7654     errkey = m_dupp_errkey;
7655     dup_ref = m_pk_tuple; // TODO(?): this should store packed PK.
7656   }
7657 
7658   if (flag & HA_STATUS_AUTO) {
7659     stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
7660   }
7661 
7662   DBUG_RETURN(HA_EXIT_SUCCESS);
7663 }
7664 
7665 void ha_rocksdb::position(const uchar *const record) {
7666   DBUG_ENTER_FUNC();
7667 
7668   longlong hidden_pk_id = 0;
7669   if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id))
7670     DBUG_ASSERT(false); // should never reach here
7671 
7672   /*
7673     Get packed primary key value from the record.
7674 
7675     (Note: m_last_rowkey has the packed PK of the last-read row, which allows
7676     us to handle most cases, but there is an exception: when the slave applies
7677     RBR events, it fills the record and then calls position(); rnd_pos())
7678 
7679     Also note that we *can* rely on PK column values being available. This is
7680     because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
7681     When that is true, table->prepare_for_position() adds PK columns into the
7682     read set (this may potentially disable index-only access when PK column
7683     cannot be restored from its mem-comparable form in the secondary indexes).
7684   */
7685   const uint packed_size = m_pk_descr->pack_record(
7686       table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);
7687 
7688   /*
7689     It could be that mem-comparable form of PK occupies less than ref_length
7690     bytes. Fill the remainder with zeros.
7691   */
7692   if (ref_length > packed_size)
7693     memset(ref + packed_size, 0, ref_length - packed_size);
7694 
7695   DBUG_VOID_RETURN;
7696 }
7697 
7698 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
7699   DBUG_ENTER_FUNC();
7700 
7701   int rc;
7702   size_t len;
7703 
7704   ha_statistic_increment(&SSV::ha_read_rnd_count);
7705   len = m_pk_descr->key_length(table,
7706                                rocksdb::Slice((const char *)pos, ref_length));
7707   if (len == size_t(-1)) {
7708     DBUG_RETURN(HA_ERR_INTERNAL_ERROR); /* Data corruption? */
7709   }
7710 
7711   rc = get_row_by_rowid(buf, pos, len);
7712 
7713   if (!rc) {
7714     update_row_stats(ROWS_READ);
7715   }
7716 
7717   DBUG_RETURN(rc);
7718 }
7719 
7720 /*
7721   @brief
7722     Calculate (if needed) the bitmap of indexes that are modified by the
7723     current query.
7724 
7725   @detail
7726     The calculation is done by checking index definitions against the
7727     table->write_set column bitmap.
7728 */
7730 
7731 void ha_rocksdb::calc_updated_indexes() {
7732   if (!m_update_scope_is_valid) {
7733     m_update_scope_is_valid = true;
7734     m_update_scope.clear_all();
7735 
7736     for (uint keynr = 0; keynr < table->s->keys; keynr++) {
7737       const Rdb_key_def &kd = *m_key_descr_arr[keynr];
7738       /* Walk over all key parts, including the "extended key" suffix */
7739       const uint key_parts = kd.get_key_parts();
7740       for (uint kp = 0; kp < key_parts; kp++) {
7741         if (has_hidden_pk(table) && kp + 1 == key_parts)
7742           break;
7743 
7744         Field *const field = kd.get_table_field_for_part_no(table, kp);
7745         if (bitmap_is_set(table->write_set, field->field_index)) {
7746           m_update_scope.set_bit(keynr);
7747           break;
7748         }
7749       }
7750     }
7751   }
7752 }
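
/*
  Illustration of the bitmap computed above (example only):

    CREATE TABLE t1 (id INT PRIMARY KEY, a INT, b INT, KEY k_a(a));
    UPDATE t1 SET b = b + 1;

  Only column b is in table->write_set, so k_a's bit is not set in
  m_update_scope and update_sk() can skip rewriting that index entry
  (the primary-key record itself is still rewritten with the new value of b).
*/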
7753 
7754 int ha_rocksdb::update_row(const uchar *const old_data, uchar *const new_data) {
7755   DBUG_ENTER_FUNC();
7756 
7757   DBUG_ASSERT(old_data != nullptr);
7758   DBUG_ASSERT(new_data != nullptr);
7759   DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
7760   /*
7761     old_data points to the record we're updating. It is the same as the record
7762     we've just read (for multi-table UPDATE, too, because the SQL layer will
7763     make an rnd_pos() call to re-read the record before calling update_row()).
7764   */
7765   DBUG_ASSERT(new_data == table->record[0]);
7766 
7767   ha_statistic_increment(&SSV::ha_update_count);
7768   const int rv = update_write_row(old_data, new_data, false);
7769 
7770   if (rv == 0) {
7771     update_row_stats(ROWS_UPDATED);
7772   }
7773 
7774   DBUG_RETURN(rv);
7775 }
7776 
7777 /* The following function was copied from ha_blackhole::store_lock: */
7778 THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
7779                                        enum thr_lock_type lock_type) {
7780   DBUG_ENTER_FUNC();
7781 
7782   DBUG_ASSERT(thd != nullptr);
7783   DBUG_ASSERT(to != nullptr);
7784 
7785   bool in_lock_tables = my_core::thd_in_lock_tables(thd);
7786 
7787   /* First, make a decision about MyRocks's internal locking */
7788   if (lock_type >= TL_WRITE_ALLOW_WRITE) {
7789     m_lock_rows = RDB_LOCK_WRITE;
7790   } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
7791     m_lock_rows = RDB_LOCK_READ;
7792   } else {
7793     m_lock_rows = RDB_LOCK_NONE;
7794     if (THDVAR(thd, lock_scanned_rows)) {
7795       /*
7796         The following logic was copied directly from
7797         ha_innobase::store_lock_with_x_type() in
7798         storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
7799         locks in place on rows that are in a table that is not being updated.
7800       */
7801       const uint sql_command = my_core::thd_sql_command(thd);
7802       if ((lock_type == TL_READ && in_lock_tables) ||
7803           (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
7804           lock_type == TL_READ_WITH_SHARED_LOCKS ||
7805           lock_type == TL_READ_NO_INSERT ||
7806           (lock_type != TL_IGNORE && sql_command != SQLCOM_SELECT)) {
7807 
7808         ulong tx_isolation = my_core::thd_tx_isolation(thd);
7809         if (sql_command != SQLCOM_CHECKSUM &&
7810             ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
7811               tx_isolation > ISO_READ_COMMITTED) ||
7812              tx_isolation == ISO_SERIALIZABLE ||
7813              (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
7814              (sql_command != SQLCOM_INSERT_SELECT &&
7815               sql_command != SQLCOM_REPLACE_SELECT &&
7816               sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
7817               sql_command != SQLCOM_CREATE_TABLE))) {
7818           m_lock_rows = RDB_LOCK_READ;
7819         }
7820       }
7821     }
7822   }
7823 
7824   /* Then, tell the SQL layer what kind of locking it should use: */
7825   if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
7826     /*
7827       Here is where we get into the guts of a row level lock.
7828       If TL_UNLOCK is set and we are not doing a LOCK TABLE or
7829       DISCARD/IMPORT TABLESPACE, then allow multiple writers.
7830     */
7832 
7833     if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
7834         !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
7835       lock_type = TL_WRITE_ALLOW_WRITE;
7836     }
7837 
7838     /*
7839       In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
7840       MySQL would use the lock TL_READ_NO_INSERT on t2, and that
7841       would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
7842       to t2. Convert the lock to a normal read lock to allow
7843       concurrent inserts to t2.
7844     */
7845 
7846     if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
7847       lock_type = TL_READ;
7848     }
7849 
7850     m_db_lock.type = lock_type;
7851   }
7852 
7853   *to++ = &m_db_lock;
7854 
7855   DBUG_RETURN(to);
7856 }
7857 
7858 void ha_rocksdb::read_thd_vars(THD *const thd) {
7859   m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
7860   m_verify_row_debug_checksums = THDVAR(thd, verify_row_debug_checksums);
7861   m_checksums_pct = THDVAR(thd, checksums_pct);
7862 }
7863 
7864 const char *ha_rocksdb::thd_rocksdb_tmpdir() {
7865   const char *tmp_dir = THDVAR(ha_thd(), tmpdir);
7866 
7867   /*
7868     We want to treat an empty string as nullptr; in that case DDL operations
7869     will use the default --tmpdir passed to mysql instead.
7870   */
7871   if (tmp_dir != nullptr && *tmp_dir == '\0') {
7872     tmp_dir = nullptr;
7873   }
7874 
7875   return (tmp_dir);
7876 }
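
/*
  Example (assuming the THDVAR above corresponds to a session variable named
  rocksdb_tmpdir; the path below is illustrative):

    SET SESSION rocksdb_tmpdir = '/data/tmp_ddl';  -- used for DDL temp files
    SET SESSION rocksdb_tmpdir = '';               -- falls back to --tmpdir

  An empty string is treated the same as "not set" by the function above.
*/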
7877 
7878 int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
7879   DBUG_ENTER_FUNC();
7880 
7881   DBUG_ASSERT(thd != nullptr);
7882 
7883   int res = 0;
7884   int binlog_format = my_core::thd_binlog_format(thd);
7885   bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);
7886   if (lock_type == F_WRLCK && !thd->rli_slave && !unsafe_for_binlog &&
7887       binlog_format != BINLOG_FORMAT_ROW &&
7888       binlog_format != BINLOG_FORMAT_UNSPEC &&
7889       my_core::thd_binlog_filter_ok(thd)) {
7890     my_printf_error(
7891         ER_UNKNOWN_ERROR,
7892         "Can't execute updates on master with binlog_format != ROW.", MYF(0));
7893     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7894   }
7895 
7896   if (lock_type == F_UNLCK) {
7897     Rdb_transaction *const tx = get_or_create_tx(thd);
7898 
7899     tx->io_perf_end_and_record(&m_io_perf);
7900     if (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
7901       /*
7902         Do like InnoDB: when we get here, it's time to commit a
7903         single-statement transaction.
7904 
7905         If the statement involved multiple tables, this code will be executed
7906         for each of them, but that's ok because non-first tx->commit() calls
7907         will be no-ops.
7908       */
7909       res = tx->commit_or_rollback();
7910     }
7911   } else {
7912     if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
7913         my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
7914       my_printf_error(
7915           ER_UNKNOWN_ERROR, "MyRocks supports only READ COMMITTED and "
7916                             "REPEATABLE READ isolation levels. "
7917                             "Please change from current isolation "
7918                             "level %s",
7919           MYF(0), tx_isolation_names[my_core::thd_tx_isolation(thd)]);
7920       DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7921     }
7922     /*
7923       It's nice to do the following at the start of every statement. The
7924       problem is, handler->start_stmt() is not called for INSERTs.
7925       So, we put this code here.
7926     */
7927     Rdb_transaction *const tx = get_or_create_tx(thd);
7928     read_thd_vars(thd);
7929 
7930     m_update_scope_is_valid = false;
7931 
7932     if (skip_unique_check()) {
7933       if ((thd->lex->sql_command == SQLCOM_INSERT ||
7934            thd->lex->sql_command == SQLCOM_LOAD ||
7935            thd->lex->sql_command == SQLCOM_REPLACE) &&
7936           (thd->lex->duplicates == DUP_REPLACE ||
7937            thd->lex->duplicates == DUP_UPDATE)) {
7938         my_printf_error(ER_UNKNOWN_ERROR,
7939                         "When unique checking is disabled in MyRocks, INSERT, "
7940                         "UPDATE, LOAD statements with clauses that update or "
7941                         "replace the key (i.e. "
7942                         "INSERT ON DUPLICATE KEY UPDATE, REPLACE) are not "
7943                         "allowed. Query: %s",
7944                         MYF(0), thd->query());
7945         DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7946       }
7947     }
7948 
7949     if (lock_type == F_WRLCK) {
7950       if (tx->is_tx_read_only()) {
7951         my_printf_error(ER_UNKNOWN_ERROR,
7952                         "Can't execute updates when you started a transaction "
7953                         "with START TRANSACTION WITH CONSISTENT SNAPSHOT.",
7954                         MYF(0));
7955         DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7956       }
7957 
7958       /*
7959         The SQL layer signals us to take a write lock. It does so when starting
7960         a DML statement. We should put locks on the rows we're reading.
7961 
7962         Note: sometimes, external_lock() can be called without a prior
7963         ::store_lock call.  That's why we need to set lock_* members here, too.
7964       */
7965       m_lock_rows = RDB_LOCK_WRITE;
7966 
7967       if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
7968           thd->lex->sql_command == SQLCOM_DROP_INDEX ||
7969           thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
7970         tx->m_ddl_transaction = true;
7971       }
7972     }
7973     rocksdb_register_tx(rocksdb_hton, thd, tx);
7974     tx->io_perf_start(&m_io_perf);
7975   }
7976 
7977   DBUG_RETURN(res);
7978 }
7979 
7980 /**
7981   @note
7982   A quote from ha_innobase::start_stmt():
7983   <quote>
7984   MySQL calls this function at the start of each SQL statement inside LOCK
7985   TABLES. Inside LOCK TABLES the ::external_lock method does not work to
7986   mark SQL statement borders.
7987   </quote>
7988 */
7989 
7990 int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
7991   DBUG_ENTER_FUNC();
7992 
7993   DBUG_ASSERT(thd != nullptr);
7994 
7995   Rdb_transaction *const tx = get_or_create_tx(thd);
7996   read_thd_vars(thd);
7997   rocksdb_register_tx(ht, thd, tx);
7998   tx->io_perf_start(&m_io_perf);
7999 
8000   DBUG_RETURN(HA_EXIT_SUCCESS);
8001 }
8002 
8003 rocksdb::Range get_range(uint32_t i,
8004                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
8005                          int offset1, int offset2) {
8006   uchar *buf_begin = buf;
8007   uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
8008   rdb_netbuf_store_index(buf_begin, i + offset1);
8009   rdb_netbuf_store_index(buf_end, i + offset2);
8010 
8011   return rocksdb::Range(
8012       rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
8013       rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
8014 }
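
/*
  Worked example (byte values are illustrative and assume
  rdb_netbuf_store_index() stores the 4-byte index number in big-endian
  order): for index number 0x011D with offset1=0 and offset2=1 the function
  returns the half-open range

    start = 00 00 01 1D
    limit = 00 00 01 1E

  i.e. every key whose 4-byte prefix equals the index number falls inside
  [start, limit).
*/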
8015 
8016 static rocksdb::Range get_range(const Rdb_key_def &kd,
8017                                 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
8018                                 int offset1, int offset2) {
8019   return get_range(kd.get_index_number(), buf, offset1, offset2);
8020 }
8021 
8022 rocksdb::Range get_range(const Rdb_key_def &kd,
8023                          uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
8024   if (kd.m_is_reverse_cf) {
8025     return myrocks::get_range(kd, buf, 1, 0);
8026   } else {
8027     return myrocks::get_range(kd, buf, 0, 1);
8028   }
8029 }
8030 
8031 rocksdb::Range
8032 ha_rocksdb::get_range(const int &i,
8033                       uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
8034   return myrocks::get_range(*m_key_descr_arr[i], buf);
8035 }
8036 
8037 /*
8038   Drop index thread's main logic
8039 */
8040 
8041 void Rdb_drop_index_thread::run() {
8042   mysql_mutex_lock(&m_signal_mutex);
8043 
8044   for (;;) {
8045     // The stop flag might be set by the shutdown command after
8046     // drop_index_thread releases signal_mutex (i.e. while executing an
8047     // expensive Seek()). To prevent drop_index_thread from entering a long
8048     // cond_timedwait, we need to check whether the stop flag is set, with
8049     // drop_index_interrupt_mutex held.
8050     if (m_stop) {
8051       break;
8052     }
8053 
8054     timespec ts;
8055     clock_gettime(CLOCK_REALTIME, &ts);
8056     ts.tv_sec += dict_manager.is_drop_index_empty()
8057                      ? 24 * 60 * 60 // no filtering
8058                      : 60;          // filtering
8059 
8060     const auto ret __attribute__((__unused__)) =
8061         mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
8062     if (m_stop) {
8063       break;
8064     }
8065     // make sure no program error was returned
8066     DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
8067     mysql_mutex_unlock(&m_signal_mutex);
8068 
8069     std::unordered_set<GL_INDEX_ID> indices;
8070     dict_manager.get_ongoing_drop_indexes(&indices);
8071     if (!indices.empty()) {
8072       std::unordered_set<GL_INDEX_ID> finished;
8073       rocksdb::ReadOptions read_opts;
8074       read_opts.total_order_seek = true; // disable bloom filter
8075 
8076       for (const auto d : indices) {
8077         uint32 cf_flags = 0;
8078         if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
8079           sql_print_error("RocksDB: Failed to get column family flags "
8080                           "from cf id %u. MyRocks data dictionary may "
8081                           "get corrupted.",
8082                           d.cf_id);
8083           abort_with_stack_traces();
8084         }
8085         rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id);
8086         DBUG_ASSERT(cfh);
8087         const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;
8088 
8089         bool index_removed = false;
8090         uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
8091         rdb_netbuf_store_uint32(key_buf, d.index_id);
8092         const rocksdb::Slice key =
8093             rocksdb::Slice((char *)key_buf, sizeof(key_buf));
8094         uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
8095         rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
8096                                          is_reverse_cf ? 0 : 1);
8097         rocksdb::CompactRangeOptions compact_range_options;
8098         compact_range_options.bottommost_level_compaction =
8099             rocksdb::BottommostLevelCompaction::kForce;
8100         compact_range_options.exclusive_manual_compaction = false;
8101         rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
8102                                                     &range.start, &range.limit);
8103         if (!status.ok()) {
8104           if (status.IsShutdownInProgress()) {
8105             break;
8106           }
8107           rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
8108         }
8109         status = rdb->CompactRange(compact_range_options, cfh, &range.start,
8110                                    &range.limit);
8111         if (!status.ok()) {
8112           if (status.IsShutdownInProgress()) {
8113             break;
8114           }
8115           rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
8116         }
8117         std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
8118         it->Seek(key);
8119         if (is_reverse_cf) {
8120           if (!it->Valid()) {
8121             it->SeekToLast();
8122           } else {
8123             it->Prev();
8124           }
8125         }
8126         if (!it->Valid()) {
8127           index_removed = true;
8128         } else {
8129           if (memcmp(it->key().data(), key_buf,
8130                      Rdb_key_def::INDEX_NUMBER_SIZE)) {
8131             // Key does not have same prefix
8132             index_removed = true;
8133           }
8134         }
8135         if (index_removed) {
8136           finished.insert(d);
8137         }
8138       }
8139 
8140       if (!finished.empty()) {
8141         dict_manager.finish_drop_indexes(finished);
8142       }
8143     }
8144     mysql_mutex_lock(&m_signal_mutex);
8145   }
8146 
8147   mysql_mutex_unlock(&m_signal_mutex);
8148 }
8149 
8150 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
8151   DBUG_ASSERT(tablename != nullptr);
8152 
8153   std::string str;
8154   if (rdb_normalize_tablename(tablename, &str)) {
8155     // We were not passed a table name?
8156     DBUG_ASSERT(0);
8157     return nullptr;
8158   }
8159 
8160   return ddl_manager.find(str);
8161 }
8162 
8163 /*
8164   Note: the following function is called when the table is not open. That is,
8165   this->table==nullptr, pk_key_descr==nullptr, etc.
8166 
8167   tablename points to a string of the form "./dbname/tablename".
8168 */
8169 
8170 int ha_rocksdb::delete_table(const char *const tablename) {
8171   DBUG_ENTER_FUNC();
8172 
8173   DBUG_ASSERT(tablename != nullptr);
8174 
8175   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
8176   rocksdb::WriteBatch *const batch = wb.get();
8177 
8178   /* Find the table in the hash */
8179   Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
8180   if (!tbl)
8181     DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
8182   dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);
8183 
8184   /*
8185     Remove the table entry from the data dictionary (this will also remove
8186     it from the persistent data dictionary).
8187   */
8188   ddl_manager.remove(tbl, batch, true);
8189   if (dict_manager.commit(batch))
8190     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8191 
8192   rdb_drop_idx_thread.signal();
8193 
8194   DBUG_RETURN(HA_EXIT_SUCCESS);
8195 }
8196 
8197 void ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
8198   const rocksdb::WriteOptions wo =
8199       rdb_get_rocksdb_write_options(handler::ha_thd());
8200 
8201   rocksdb::ReadOptions opts;
8202   opts.total_order_seek = true;
8203   rocksdb::Iterator *const it = rdb->NewIterator(opts);
8204 
8205   char key_buf[MAX_KEY_LENGTH];
8206   uint key_len;
8207   /*
8208     Remove all records in each index.
8209     (This is not crash-safe, but it doesn't matter, because bulk row
8210     deletion will be handled on the RocksDB side.)
8211   */
8212   for (uint i = 0; i < tbl->m_key_count; i++) {
8213     const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
8214     kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
8215 
8216     const rocksdb::Slice table_key(key_buf, key_len);
8217     it->Seek(table_key);
8218     while (it->Valid()) {
8219       const rocksdb::Slice key = it->key();
8220       if (!kd.covers_key(key)) {
8221         break;
8222       }
8223 
8224       if (can_use_single_delete(i))
8225         rdb->SingleDelete(wo, key);
8226       else
8227         rdb->Delete(wo, key);
8228       it->Next();
8229     }
8230   }
8231   delete it;
8232 }
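
/*
  Context (an observation based on truncate() earlier in this file): TRUNCATE
  TABLE resets the auto-increment and hidden-PK counters and then calls
  remove_rows(), which walks every index of the table and issues
  Delete()/SingleDelete() per key:

    TRUNCATE TABLE t1;   -- row-by-row deletion on the RocksDB side

  rather than dropping and re-creating the underlying column family.
*/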
8233 
8234 int ha_rocksdb::rename_table(const char *const from, const char *const to) {
8235   DBUG_ENTER_FUNC();
8236 
8237   DBUG_ASSERT(from != nullptr);
8238   DBUG_ASSERT(to != nullptr);
8239 
8240   bool write_err = false;
8241   std::string from_str;
8242   std::string to_str;
8243   std::string from_db;
8244   std::string to_db;
8245 
8246   if (rdb_normalize_tablename(from, &from_str) ||
8247       rdb_split_normalized_tablename(from_str, &from_db)) {
8248     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8249   }
8250 
8251   if (rdb_normalize_tablename(to, &to_str) ||
8252       rdb_split_normalized_tablename(to_str, &to_db)) {
8253     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8254   }
8255 
8256   // If the user changed the database part of the name then validate that the
8257   // 'to' database exists.
8258   if (from_db != to_db && !rdb_database_exists(to_db)) {
8259     DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8260   }
8261 
8262   /* Check if any index has a per-index column family */
8263   {
8264     /*
8265       We can't check this directly, because
8266       1. the object that referred to the old table has been close()d.
8267       2. this ha_rocksdb object has no connection to any table at all; it has
8268          just been created with (TABLE*)nullptr.
8269 
8270       So, we have to find the old table in the ddl_manager (it's there because
8271       it has been opened by this statement), and check is_auto_cf there.
8272     */
8273     Rdb_tbl_def *tdef;
8274     if (!(tdef = ddl_manager.find(from_str)))
8275       DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8276 
8277     for (uint i = 0; i < tdef->m_key_count; i++) {
8278       DBUG_ASSERT(tdef->m_key_descr_arr != nullptr);
8279 
8280       if (tdef->m_key_descr_arr[i]->m_is_auto_cf) {
8281         my_error(ER_NOT_SUPPORTED_YET, MYF(0),
8282                  "ALTER TABLE on table with per-index CF");
8283         DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8284       }
8285     }
8286   }
8287 
8288   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
8289   rocksdb::WriteBatch *const batch = wb.get();
8290   dict_manager.lock();
8291   write_err =
8292       ddl_manager.rename(from_str, to_str, batch) || dict_manager.commit(batch);
8293   dict_manager.unlock();
8294 
8295   DBUG_RETURN(write_err ? HA_ERR_INTERNAL_ERROR : 0);
8296 }
8297 
8298 /**
8299   check_if_incompatible_data() is called if ALTER TABLE cannot otherwise
8300   detect whether the new and old definitions are compatible.
8301 
8302   @details If there are no other explicit signs, like a changed number of
8303   fields, this function will be called by compare_tables()
8304   (sql/sql_table.cc) to decide whether we should rewrite the whole table
8305   or only the .frm file.
8306 */
8308 
8309 bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
8310                                             uint table_changes) {
8311   DBUG_ENTER_FUNC();
8312 
8313   DBUG_ASSERT(info != nullptr);
8314 
8315   // this function is needed only for online alter-table
8316   DBUG_RETURN(COMPATIBLE_DATA_NO);
8317 }
8318 
8319 int ha_rocksdb::extra(enum ha_extra_function operation) {
8320   DBUG_ENTER_FUNC();
8321 
8322   switch (operation) {
8323   case HA_EXTRA_KEYREAD:
8324     m_keyread_only = true;
8325     break;
8326   case HA_EXTRA_NO_KEYREAD:
8327     m_keyread_only = false;
8328     break;
8329   case HA_EXTRA_FLUSH:
8330     /*
8331       If the table has blobs, then they are part of m_retrieved_record.
8332       This call invalidates them.
8333     */
8334     m_retrieved_record.clear();
8335     break;
8336   default:
8337     break;
8338   }
8339 
8340   DBUG_RETURN(HA_EXIT_SUCCESS);
8341 }
8342 
8343 /*
8344   Given a starting key and an ending key, estimate the number of rows that
8345   will exist between the two keys.
8346 */
8347 ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
8348                                      key_range *const max_key) {
8349   DBUG_ENTER_FUNC();
8350 
8351   ha_rows ret = THDVAR(ha_thd(), records_in_range);
8352   if (ret) {
8353     DBUG_RETURN(ret);
8354   }
8355   if (table->force_index) {
8356     const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
8357     if (force_rows) {
8358       DBUG_RETURN(force_rows);
8359     }
8360   }
8361 
8362   const Rdb_key_def &kd = *m_key_descr_arr[inx];
8363 
8364   uint size1 = 0;
8365   if (min_key) {
8366     size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
8367                                 min_key->key, min_key->keypart_map);
8368     if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
8369         min_key->flag == HA_READ_PREFIX_LAST ||
8370         min_key->flag == HA_READ_AFTER_KEY) {
8371       kd.successor(m_sk_packed_tuple, size1);
8372     }
8373   } else {
8374     kd.get_infimum_key(m_sk_packed_tuple, &size1);
8375   }
8376 
8377   uint size2 = 0;
8378   if (max_key) {
8379     size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
8380                                 max_key->key, max_key->keypart_map);
8381     if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
8382         max_key->flag == HA_READ_PREFIX_LAST ||
8383         max_key->flag == HA_READ_AFTER_KEY) {
8384       kd.successor(m_sk_packed_tuple_old, size2);
8385     }
8386     // Pad the upper key with 0xFF bytes so it compares greater than the lower key
8387     if (size1 > size2) {
8388       memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
8389       size2 = size1;
8390     }
8391   } else {
8392     kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
8393   }
8394 
8395   const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
8396   const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);
8397 
8398   // slice1 >= slice2 means no row will match
8399   if (slice1.compare(slice2) >= 0) {
8400     DBUG_RETURN(HA_EXIT_SUCCESS);
8401   }
8402 
8403   rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
8404                    kd.m_is_reverse_cf ? slice1 : slice2);
8405 
8406   uint64_t sz = 0;
8407   auto disk_size = kd.m_stats.m_actual_disk_size;
8408   if (disk_size == 0)
8409     disk_size = kd.m_stats.m_data_size;
8410   auto rows = kd.m_stats.m_rows;
8411   if (rows == 0 || disk_size == 0) {
8412     rows = 1;
8413     disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
8414   }
8415 
8416 #pragma GCC diagnostic push
8417 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
8418   // Getting statistics, including from Memtables
8419   rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, true);
8420 #pragma GCC diagnostic pop
8421 
8422   ret = rows * sz / disk_size;
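  /*
    Illustrative arithmetic with hypothetical numbers: if the index stats say
    rows = 1,000,000 and disk_size = 100 MB, and GetApproximateSizes() reports
    sz = 1 MB for this range, then ret = 1,000,000 * 1 MB / 100 MB = 10,000
    estimated rows for the range.
  */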
8423 
8424   /*
8425     GetApproximateSizes() gives estimates, so ret might exceed stats.records.
8426     MySQL would then choose a full index scan rather than a range scan, which
8427     is inefficient in most cases.
8428     To prevent this, cap the estimated record count slightly below
8429     stats.records.
8430   */
8431   if (ret >= stats.records) {
8432     ret = stats.records * 0.99;
8433   }
8434 
8435   if (rocksdb_debug_optimizer_n_rows > 0) {
8436     ret = rocksdb_debug_optimizer_n_rows;
8437   } else if (ret == 0) {
8438     ret = 1;
8439   }
8440 
8441   DBUG_RETURN(ret);
8442 }
8443 
8444 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
8445   DBUG_ENTER_FUNC();
8446 
8447   DBUG_ASSERT(create_info != nullptr);
8448 
8449   if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
8450     create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
8451   }
8452 
8453   DBUG_VOID_RETURN;
8454 }
8455 
8456 /**
8457   @brief
8458   Run manual compaction on OPTIMIZE TABLE in RocksDB.
8459   Compaction itself is executed by a RocksDB background thread, but
8460   CompactRange() waits until the compaction completes, so this function
8461   may take a long time.
8462   Since RocksDB data is organized per index id, OPTIMIZE TABLE
8463   triggers manual compaction for all indexes of the table.
8464   @details
8465   The compaction range runs from the beginning of the index id to
8466   the first row of the next index id. When using a reverse-ordered
8467   column family, the first row of the next index id should be
8468   the last row of the previous index id.
8469 */
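/*
  Illustrative usage (hypothetical table): for a MyRocks table t1 with a
  primary key and one secondary index, "OPTIMIZE TABLE t1;" issues one
  blocking CompactRange() call per index, i.e. two manual compactions.
*/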
8470 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
8471   DBUG_ENTER_FUNC();
8472 
8473   DBUG_ASSERT(thd != nullptr);
8474   DBUG_ASSERT(check_opt != nullptr);
8475 
8476   int rc = 0;
8477   for (uint i = 0; i < table->s->keys; i++) {
8478     uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
8479     auto range = get_range(i, buf);
8480     if (!rdb->CompactRange(rocksdb::CompactRangeOptions(),
8481                            m_key_descr_arr[i]->get_cf(), &range.start,
8482                            &range.limit)
8483              .ok()) {
8484       rc = 1;
8485       break;
8486     }
8487   }
8488 
8489   DBUG_RETURN(rc);
8490 }
8491 
8492 int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd,
8493                                 HA_CHECK_OPT *const check_opt) {
8494   DBUG_ENTER_FUNC();
8495 
8496   // find per column family key ranges which need to be queried
8497   std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
8498       ranges;
8499   std::unordered_set<GL_INDEX_ID> ids_to_check;
8500   std::vector<uchar> buf(table_arg->s->keys * 2 *
8501                          Rdb_key_def::INDEX_NUMBER_SIZE);
8502   for (uint i = 0; i < table_arg->s->keys; i++) {
8503     const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE];
8504     const Rdb_key_def &kd = *m_key_descr_arr[i];
8505     ranges[kd.get_cf()].push_back(get_range(i, bufp));
8506     ids_to_check.insert(kd.get_gl_index_id());
8507   }
8508 
8509   // For ANALYZE statements, force a memtable flush to get accurate cardinality
8510   Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
8511   if (thd != nullptr && THDVAR(thd, flush_memtable_on_analyze) &&
8512       !rocksdb_pause_background_work) {
8513     for (auto it : ids_to_check) {
8514       rdb->Flush(rocksdb::FlushOptions(), cf_manager.get_cf(it.cf_id));
8515     }
8516   }
8517 
8518   // get RocksDB table properties for these ranges
8519   rocksdb::TablePropertiesCollection props;
8520   for (auto it : ranges) {
8521     const auto old_size __attribute__((__unused__)) = props.size();
8522     const auto status = rdb->GetPropertiesOfTablesInRange(
8523         it.first, &it.second[0], it.second.size(), &props);
8524     DBUG_ASSERT(props.size() >= old_size);
8525     if (!status.ok())
8526       DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8527   }
8528 
8529   int num_sst = 0;
8530   // group stats per index id
8531   std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
8532   for (const auto &it : ids_to_check) {
8533     // Initialize the stats to 0. If there are no files that contain
8534     // this gl_index_id, then 0 should be stored for the cached stats.
8535     stats[it] = Rdb_index_stats(it);
8536   }
8537   for (const auto &it : props) {
8538     std::vector<Rdb_index_stats> sst_stats;
8539     Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
8540     /*
8541       sst_stats is a list of index statistics for indexes that have entries
8542       in the current SST file.
8543     */
8544     for (const auto &it1 : sst_stats) {
8545       /*
8546         Only update statistics for indexes that belong to this SQL table.
8547 
8548         The reason is: We are walking through all SST files that have
8549         entries from this table (and so can compute good statistics). For
8550         other SQL tables, it can be that we're only seeing a small fraction
8551         of the table's entries (so we can't update statistics based on that).
8552       */
8553       if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end())
8554         continue;
8555 
8556       auto kd = ddl_manager.safe_find(it1.m_gl_index_id);
8557       DBUG_ASSERT(kd != nullptr);
8558       stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length());
8559     }
8560     num_sst++;
8561   }
8562 
8563   // set and persist new stats
8564   ddl_manager.set_stats(stats);
8565   ddl_manager.persist_stats(true);
8566 
8567   DBUG_RETURN(HA_EXIT_SUCCESS);
8568 }
8569 
8570 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
8571   DBUG_ENTER_FUNC();
8572 
8573   if (!table) {
8574     DBUG_RETURN(HA_EXIT_FAILURE);
8575   }
8576 
8577   DBUG_RETURN(calculate_stats(table, thd, check_opt));
8578 }
8579 
8580 void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
8581                                     ulonglong nb_desired_values,
8582                                     ulonglong *const first_value,
8583                                     ulonglong *const nb_reserved_values) {
8584   /*
8585     MySQL has a somewhat complicated way of handling the auto-increment value.
8586     The first time get_auto_increment is called for a statement,
8587     nb_desired_values is the estimate for how many values will be needed.  The
8588     engine can then reserve some values, and those will be automatically used
8589     by MySQL, until a hard-coded value shows up in the insert statement, after
8590     which MySQL again calls this function to reset its starting value.
8591
8592     For simplicity we will just ignore nb_desired_values - we aren't going to
8593     reserve any extra values for a multi-insert statement.  Each row will
8594     simply acquire the next value as needed and we will always tell MySQL that
8595     we only reserved 1 value.  Since we are using an atomic value for
8596     m_auto_incr_val this should be safe - if we had to grab a mutex, doing
8597     an actual reserve of some values might be a better solution.
8598    */
8599   DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");
8600 
8601   if (off > inc) {
8602     off = 1;
8603   }
8604 
8605   // Local variable reference to simplify code below
8606   std::atomic<longlong> &auto_incr = m_tbl_def->m_auto_incr_val;
8607 
8608   if (inc == 1 && off == 1) {
8609     // Optimization for the standard case where we are always simply
8610     // incrementing from the last position
8611     *first_value = auto_incr++;
8612   } else {
8613     // The next value can be more complicated if either `inc` or `off` is not 1
8614     longlong new_val;
8615     longlong last_val = auto_incr;
8616 
8617     // Loop until we can correctly update the atomic value
8618     do {
8619       // Calculate the next value in the auto increment series:
8620       //   offset + N * increment
8621       // where N is 0, 1, 2, ...
8622       //
8623       // For further information please visit:
8624       // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
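      //
      // Worked example with hypothetical values: for inc = 5, off = 3 and
      // last_val = 7, new_val = ((7 + (5 - 3) - 1) / 5) * 5 + 3 = 8, i.e. the
      // smallest member of the series 3, 8, 13, ... that is >= last_val.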
8625       new_val = ((last_val + (inc - off) - 1) / inc) * inc + off;
8626 
8627       // Attempt to store the new value (plus 1 since m_auto_incr_val contains
8628       // the next available value) into the atomic value.  If the current
8629       // value no longer matches what we have in 'last_val' this will fail and
8630       // we will repeat the loop (`last_val` will automatically get updated
8631       // with the current value).
8632     } while (!auto_incr.compare_exchange_weak(last_val, new_val + 1));
8633 
8634     *first_value = new_val;
8635   }
8636 
8637   *nb_reserved_values = 1;
8638 }
8639 
8640 #ifndef DBUG_OFF
8641 
8642 /* Helper function for use from a debugger */
8643 static char dbug_item_print_buf[512];
8644 
8645 const char *dbug_print_item(Item *const item) {
8646   char *const buf = dbug_item_print_buf;
8647   String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
8648   str.length(0);
8649   if (!item)
8650     return "(Item*)nullptr";
8651   item->print(&str, QT_ORDINARY);
8652   if (str.c_ptr() == buf)
8653     return buf;
8654   else
8655     return "Couldn't fit into buffer";
8656 }
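/*
  This helper is meant to be invoked from a debugger rather than from code,
  e.g. (illustrative):

    (gdb) print dbug_print_item(item)

  where `item` is any Item pointer visible in the current frame.
*/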
8657 
8658 #endif /*DBUG_OFF*/
8659 
8660 /**
8661   SQL layer calls this function to push an index condition.
8662 
8663   @details
8664     The condition is for index keyno (only one condition can be pushed at a
8665     time).
8666     The caller guarantees that condition refers only to index fields; besides
8667     that, fields must have
8668 
8669       $field->part_of_key.set_bit(keyno)
8670 
8671     which means that
8672 
8673        (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
8674 
8675     which means that field value can be restored from the index tuple.
8676 
8677   @return
8678     Part of condition we couldn't check (always nullptr).
8679 */
8680 
8681 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
8682   DBUG_ENTER_FUNC();
8683 
8684   DBUG_ASSERT(keyno != MAX_KEY);
8685   DBUG_ASSERT(idx_cond != nullptr);
8686 
8687   pushed_idx_cond = idx_cond;
8688   pushed_idx_cond_keyno = keyno;
8689   in_range_check_pushed_down = TRUE;
8690 
8691   /* We will check the whole condition */
8692   DBUG_RETURN(nullptr);
8693 }
8694 
8695 /*
8696   @brief
8697   Check the index condition.
8698 
8699   @detail
8700   Check the index condition. (The caller has unpacked all needed index
8701   columns into table->record[0])
8702 
8703   @return
8704     ICP_NO_MATCH - Condition not satisfied (caller should continue
8705                    scanning)
8706     OUT_OF_RANGE - We've left the range we're scanning (caller should
8707                    stop scanning and return HA_ERR_END_OF_FILE)
8708 
8709     ICP_MATCH    - Condition is satisfied (caller should fetch the record
8710                    and return it)
8711 */
8712 
8713 enum icp_result ha_rocksdb::check_index_cond() const {
8714   DBUG_ASSERT(pushed_idx_cond);
8715   DBUG_ASSERT(pushed_idx_cond_keyno != MAX_KEY);
8716 
8717   if (end_range && compare_key_icp(end_range) > 0) {
8718     /* caller should return HA_ERR_END_OF_FILE already */
8719     return ICP_OUT_OF_RANGE;
8720   }
8721 
8722   return pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
8723 }
8724 
8725 /**
8726   Check whether an index scan is ascending.
8727 
8728   @detail
8729   Currently RocksDB does not support bloom filters for
8730   prefix lookup + descending scan, but it does support them for
8731   prefix lookup + ascending scan. This function returns true only if
8732   the scan pattern is strictly ascending.
8733   @param kd
8734   @param find_flag
8735 */
8736 bool ha_rocksdb::is_ascending(const Rdb_key_def &kd,
8737                               enum ha_rkey_function find_flag) const {
8738   bool is_ascending;
8739   switch (find_flag) {
8740   case HA_READ_KEY_EXACT: {
8741     is_ascending = !kd.m_is_reverse_cf;
8742     break;
8743   }
8744   case HA_READ_PREFIX: {
8745     is_ascending = true;
8746     break;
8747   }
8748   case HA_READ_KEY_OR_NEXT:
8749   case HA_READ_AFTER_KEY: {
8750     is_ascending = !kd.m_is_reverse_cf;
8751     break;
8752   }
8753   case HA_READ_KEY_OR_PREV:
8754   case HA_READ_BEFORE_KEY:
8755   case HA_READ_PREFIX_LAST:
8756   case HA_READ_PREFIX_LAST_OR_PREV: {
8757     is_ascending = kd.m_is_reverse_cf;
8758     break;
8759   }
8760   default:
8761     is_ascending = false;
8762   }
8763   return is_ascending;
8764 }
8765 
8766 /*
8767   Checks if inplace alter is supported for a given operation.
8768 */
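/*
  Illustrative examples (hypothetical table t1): "ALTER TABLE t1 ADD INDEX
  idx(a)" and "ALTER TABLE t1 DROP INDEX idx" only set the index add/drop
  handler flags and are accepted below; a change such as
  "ALTER TABLE t1 MODIFY COLUMN a BIGINT" sets other flags and therefore
  falls back to a copying ALTER.
*/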
8769 
8770 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
8771     TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
8772   DBUG_ENTER_FUNC();
8773 
8774   DBUG_ASSERT(ha_alter_info != nullptr);
8775 
8776   if (ha_alter_info->handler_flags &
8777       ~(my_core::Alter_inplace_info::DROP_INDEX |
8778         my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
8779         my_core::Alter_inplace_info::ADD_INDEX |
8780         my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
8781     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
8782   }
8783 
8784   /* We don't support adding unique keys to tables with no primary key */
8785   if ((ha_alter_info->handler_flags &
8786        my_core::Alter_inplace_info::ADD_UNIQUE_INDEX) &&
8787       has_hidden_pk(altered_table)) {
8788     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
8789   }
8790 
8791   DBUG_RETURN(my_core::HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
8792 }
8793 
8794 /**
8795   Allows the storage engine to update internal structures with concurrent
8796   writes blocked. If check_if_supported_inplace_alter() returns
8797   HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE or
8798   HA_ALTER_INPLACE_SHARED_AFTER_PREPARE, this function is called with an
8799   exclusive lock; otherwise, the same level of locking as for
8800   inplace_alter_table() will be used.
8801 
8802   @note Storage engines are responsible for reporting any errors by
8803   calling my_error()/print_error()
8804 
8805   @note If this function reports error, commit_inplace_alter_table()
8806   will be called with commit= false.
8807 
8808   @note For partitioning, failing to prepare one partition means that
8809   commit_inplace_alter_table() will be called to roll back changes for
8810   all partitions. This means that commit_inplace_alter_table() might be
8811   called without prepare_inplace_alter_table() having been called first
8812   for a given partition.
8813 
8814   @param    altered_table     TABLE object for new version of table.
8815   @param    ha_alter_info     Structure describing changes to be done
8816                               by ALTER TABLE and holding data used
8817                               during in-place alter.
8818 
8819   @retval   true              Error
8820   @retval   false             Success
8821 */
8822 bool ha_rocksdb::prepare_inplace_alter_table(
8823     TABLE *const altered_table,
8824     my_core::Alter_inplace_info *const ha_alter_info) {
8825   DBUG_ENTER_FUNC();
8826 
8827   DBUG_ASSERT(altered_table != nullptr);
8828   DBUG_ASSERT(ha_alter_info != nullptr);
8829 
8830   const uint n_dropped_keys = ha_alter_info->index_drop_count;
8831   const uint n_added_keys = ha_alter_info->index_add_count;
8832   const uint old_n_keys = m_tbl_def->m_key_count;
8833   uint new_n_keys = altered_table->s->keys;
8834 
8835   if (has_hidden_pk(altered_table)) {
8836     new_n_keys += 1;
8837   }
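  // Illustrative example: a table created with two secondary indexes but no
  // explicit PRIMARY KEY has altered_table->s->keys == 2, and since MyRocks
  // also stores a hidden PK, new_n_keys becomes 3 here.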
8838 
8839   std::shared_ptr<Rdb_key_def> *const old_key_descr =
8840       m_tbl_def->m_key_descr_arr;
8841   std::shared_ptr<Rdb_key_def> *const new_key_descr =
8842       new std::shared_ptr<Rdb_key_def>[new_n_keys];
8843 
8844   Rdb_tbl_def *const new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
8845   new_tdef->m_key_descr_arr = new_key_descr;
8846   new_tdef->m_key_count = new_n_keys;
8847   new_tdef->m_auto_incr_val =
8848       m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
8849   new_tdef->m_hidden_pk_val =
8850       m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);
8851 
8852   if (ha_alter_info->handler_flags &
8853           (my_core::Alter_inplace_info::DROP_INDEX |
8854            my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
8855            my_core::Alter_inplace_info::ADD_INDEX |
8856            my_core::Alter_inplace_info::ADD_UNIQUE_INDEX) &&
8857       create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
8858     /* Delete the new key descriptors */
8859     delete[] new_key_descr;
8860 
8861     /*
8862       Explicitly mark as nullptr so we don't accidentally remove entries
8863       from data dictionary on cleanup (or cause double delete[]).
8864     */
8865     new_tdef->m_key_descr_arr = nullptr;
8866     delete new_tdef;
8867 
8868     my_printf_error(ER_UNKNOWN_ERROR,
8869                     "MyRocks failed creating new key definitions during alter.",
8870                     MYF(0));
8871     DBUG_RETURN(HA_EXIT_FAILURE);
8872   }
8873 
8874   std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
8875   std::unordered_set<GL_INDEX_ID> dropped_index_ids;
8876 
8877   uint i;
8878   uint j;
8879   const KEY *key;
8880 
8881   /* Determine which (if any) key definition(s) need to be dropped */
8882   for (i = 0; i < ha_alter_info->index_drop_count; i++) {
8883     const KEY *const key = ha_alter_info->index_drop_buffer[i];
8884     for (j = 0; j < old_n_keys; j++) {
8885       if (!old_key_descr[j]->m_name.compare(key->name)) {
8886         dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
8887         break;
8888       }
8889     }
8890   }
8891 
8892   DBUG_ASSERT(dropped_index_ids.size() == ha_alter_info->index_drop_count);
8893 
8894   /* Determine which (if any) key definition(s) need to be added */
8895   for (i = 0; i < ha_alter_info->index_add_count; i++) {
8896     key = &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
8897     for (j = 0; j < new_n_keys; j++) {
8898       if (!new_key_descr[j]->m_name.compare(key->name)) {
8899         added_indexes.insert(new_key_descr[j]);
8900         break;
8901       }
8902     }
8903   }
8904 
8905   DBUG_ASSERT(added_indexes.size() == ha_alter_info->index_add_count);
8906 
8907   ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
8908       new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
8909       added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys);
8910 
8911   DBUG_RETURN(HA_EXIT_SUCCESS);
8912 }
8913 
8914 /**
8915   Alter the table structure in-place with operations specified using
8916   HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
8917   during this operation depends on the return value from
8918   check_if_supported_inplace_alter().
8919 
8920   @note Storage engines are responsible for reporting any errors by
8921   calling my_error()/print_error()
8922 
8923   @note If this function reports error, commit_inplace_alter_table()
8924   will be called with commit= false.
8925 
8926   @param    altered_table     TABLE object for new version of table.
8927   @param    ha_alter_info     Structure describing changes to be done
8928                               by ALTER TABLE and holding data used
8929                               during in-place alter.
8930 
8931   @retval   true              Error
8932   @retval   false             Success
8933 */
8934 bool ha_rocksdb::inplace_alter_table(
8935     TABLE *const altered_table,
8936     my_core::Alter_inplace_info *const ha_alter_info) {
8937   DBUG_ENTER_FUNC();
8938 
8939   DBUG_ASSERT(altered_table != nullptr);
8940   DBUG_ASSERT(ha_alter_info != nullptr);
8941   DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);
8942 
8943   Rdb_inplace_alter_ctx *const ctx =
8944       static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);
8945 
8946   if (ha_alter_info->handler_flags &
8947       (my_core::Alter_inplace_info::ADD_INDEX |
8948        my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
8949     /*
8950       Buffers need to be set up again to account for new, possibly longer
8951       secondary keys.
8952     */
8953     free_key_buffers();
8954 
8955     DBUG_ASSERT(ctx != nullptr);
8956 
8957     /*
8958       If adding a unique index, allocate special buffers for duplicate checking.
8959     */
8960     int err;
8961     if ((err = alloc_key_buffers(
8962              altered_table, ctx->m_new_tdef,
8963              ha_alter_info->handler_flags &
8964                  my_core::Alter_inplace_info::ADD_UNIQUE_INDEX))) {
8965       my_printf_error(ER_UNKNOWN_ERROR,
8966                       "MyRocks failed allocating key buffers during alter.",
8967                       MYF(0));
8968       DBUG_RETURN(err);
8969     }
8970 
8971     /* Populate all new secondary keys by scanning the primary key. */
8972     if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
8973       my_printf_error(ER_UNKNOWN_ERROR,
8974                       "MyRocks failed populating secondary key during alter.",
8975                       MYF(0));
8976       DBUG_RETURN(HA_EXIT_FAILURE);
8977     }
8978   }
8979 
8980   DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
8981     dbug_create_err_inplace_alter();
8982     DBUG_RETURN(HA_EXIT_FAILURE);
8983   };);
8984 
8985   DBUG_RETURN(HA_EXIT_SUCCESS);
8986 }
8987 
8988 /**
8989  Scan the Primary Key index entries and populate the new secondary keys.
8990 */
8991 int ha_rocksdb::inplace_populate_sk(
8992     TABLE *const new_table_arg,
8993     const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
8994   DBUG_ENTER_FUNC();
8995 
8996   const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
8997   rocksdb::WriteBatch *const batch = wb.get();
8998 
8999   /* Update the data dictionary */
9000   std::unordered_set<GL_INDEX_ID> create_index_ids;
9001   for (const auto &index : indexes) {
9002     create_index_ids.insert(index->get_gl_index_id());
9003   }
9004   dict_manager.add_create_index(create_index_ids, batch);
9005   dict_manager.commit(batch);
9006 
9007   const bool hidden_pk_exists = has_hidden_pk(table);
9008 
9009   int res = 0;
9010   Rdb_transaction *tx = get_or_create_tx(table->in_use);
9011 
9012   /*
9013     There is one specific scenario where m_sst_info may not be nullptr. This
9014     happens if the handler we're using happens to be the handler where the PK
9015     bulk load was done on. The sequence of events that leads to this is as
9016     follows (T1 is PK bulk load, T2 is SK alter table):
9017 
9018     T1: Execute last INSERT statement
9019     T1: Return TABLE and handler object back to Table_cache_manager
9020     T1: Close connection
9021     T2: Execute ALTER statement
9022     T2: Take same TABLE/handler from Table_cache_manager
9023     T2: Call closefrm which will call finalize_bulk_load on every other open
9024         table/handler *except* the one it's on.
9025     T2: Acquire stale snapshot of PK
9026     T1: Call finalize_bulk_load
9027 
9028     This is rare because usually, closefrm will call the destructor (and thus
9029     finalize_bulk_load) on the handler where PK bulk load is done. However, if
9030     the thread ids of the bulk load thread and the alter thread differ by a
9031     multiple of table_cache_instances (8 by default), then they hash to the
9032     same bucket in Table_cache_manager and the alter thread will not call
9033     the destructor on the handler it is holding. Thus, its m_sst_info will not
9034     be nullptr.
9035 
9036     At this point, it is safe to refresh the snapshot because we know all other
9037     open handlers have been closed, and the one we're on is the
9038     only one left.
9039   */
9040   if (m_sst_info != nullptr) {
9041     if ((res = finalize_bulk_load())) {
9042       DBUG_RETURN(res);
9043     }
9044     tx->commit();
9045   }
9046 
9047   const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
9048   const ulonglong rdb_merge_combine_read_size =
9049       THDVAR(ha_thd(), merge_combine_read_size);
9050 
9051   for (const auto &index : indexes) {
9052     const rocksdb::Comparator *index_comp = index->get_cf()->GetComparator();
9053     bool is_unique_index =
9054         new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
9055 
9056     Rdb_index_merge rdb_merge(thd_rocksdb_tmpdir(), rdb_merge_buf_size,
9057                               rdb_merge_combine_read_size, index_comp);
9058 
9059     if ((res = rdb_merge.init())) {
9060       DBUG_RETURN(res);
9061     }
9062 
9063     /*
9064       Note: We pass in the currently existing table + tbl_def object here,
9065       as the pk index position may have changed in the case of hidden primary
9066       keys.
9067     */
9068     const uint pk = pk_index(table, m_tbl_def);
9069     ha_index_init(pk, true);
9070 
9071     /* Scan each record in the primary key in order */
9072     for (res = index_first(table->record[0]); res == 0;
9073          res = index_next(table->record[0])) {
9074       longlong hidden_pk_id = 0;
9075       if (hidden_pk_exists && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
9076         // NO_LINT_DEBUG
9077         sql_print_error("Error retrieving hidden pk id.");
9078         ha_index_end();
9079         DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
9080       }
9081 
9082       /* Create new secondary index entry */
9083       const int new_packed_size = index->pack_record(
9084           new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
9085           &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id);
9086 
9087       const rocksdb::Slice key = rocksdb::Slice(
9088           reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
9089       const rocksdb::Slice val =
9090           rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
9091                          m_sk_tails.get_current_pos());
9092 
9093       /*
9094         Add record to offset tree in preparation for writing out to
9095         disk in sorted chunks.
9096       */
9097       if ((res = rdb_merge.add(key, val))) {
9098         ha_index_end();
9099         DBUG_RETURN(res);
9100       }
9101     }
9102 
9103     if (res != HA_ERR_END_OF_FILE) {
9104       // NO_LINT_DEBUG
9105       sql_print_error("Error retrieving index entry from primary key.");
9106       ha_index_end();
9107       DBUG_RETURN(res);
9108     }
9109 
9110     ha_index_end();
9111 
9112     /*
9113       Perform an n-way merge of the n sorted buffers on disk, then write all
9114       results to RocksDB via the SSTFileWriter API.
9115     */
9116     rocksdb::Slice merge_key;
9117     rocksdb::Slice merge_val;
9118 
9119     struct unique_sk_buf_info sk_info;
9120     sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
9121     sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
9122 
9123     while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
9124       /* Perform uniqueness check if needed */
9125       if (is_unique_index) {
9126         if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
9127           /*
9128             Duplicate entry found when trying to create unique secondary key.
9129             We need to unpack the record into new_table_arg->record[0] as it
9130             is used inside print_keydup_error so that the error message shows
9131             the duplicate record.
9132           */
9133           if (index->unpack_record(new_table_arg, new_table_arg->record[0],
9134                                    &merge_key, nullptr,
9135                                    m_verify_row_debug_checksums)) {
9136             /* Should never reach here */
9137             DBUG_ASSERT(0);
9138           }
9139 
9140           print_keydup_error(new_table_arg,
9141                              &new_table_arg->key_info[index->get_keyno()],
9142                              MYF(0));
9143           DBUG_RETURN(ER_DUP_ENTRY);
9144         }
9145       }
9146 
9147       /*
9148         Insert key and slice to SST via SSTFileWriter API.
9149       */
9150       if ((res = bulk_load_key(tx, *index, merge_key, merge_val))) {
9151         break;
9152       }
9153     }
9154 
9155     /*
9156       Here, res == -1 means that we are finished, while > 0 means an error
9157       occurred.
9158     */
9159     if (res > 0) {
9160       // NO_LINT_DEBUG
9161       sql_print_error("Error while bulk loading keys in external merge sort.");
9162       DBUG_RETURN(res);
9163     }
9164 
9165     if ((res = tx->finish_bulk_load())) {
9166       // NO_LINT_DEBUG
9167       sql_print_error("Error finishing bulk load.");
9168       DBUG_RETURN(res);
9169     }
9170   }
9171 
9172   /*
9173     Explicitly tell jemalloc to clean up any unused dirty pages at this point.
9174     See https://reviews.facebook.net/D63723 for more details.
9175   */
9176   purge_all_jemalloc_arenas();
9177 
9178   DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
9179   DBUG_RETURN(res);
9180 }
9181 
9182 /**
9183   Commit or rollback the changes made during prepare_inplace_alter_table()
9184   and inplace_alter_table() inside the storage engine.
9185   Note that in case of rollback the allowed level of concurrency during
9186   this operation will be the same as for inplace_alter_table() and thus
9187   might be higher than during prepare_inplace_alter_table(). (For example,
9188   concurrent writes were blocked during prepare, but might not be during
9189   rollback).
9190 
9191   @note Storage engines are responsible for reporting any errors by
9192   calling my_error()/print_error()
9193 
9194   @note If this function with commit= true reports error, it will be called
9195   again with commit= false.
9196 
9197   @note In case of partitioning, this function might be called for rollback
9198   without prepare_inplace_alter_table() having been called first.
9199   Also, partitioned tables set ha_alter_info->group_commit_ctx to a
9200   NULL-terminated array of the partitions' handlers, and if all of them are
9201   committed as one, then group_commit_ctx should be set to NULL to indicate
9202   to the partitioning handler that all partition handlers are committed.
9203   @see prepare_inplace_alter_table().
9204 
9205   @param    altered_table     TABLE object for new version of table.
9206   @param    ha_alter_info     Structure describing changes to be done
9207                               by ALTER TABLE and holding data used
9208                               during in-place alter.
9209   @param    commit            True => Commit, False => Rollback.
9210 
9211   @retval   true              Error
9212   @retval   false             Success
9213 */
9214 bool ha_rocksdb::commit_inplace_alter_table(
9215     my_core::TABLE *const altered_table,
9216     my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
9217   DBUG_ENTER_FUNC();
9218 
9219   DBUG_ASSERT(altered_table != nullptr);
9220   DBUG_ASSERT(ha_alter_info != nullptr);
9221 
9222   Rdb_inplace_alter_ctx *const ctx0 =
9223       static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);
9224 
9225   /*
9226     IMPORTANT: When a rollback is requested, MySQL will abort with
9227     an assertion failure. That means every failed commit during inplace ALTER
9228     TABLE results in a fatal error on the server. Indexes whose creation was
9229     still ongoing will be detected when the server restarts, and dropped.
9230 
9231     For partitioned tables, a rollback call to this function (commit == false)
9232     is done for each partition.  A successful commit call only executes once
9233     for all partitions.
9234   */
9235   if (!commit) {
9236     /* If ctx has not been created yet, nothing to do here */
9237     if (!ctx0) {
9238       DBUG_RETURN(HA_EXIT_SUCCESS);
9239     }
9240 
9241     /*
9242       Cannot call destructor for Rdb_tbl_def directly because we don't want to
9243       erase the mappings inside the ddl_manager, as the old_key_descr is still
9244       using them.
9245     */
9246     if (ctx0->m_new_key_descr) {
9247       /* Delete the new key descriptors */
9248       for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
9249         ctx0->m_new_key_descr[i] = nullptr;
9250       }
9251 
9252       delete[] ctx0->m_new_key_descr;
9253       ctx0->m_new_key_descr = nullptr;
9254       ctx0->m_new_tdef->m_key_descr_arr = nullptr;
9255 
9256       delete ctx0->m_new_tdef;
9257     }
9258 
9259     /* Rollback any partially created indexes */
9260     dict_manager.rollback_ongoing_index_creation();
9261 
9262     DBUG_RETURN(HA_EXIT_SUCCESS);
9263   }
9264 
9265   DBUG_ASSERT(ctx0);
9266 
9267   /*
9268     For partitioned tables, we need to commit all changes to all tables at
9269     once, unlike in the other inplace alter API methods.
9270   */
9271   inplace_alter_handler_ctx **ctx_array;
9272   inplace_alter_handler_ctx *ctx_single[2];
9273 
9274   if (ha_alter_info->group_commit_ctx) {
9275     DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
9276     ctx_array = ha_alter_info->group_commit_ctx;
9277   } else {
9278     ctx_single[0] = ctx0;
9279     ctx_single[1] = nullptr;
9280     ctx_array = ctx_single;
9281   }
9282 
9283   DBUG_ASSERT(ctx0 == ctx_array[0]);
9284   ha_alter_info->group_commit_ctx = nullptr;
9285 
9286   if (ha_alter_info->handler_flags &
9287       (my_core::Alter_inplace_info::DROP_INDEX |
9288        my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
9289        my_core::Alter_inplace_info::ADD_INDEX |
9290        my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
9291     const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
9292     rocksdb::WriteBatch *const batch = wb.get();
9293     std::unordered_set<GL_INDEX_ID> create_index_ids;
9294 
9295     m_tbl_def = ctx0->m_new_tdef;
9296     m_key_descr_arr = m_tbl_def->m_key_descr_arr;
9297     m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];
9298 
9299     dict_manager.lock();
9300     for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
9301       Rdb_inplace_alter_ctx *const ctx =
9302           static_cast<Rdb_inplace_alter_ctx *>(*pctx);
9303 
9304       /* Mark indexes to be dropped */
9305       dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);
9306 
9307       for (const auto &index : ctx->m_added_indexes) {
9308         create_index_ids.insert(index->get_gl_index_id());
9309       }
9310 
9311       if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
9312         /*
9313           Failed to write new entry into data dictionary, this should never
9314           happen.
9315         */
9316         DBUG_ASSERT(0);
9317       }
9318     }
9319 
9320     if (dict_manager.commit(batch)) {
9321       /*
9322         Should never reach here. We assume MyRocks will abort if commit fails.
9323       */
9324       DBUG_ASSERT(0);
9325     }
9326 
9327     dict_manager.unlock();
9328 
9329     /* Mark ongoing create indexes as finished/remove from data dictionary */
9330     dict_manager.finish_indexes_operation(
9331         create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
9332 
9333     /*
9334       We need to recalculate the index stats here manually.  The reason is that
9335       the secondary index does not exist inside
9336       m_index_num_to_keydef until it is committed to the data dictionary, which
9337       prevents us from updating the stats normally as the ddl_manager cannot
9338       find the proper gl_index_ids yet during adjust_stats calls.
9339     */
9340     if (calculate_stats(altered_table, nullptr, nullptr)) {
9341       /* Failed to update index statistics, should never happen */
9342       DBUG_ASSERT(0);
9343     }
9344 
9345     rdb_drop_idx_thread.signal();
9346   }
9347 
9348   DBUG_RETURN(HA_EXIT_SUCCESS);
9349 }
9350 
9351 #define SHOW_FNAME(name) rocksdb_show_##name
9352 
9353 #define DEF_SHOW_FUNC(name, key)                                               \
9354   static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) {     \
9355     rocksdb_status_counters.name =                                             \
9356         rocksdb_stats->getTickerCount(rocksdb::key);                           \
9357     var->type = SHOW_LONGLONG;                                                 \
9358     var->value = (char *)&rocksdb_status_counters.name;                        \
9359     return HA_EXIT_SUCCESS;                                                    \
9360   }
9361 
9362 #define DEF_STATUS_VAR(name)                                                   \
9363   { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }
9364 
9365 #define DEF_STATUS_VAR_PTR(name, ptr, option)                                  \
9366   { "rocksdb_" name, (char *)ptr, option }
9367 
9368 #define DEF_STATUS_VAR_FUNC(name, ptr, option)                                 \
9369   { name, reinterpret_cast<char *>(ptr), option }
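/*
  Illustrative expansion of the macros above (spelled out here only for
  clarity):

    DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)

  defines

    static int rocksdb_show_block_cache_miss(MYSQL_THD thd, SHOW_VAR *var,
                                             char *buff) {
      rocksdb_status_counters.block_cache_miss =
          rocksdb_stats->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
      var->type = SHOW_LONGLONG;
      var->value = (char *)&rocksdb_status_counters.block_cache_miss;
      return HA_EXIT_SUCCESS;
    }

  and DEF_STATUS_VAR(block_cache_miss) produces the SHOW_VAR entry
  { "rocksdb_block_cache_miss", (char *)&rocksdb_show_block_cache_miss,
    SHOW_FUNC }, which is what exposes the counter via SHOW GLOBAL STATUS.
*/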
9370 
9371 struct rocksdb_status_counters_t {
9372   uint64_t block_cache_miss;
9373   uint64_t block_cache_hit;
9374   uint64_t block_cache_add;
9375   uint64_t block_cache_index_miss;
9376   uint64_t block_cache_index_hit;
9377   uint64_t block_cache_filter_miss;
9378   uint64_t block_cache_filter_hit;
9379   uint64_t block_cache_data_miss;
9380   uint64_t block_cache_data_hit;
9381   uint64_t bloom_filter_useful;
9382   uint64_t memtable_hit;
9383   uint64_t memtable_miss;
9384   uint64_t compaction_key_drop_new;
9385   uint64_t compaction_key_drop_obsolete;
9386   uint64_t compaction_key_drop_user;
9387   uint64_t number_keys_written;
9388   uint64_t number_keys_read;
9389   uint64_t number_keys_updated;
9390   uint64_t bytes_written;
9391   uint64_t bytes_read;
9392   uint64_t no_file_closes;
9393   uint64_t no_file_opens;
9394   uint64_t no_file_errors;
9395   uint64_t l0_slowdown_micros;
9396   uint64_t memtable_compaction_micros;
9397   uint64_t l0_num_files_stall_micros;
9398   uint64_t rate_limit_delay_millis;
9399   uint64_t num_iterators;
9400   uint64_t number_multiget_get;
9401   uint64_t number_multiget_keys_read;
9402   uint64_t number_multiget_bytes_read;
9403   uint64_t number_deletes_filtered;
9404   uint64_t number_merge_failures;
9405   uint64_t bloom_filter_prefix_checked;
9406   uint64_t bloom_filter_prefix_useful;
9407   uint64_t number_reseeks_iteration;
9408   uint64_t getupdatessince_calls;
9409   uint64_t block_cachecompressed_miss;
9410   uint64_t block_cachecompressed_hit;
9411   uint64_t wal_synced;
9412   uint64_t wal_bytes;
9413   uint64_t write_self;
9414   uint64_t write_other;
9415   uint64_t write_timedout;
9416   uint64_t write_wal;
9417   uint64_t flush_write_bytes;
9418   uint64_t compact_read_bytes;
9419   uint64_t compact_write_bytes;
9420   uint64_t number_superversion_acquires;
9421   uint64_t number_superversion_releases;
9422   uint64_t number_superversion_cleanups;
9423   uint64_t number_block_not_compressed;
9424 };
9425 
9426 static rocksdb_status_counters_t rocksdb_status_counters;
9427 
9428 DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
9429 DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
9430 DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
9431 DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
9432 DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
9433 DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
9434 DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
9435 DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
9436 DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
9437 DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
9438 DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
9439 DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
9440 DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
9441 DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
9442 DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
9443 DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
9444 DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
9445 DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
9446 DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
9447 DEF_SHOW_FUNC(bytes_read, BYTES_READ)
9448 DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
9449 DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
9450 DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
9451 DEF_SHOW_FUNC(l0_slowdown_micros, STALL_L0_SLOWDOWN_MICROS)
9452 DEF_SHOW_FUNC(memtable_compaction_micros, STALL_MEMTABLE_COMPACTION_MICROS)
9453 DEF_SHOW_FUNC(l0_num_files_stall_micros, STALL_L0_NUM_FILES_MICROS)
9454 DEF_SHOW_FUNC(rate_limit_delay_millis, RATE_LIMIT_DELAY_MILLIS)
9455 DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
9456 DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
9457 DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
9458 DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
9459 DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
9460 DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
9461 DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
9462 DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
9463 DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
9464 DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
9465 DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
9466 DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
9467 DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
9468 DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
9469 DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
9470 DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
9471 DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
9472 DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
9473 DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
9474 DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
9475 DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
9476 DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
9477 DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
9478 DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
9479 DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
9480 
9481 static void myrocks_update_status() {
9482   export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
9483   export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
9484   export_stats.rows_read = global_stats.rows[ROWS_READ];
9485   export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
9486 
9487   export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
9488   export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
9489   export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
9490   export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
9491 }
9492 
9493 static SHOW_VAR myrocks_status_variables[] = {
9494     DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
9495                         SHOW_LONGLONG),
9496     DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
9497                         SHOW_LONGLONG),
9498     DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
9499     DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
9500                         SHOW_LONGLONG),
9501     DEF_STATUS_VAR_FUNC("system_rows_deleted",
9502                         &export_stats.system_rows_deleted, SHOW_LONGLONG),
9503     DEF_STATUS_VAR_FUNC("system_rows_inserted",
9504                         &export_stats.system_rows_inserted, SHOW_LONGLONG),
9505     DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
9506                         SHOW_LONGLONG),
9507     DEF_STATUS_VAR_FUNC("system_rows_updated",
9508                         &export_stats.system_rows_updated, SHOW_LONGLONG),
9509 
9510     {NullS, NullS, SHOW_LONG}};
9511 
9512 static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
9513   myrocks_update_status();
9514   var->type = SHOW_ARRAY;
9515   var->value = reinterpret_cast<char *>(&myrocks_status_variables);
9516 }
9517 
9518 static SHOW_VAR rocksdb_status_vars[] = {
9519     DEF_STATUS_VAR(block_cache_miss),
9520     DEF_STATUS_VAR(block_cache_hit),
9521     DEF_STATUS_VAR(block_cache_add),
9522     DEF_STATUS_VAR(block_cache_index_miss),
9523     DEF_STATUS_VAR(block_cache_index_hit),
9524     DEF_STATUS_VAR(block_cache_filter_miss),
9525     DEF_STATUS_VAR(block_cache_filter_hit),
9526     DEF_STATUS_VAR(block_cache_data_miss),
9527     DEF_STATUS_VAR(block_cache_data_hit),
9528     DEF_STATUS_VAR(bloom_filter_useful),
9529     DEF_STATUS_VAR(memtable_hit),
9530     DEF_STATUS_VAR(memtable_miss),
9531     DEF_STATUS_VAR(compaction_key_drop_new),
9532     DEF_STATUS_VAR(compaction_key_drop_obsolete),
9533     DEF_STATUS_VAR(compaction_key_drop_user),
9534     DEF_STATUS_VAR(number_keys_written),
9535     DEF_STATUS_VAR(number_keys_read),
9536     DEF_STATUS_VAR(number_keys_updated),
9537     DEF_STATUS_VAR(bytes_written),
9538     DEF_STATUS_VAR(bytes_read),
9539     DEF_STATUS_VAR(no_file_closes),
9540     DEF_STATUS_VAR(no_file_opens),
9541     DEF_STATUS_VAR(no_file_errors),
9542     DEF_STATUS_VAR(l0_slowdown_micros),
9543     DEF_STATUS_VAR(memtable_compaction_micros),
9544     DEF_STATUS_VAR(l0_num_files_stall_micros),
9545     DEF_STATUS_VAR(rate_limit_delay_millis),
9546     DEF_STATUS_VAR(num_iterators),
9547     DEF_STATUS_VAR(number_multiget_get),
9548     DEF_STATUS_VAR(number_multiget_keys_read),
9549     DEF_STATUS_VAR(number_multiget_bytes_read),
9550     DEF_STATUS_VAR(number_deletes_filtered),
9551     DEF_STATUS_VAR(number_merge_failures),
9552     DEF_STATUS_VAR(bloom_filter_prefix_checked),
9553     DEF_STATUS_VAR(bloom_filter_prefix_useful),
9554     DEF_STATUS_VAR(number_reseeks_iteration),
9555     DEF_STATUS_VAR(getupdatessince_calls),
9556     DEF_STATUS_VAR(block_cachecompressed_miss),
9557     DEF_STATUS_VAR(block_cachecompressed_hit),
9558     DEF_STATUS_VAR(wal_synced),
9559     DEF_STATUS_VAR(wal_bytes),
9560     DEF_STATUS_VAR(write_self),
9561     DEF_STATUS_VAR(write_other),
9562     DEF_STATUS_VAR(write_timedout),
9563     DEF_STATUS_VAR(write_wal),
9564     DEF_STATUS_VAR(flush_write_bytes),
9565     DEF_STATUS_VAR(compact_read_bytes),
9566     DEF_STATUS_VAR(compact_write_bytes),
9567     DEF_STATUS_VAR(number_superversion_acquires),
9568     DEF_STATUS_VAR(number_superversion_releases),
9569     DEF_STATUS_VAR(number_superversion_cleanups),
9570     DEF_STATUS_VAR(number_block_not_compressed),
9571     DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
9572                        &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
9573     DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
9574                        SHOW_LONGLONG),
9575     DEF_STATUS_VAR_PTR("number_stat_computes", &rocksdb_number_stat_computes,
9576                        SHOW_LONGLONG),
9577     DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
9578                        SHOW_LONGLONG),
9579     DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
9580                        SHOW_LONGLONG),
9581     DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
9582                        &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
9583     DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
9584                        SHOW_LONGLONG),
9585     DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
9586                        SHOW_LONGLONG),
9587     {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
9588     {NullS, NullS, SHOW_LONG}};
9589 
9590 /*
9591   Background thread's main logic
9592 */
9593 
9594 void Rdb_background_thread::run() {
9595   // How many seconds to wait till flushing the WAL next time.
9596   const int WAKE_UP_INTERVAL = 1;
9597 
9598   timespec ts_next_sync;
9599   clock_gettime(CLOCK_REALTIME, &ts_next_sync);
9600   ts_next_sync.tv_sec += WAKE_UP_INTERVAL;
9601 
9602   for (;;) {
9603     // Wait until the next timeout or until we receive a signal to stop the
9604     // thread. A request to stop the thread should only be triggered when the
9605     // storage engine is being unloaded.
9606     mysql_mutex_lock(&m_signal_mutex);
9607     const auto ret __attribute__((__unused__)) =
9608         mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);
9609 
9610     // Check that we receive only the expected error codes.
9611     DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
9612     const bool local_stop = m_stop;
9613     const bool local_save_stats = m_save_stats;
9614     reset();
9615     mysql_mutex_unlock(&m_signal_mutex);
9616 
9617     if (local_stop) {
9618       // If we're here, it's because the condition variable was signaled by
9619       // another thread and we're shutting down. Break out of the loop to make
9620       // sure that the shutdown thread can proceed.
9621       break;
9622     }
9623 
9624     // This path should be taken only when the timer expired.
9625     DBUG_ASSERT(ret == ETIMEDOUT);
9626 
9627     if (local_save_stats) {
9628       ddl_manager.persist_stats();
9629     }
9630 
9631     timespec ts;
9632     clock_gettime(CLOCK_REALTIME, &ts);
9633 
9634     // Flush the WAL.
9635     if (rdb && rocksdb_background_sync) {
9636       DBUG_ASSERT(!rocksdb_db_options.allow_mmap_writes);
9637       const rocksdb::Status s = rdb->SyncWAL();
9638       if (!s.ok()) {
9639         rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
9640       }
9641     }
9642 
9643     // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
9644     // pthread_cond_timedwait()) to wait on.
9645     ts_next_sync.tv_sec = ts.tv_sec + WAKE_UP_INTERVAL;
9646   }
9647 
9648   // Save any remaining stats that might have been left unsaved
9649   ddl_manager.persist_stats();
9650 }
9651 
9652 /**
9653   Decide whether it is possible to use the bloom filter.
9654 
9655   @detail
9656    Even if a bloom filter exists, it is not always possible
9657    to use it. Using the bloom filter when you shouldn't can cause
9658    false negatives -- fewer rows than expected may be returned.
9659    It is the user's responsibility to use the bloom filter correctly.
9660 
9661    If no bloom filter exists, the return value does not matter because
9662    RocksDB does not use a bloom filter internally.
9663 
9664   @param kd
9665   @param eq_cond      Equal condition part of the key. This always includes
9666                       system index id (4 bytes).
9667   @param use_all_keys True if all key parts are set with equal conditions.
9668                       This is aware of extended keys.
9669 */
9670 bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
9671                           const rocksdb::Slice &eq_cond,
9672                           const bool use_all_keys, bool is_ascending) {
9673   bool can_use = false;
9674 
9675   if (THDVAR(thd, skip_bloom_filter_on_read)) {
9676     return can_use;
9677   }
9678 
9679   const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
9680   if (prefix_extractor) {
9681     /*
9682       This is an optimized use case for CappedPrefixTransform.
9683       If the eq_cond length >= the prefix extractor length and
9684       all key parts are used for an equality lookup, it is
9685       always possible to use the bloom filter.
9686 
9687       A prefix bloom filter can't be used for a descending scan with
9688       prefix lookup (e.g. WHERE id1=1 ORDER BY id2 DESC) because of a
9689       RocksDB limitation. On an ascending (or unsorted) scan,
9690       keys longer than the capped prefix length are truncated down
9691       to the capped length and the resulting key is added to the bloom filter.
9692 
9693       Keys shorter than the capped prefix length are also added to
9694       the bloom filter. When keys are looked up, key conditions
9695       longer than the capped length can be used; key conditions
9696       shorter than that require all parts of the key to be available
9697       for the short-key match.
9698     */
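    // Illustrative example (hypothetical schema): with an index on (id1, id2)
    // and a capped prefix that covers only (index_id, id1), the query
    // "SELECT * FROM t1 WHERE id1 = 5 ORDER BY id2 ASC" can use the prefix
    // bloom filter, while the same query with "ORDER BY id2 DESC" cannot.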
9699     if (use_all_keys && prefix_extractor->InRange(eq_cond))
9700       can_use = true;
9701     else if (!is_ascending)
9702       can_use = false;
9703     else if (prefix_extractor->SameResultWhenAppended(eq_cond))
9704       can_use = true;
9705     else
9706       can_use = false;
9707   } else {
9708     /*
9709       if prefix extractor is not defined, all key parts have to be
9710       used by eq_cond.
9711     */
9712     if (use_all_keys)
9713       can_use = true;
9714     else
9715       can_use = false;
9716   }
9717 
9718   return can_use;
9719 }
9720 
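/*
  Illustrative sketch of the capped-prefix rules described above. This is not
  part of the build; the 4-byte cap and the literal keys are made-up values
  (a real eq_cond always starts with the 4-byte index number):

    #include "rocksdb/slice_transform.h"

    const rocksdb::SliceTransform *pe = rocksdb::NewCappedPrefixTransform(4);

    // InRange() is true when the condition is no longer than the cap; such a
    // short condition can use the bloom filter only when all key parts are
    // bound (the whole key is being looked up).
    rocksdb::Slice short_cond("abc");                            // 3 bytes
    bool ok_if_all_keys = pe->InRange(short_cond);               // true

    // SameResultWhenAppended() is true when the condition is at least as long
    // as the cap: its capped prefix is stable no matter what key bytes follow,
    // so an ascending prefix lookup can use the bloom filter.
    rocksdb::Slice long_cond("abcdefgh");                        // 8 bytes
    bool ok_for_prefix = pe->SameResultWhenAppended(long_cond);  // true

    delete pe;  // the caller owns the transform returned by the factory
*/
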
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return rocksdb_tbl_options;
}

int rdb_get_table_perf_counters(const char *const tablename,
                                Rdb_perf_counters *const counters) {
  DBUG_ASSERT(counters != nullptr);
  DBUG_ASSERT(tablename != nullptr);

  Rdb_table_handler *const table_handler =
      rdb_open_tables.get_table_handler(tablename);
  if (table_handler == nullptr) {
    return HA_ERR_INTERNAL_ERROR;
  }

  counters->load(table_handler->m_table_perf_context);

  rdb_open_tables.release_table_handler(table_handler);
  return HA_EXIT_SUCCESS;
}

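/*
  Illustrative caller sketch for rdb_get_table_perf_counters() (not part of
  the build). The table name is assumed here to be the same fully-qualified
  name the handler layer uses when opening the table:

    myrocks::Rdb_perf_counters counters;
    if (myrocks::rdb_get_table_perf_counters("./test/t1", &counters) ==
        HA_EXIT_SUCCESS) {
      // counters now holds a snapshot of the per-table perf context.
    }
*/
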
void rdb_handle_io_error(rocksdb::Status status, RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    switch (err_type) {
    case RDB_IO_ERROR_TX_COMMIT:
    case RDB_IO_ERROR_DICT_COMMIT: {
      sql_print_error("RocksDB: Failed to write to WAL - status %d, %s",
                      status.code(), status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on WAL write error.");
      abort_with_stack_traces();
      break;
    }
    case RDB_IO_ERROR_BG_THREAD: {
      sql_print_warning("RocksDB: BG Thread failed to write to RocksDB "
                        "- status %d, %s",
                        status.code(), status.ToString().c_str());
      break;
    }
    case RDB_IO_ERROR_GENERAL: {
      sql_print_error("RocksDB: Failed on I/O - status %d, %s", status.code(),
                      status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on I/O error.");
      abort_with_stack_traces();
      break;
    }
    default:
      DBUG_ASSERT(0);
      break;
    }
  } else if (status.IsCorruption()) {
    /* NO_LINT_DEBUG */
    sql_print_error("RocksDB: Data Corruption detected! %d, %s", status.code(),
                    status.ToString().c_str());
    /* NO_LINT_DEBUG */
    sql_print_error("RocksDB: Aborting because of data corruption.");
    abort_with_stack_traces();
  } else if (!status.ok()) {
    switch (err_type) {
    case RDB_IO_ERROR_DICT_COMMIT: {
      sql_print_error("RocksDB: Failed to write to WAL (dictionary) - "
                      "status %d, %s",
                      status.code(), status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on WAL write error.");
      abort_with_stack_traces();
      break;
    }
    default:
      sql_print_warning("RocksDB: Failed to read/write in RocksDB "
                        "- status %d, %s",
                        status.code(), status.ToString().c_str());
      break;
    }
  }
}

Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }

Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }

void rocksdb_set_compaction_options(my_core::THD *const thd
                                    __attribute__((__unused__)),
                                    my_core::st_mysql_sys_var *const var
                                    __attribute__((__unused__)),
                                    void *const var_ptr,
                                    const void *const save) {
  if (var_ptr && save) {
    *(uint64_t *)var_ptr = *(const uint64_t *)save;
  }
  const Rdb_compact_params params = {
      (uint64_t)rocksdb_compaction_sequential_deletes,
      (uint64_t)rocksdb_compaction_sequential_deletes_window,
      (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
  if (properties_collector_factory) {
    properties_collector_factory->SetCompactionParams(params);
  }
}

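/*
  For context, a hedged sketch of how a compaction-related system variable is
  typically registered with this callback as its update hook. The real
  declarations live earlier in this file; the description string and the
  default/min/max bounds below are placeholders, not the actual values:

    static MYSQL_SYSVAR_LONGLONG(
        compaction_sequential_deletes, rocksdb_compaction_sequential_deletes,
        PLUGIN_VAR_RQCMDARG,
        "Trigger compaction for a file with this many sequential deletes",
        nullptr,                          // no check function
        rocksdb_set_compaction_options,   // update callback defined above
        0, 0, 2000000, 0);                // placeholder default/min/max/blk
*/
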
void rocksdb_set_table_stats_sampling_pct(
    my_core::THD *const thd __attribute__((__unused__)),
    my_core::st_mysql_sys_var *const var __attribute__((__unused__)),
    void *const var_ptr __attribute__((__unused__)), const void *const save) {
  mysql_mutex_lock(&rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  if (new_val != rocksdb_table_stats_sampling_pct) {
    rocksdb_table_stats_sampling_pct = new_val;

    if (properties_collector_factory) {
      properties_collector_factory->SetTableStatsSamplingPct(
          rocksdb_table_stats_sampling_pct);
    }
  }

  mysql_mutex_unlock(&rdb_sysvars_mutex);
}

/*
  This function allows setting the rate limiter's bytes-per-second value,
  but only if the rate limiter was enabled at startup (it cannot be turned
  on dynamically). If the rate is already 0 (turned off), or we are changing
  it to 0 (trying to turn it off), this function pushes a warning to the
  client and does nothing.
  This is similar to the code in innodb_doublewrite_update (found in
  storage/innobase/handler/ha_innodb.cc).
*/
void rocksdb_set_rate_limiter_bytes_per_sec(my_core::THD *const thd,
                                            my_core::st_mysql_sys_var *const var
                                            __attribute__((__unused__)),
                                            void *const var_ptr
                                            __attribute__((__unused__)),
                                            const void *const save) {
  const uint64_t new_val = *static_cast<const uint64_t *>(save);
  if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
    /*
      If a rate limiter was not enabled at startup we cannot enable one now,
      nor can we disable one that was created at startup.
    */
    push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
                        "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
                        "be dynamically changed to or from 0.  Do a clean "
                        "shutdown if you want to change it from or to 0.");
  } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
    /* Apply the new value to the rate limiter and store it locally */
    DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
    rocksdb_rate_limiter_bytes_per_sec = new_val;
    rocksdb_rate_limiter->SetBytesPerSecond(new_val);
  }
}

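/*
  For reference, a minimal sketch of the startup-time wiring this callback
  depends on. The actual code lives in the plugin init function earlier in
  this file and may differ in detail:

    if (rocksdb_rate_limiter_bytes_per_sec != 0) {
      rocksdb_rate_limiter.reset(rocksdb::NewGenericRateLimiter(
          rocksdb_rate_limiter_bytes_per_sec));
      rocksdb_db_options.rate_limiter = rocksdb_rate_limiter;
    }
*/
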
void rdb_set_collation_exception_list(const char *const exception_list) {
  DBUG_ASSERT(rdb_collation_exceptions != nullptr);

  int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
  if (lower_case_table_names)
    flags |= MY_REG_ICASE;
  if (!rdb_collation_exceptions->compile(
          exception_list, flags, table_alias_charset)) {
    warn_about_bad_patterns(*rdb_collation_exceptions,
                            "strict_collation_exceptions");
  }
}

void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  rdb_set_collation_exception_list(val);

  *static_cast<const char **>(var_ptr) = val;
}

void rocksdb_set_bulk_load(THD *const thd, struct st_mysql_sys_var *const var
                           __attribute__((__unused__)),
                           void *const var_ptr, const void *const save) {
  Rdb_transaction *&tx = get_tx_from_thd(thd);

  if (tx != nullptr) {
    const int rc = tx->finish_bulk_load();
    if (rc != 0) {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Error %d finalizing last SST file while "
                      "setting bulk loading variable",
                      rc);
      abort_with_stack_traces();
    }
  }

  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);
}

static void rocksdb_set_max_background_compactions(
    THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {
  DBUG_ASSERT(save != nullptr);

  mysql_mutex_lock(&rdb_sysvars_mutex);
  rocksdb_db_options.max_background_compactions =
      *static_cast<const int *>(save);
  rocksdb_db_options.env->SetBackgroundThreads(
      rocksdb_db_options.max_background_compactions,
      rocksdb::Env::Priority::LOW);
  mysql_mutex_unlock(&rdb_sysvars_mutex);
}

void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }

void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}

void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}

void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}

void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}

/**
  @brief
  Determines whether Read Free Replication can be used. Returning true means
  Read Free Replication can be used. It is only applicable to UPDATE and
  DELETE row events, and the table must have a user-defined primary key.
*/
bool ha_rocksdb::use_read_free_rpl() {
  DBUG_ENTER_FUNC();

  DBUG_RETURN((m_in_rpl_delete_rows || m_in_rpl_update_rows) &&
              !has_hidden_pk(table) && m_use_read_free_rpl);
}

double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
  DBUG_ENTER_FUNC();

  if (index != table->s->primary_key) {
    /* Non-covering index range scan */
    DBUG_RETURN(handler::read_time(index, ranges, rows));
  }

  DBUG_RETURN((rows / 20.0) + 1);
}

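/*
  Example of the cost model above: a primary-key range scan expected to return
  200 rows is costed at 200 / 20.0 + 1 = 11, while the same estimate for a
  secondary index falls back to handler::read_time().
*/
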
} // namespace myrocks

/*
  Register the storage engine plugin outside of myrocks namespace
  so that mysql_declare_plugin does not get confused when it does
  its name generation.
*/

struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};

mysql_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
    &rocksdb_storage_engine,           /* Plugin Descriptor */
    "ROCKSDB",                         /* Plugin Name */
    "Monty Program Ab",                /* Plugin Author */
    "RocksDB storage engine",          /* Plugin Description */
    PLUGIN_LICENSE_GPL,                /* Plugin Licence */
    myrocks::rocksdb_init_func,        /* Plugin Entry Point */
    myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
    0x0001,                            /* version number (0.1) */
    myrocks::rocksdb_status_vars,      /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    nullptr,                           /* config options */
    0,                                 /* flags */
},
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info,
    myrocks::rdb_i_s_trx_info mysql_declare_plugin_end;