1 /*
2 Copyright (c) 2012, Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
17 #ifdef USE_PRAGMA_IMPLEMENTATION
18 #pragma implementation // gcc: Class implementation
19 #endif
20
21 #define MYSQL_SERVER 1
22
23 /* The C++ file's header */
24 #include "./ha_rocksdb.h"
25
26 /* C++ standard header files */
27 #include <algorithm>
28 #include <queue>
29 #include <set>
30 #include <string>
31 #include <vector>
32
33 /* MySQL includes */
34 #include "./debug_sync.h"
35 #include "./my_bit.h"
36 #include "./my_stacktrace.h"
37 #include "./sql_audit.h"
38 #include "./sql_table.h"
39 #include <mysql/psi/mysql_table.h>
40 #include <mysql/thread_pool_priv.h>
41 #include <mysys_err.h>
42
43 /* RocksDB includes */
44 #include "rocksdb/compaction_filter.h"
45 #include "rocksdb/persistent_cache.h"
46 #include "rocksdb/rate_limiter.h"
47 #include "rocksdb/slice_transform.h"
48 #include "rocksdb/utilities/checkpoint.h"
49 #include "rocksdb/utilities/convenience.h"
50 #include "rocksdb/utilities/memory_util.h"
51
52 /* MyRocks includes */
53 #include "./event_listener.h"
54 #include "./ha_rocksdb_proto.h"
55 #include "./logger.h"
56 #include "./rdb_cf_manager.h"
57 #include "./rdb_cf_options.h"
58 #include "./rdb_datadic.h"
59 #include "./rdb_i_s.h"
60 #include "./rdb_index_merge.h"
61 #include "./rdb_mutex_wrapper.h"
62 #include "./rdb_threads.h"
63
// Internal MySQL APIs not exposed in any header.
extern "C" {
/**
  Mark transaction to rollback and mark error as fatal to a sub-statement.
  @param  thd   Thread handle
  @param  all   TRUE <=> rollback main transaction.
*/
void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);

/**
 * Get the user thread's binary logging format
 * @param thd  user thread
 * @return Value to be used as index into the binlog_format_names array
 */
int thd_binlog_format(const MYSQL_THD thd);

/**
 * Check if binary logging is filtered for thread's current db.
 * @param  thd   Thread handle
 * @retval 1 the query is not filtered, 0 otherwise.
 */
bool thd_binlog_filter_ok(const MYSQL_THD thd);
}
87
88 namespace myrocks {
89
// Process-wide row/operation counters, incremented from user threads
// (see ha_rocksdb::update_row_stats below) and exported for monitoring.
static st_global_stats global_stats;
static st_export_stats export_stats;
92
93 /**
94 Updates row counters based on the table type and operation type.
95 */
update_row_stats(const operation_type & type)96 void ha_rocksdb::update_row_stats(const operation_type &type) {
97 DBUG_ASSERT(type < ROWS_MAX);
98 // Find if we are modifying system databases.
99 if (table->s && m_tbl_def->m_is_mysql_system_table)
100 global_stats.system_rows[type].inc();
101 else
102 global_stats.rows[type].inc();
103 }
104
// Forward declarations of helpers defined later in this file.
void dbug_dump_database(rocksdb::DB *db);
static handler *rocksdb_create_handler(my_core::handlerton *hton,
                                       my_core::TABLE_SHARE *table_arg,
                                       my_core::MEM_ROOT *mem_root);

bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
                          const rocksdb::Slice &eq_cond,
                          const bool use_all_keys, bool is_ascending);

///////////////////////////////////////////////////////////
// Parameters and settings
///////////////////////////////////////////////////////////
// Raw string values of the default/override column-family option sysvars;
// parsed into rocksdb_cf_options_map.
static char *rocksdb_default_cf_options;
static char *rocksdb_override_cf_options;
Rdb_cf_options rocksdb_cf_options_map;

///////////////////////////////////////////////////////////
// Globals
///////////////////////////////////////////////////////////
// The handlerton instance registered for the ROCKSDB storage engine.
handlerton *rocksdb_hton;

// The single process-wide RocksDB database handle; nullptr until the
// plugin has been initialized.
rocksdb::TransactionDB *rdb = nullptr;

static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;

// Managers for the on-disk data dictionary, column families and DDL metadata.
Rdb_dict_manager dict_manager;
Rdb_cf_manager cf_manager;
Rdb_ddl_manager ddl_manager;

/**
  MyRocks background thread control
  N.B. This is besides RocksDB's own background threads
       (@see rocksdb::CancelAllBackgroundWork())
*/

static Rdb_background_thread rdb_bg_thread;

// List of table names (using regex) that are exceptions to the strict
// collation check requirement.
Regex *rdb_collation_exceptions;

// Error text returned to the client when a statement is attempted inside a
// transaction that has already been marked rollback-only.
static const char *const ERRSTR_ROLLBACK_ONLY =
    "This transaction was rolled back and cannot be "
    "committed. Only supported operation is to roll it back, "
    "so all pending changes will be discarded. "
    "Please restart another transaction.";
152
rocksdb_flush_all_memtables()153 static void rocksdb_flush_all_memtables() {
154 const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
155 for (const auto &cf_handle : cf_manager.get_all_cf()) {
156 rdb->Flush(rocksdb::FlushOptions(), cf_handle);
157 }
158 }
159
rocksdb_compact_column_family_stub(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)160 static void rocksdb_compact_column_family_stub(
161 THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
162 const void *const save) {}
163
rocksdb_compact_column_family(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,struct st_mysql_value * const value)164 static int rocksdb_compact_column_family(THD *const thd,
165 struct st_mysql_sys_var *const var,
166 void *const var_ptr,
167 struct st_mysql_value *const value) {
168 char buff[STRING_BUFFER_USUAL_SIZE];
169 int len = sizeof(buff);
170
171 DBUG_ASSERT(value != nullptr);
172
173 if (const char *const cf = value->val_str(value, buff, &len)) {
174 bool is_automatic;
175 auto cfh = cf_manager.get_cf(cf, "", nullptr, &is_automatic);
176 if (cfh != nullptr && rdb != nullptr) {
177 sql_print_information("RocksDB: Manual compaction of column family: %s\n",
178 cf);
179 rdb->CompactRange(rocksdb::CompactRangeOptions(), cfh, nullptr, nullptr);
180 }
181 }
182 return HA_EXIT_SUCCESS;
183 }
184
185 ///////////////////////////////////////////////////////////
186 // Hash map: table name => open table handler
187 ///////////////////////////////////////////////////////////
188
189 namespace // anonymous namespace = not visible outside this source file
190 {
191
192 const ulong TABLE_HASH_SIZE = 32;
193
194 struct Rdb_open_tables_map {
195 /* Hash table used to track the handlers of open tables */
196 my_core::HASH m_hash;
197 /* The mutex used to protect the hash table */
198 mutable mysql_mutex_t m_mutex;
199
init_hashmyrocks::__anoncf2112d80111::Rdb_open_tables_map200 void init_hash(void) {
201 (void)my_hash_init(&m_hash, my_core::system_charset_info, TABLE_HASH_SIZE,
202 0, 0, (my_hash_get_key)Rdb_open_tables_map::get_hash_key,
203 0, 0);
204 }
205
free_hashmyrocks::__anoncf2112d80111::Rdb_open_tables_map206 void free_hash(void) { my_hash_free(&m_hash); }
207
208 static uchar *get_hash_key(Rdb_table_handler *const table_handler,
209 size_t *const length,
210 my_bool not_used __attribute__((__unused__)));
211
212 Rdb_table_handler *get_table_handler(const char *const table_name);
213 void release_table_handler(Rdb_table_handler *const table_handler);
214
215 std::vector<std::string> get_table_names(void) const;
216 };
217
218 } // anonymous namespace
219
// Single process-wide instance of the open-tables registry.
static Rdb_open_tables_map rdb_open_tables;
221
/*
  Strip any trailing '/' characters from a directory path.
  @param dir  the path (taken by value; the trimmed copy is returned)
  @return the path without trailing slashes; may be empty (e.g. for "///")
*/
static std::string rdb_normalize_dir(std::string dir) {
  const std::string::size_type last_keep = dir.find_last_not_of('/');
  dir.erase(last_keep == std::string::npos ? 0 : last_keep + 1);
  return dir;
}
228
/*
  Validation callback for the create-checkpoint sysvar: setting it to a
  directory path creates a RocksDB checkpoint (a consistent snapshot of the
  database) in that directory.
  @param value  the target directory path supplied by SET
  @return 0 on success, a non-zero rocksdb::Status code on checkpoint
          failure, or HA_ERR_INTERNAL_ERROR if no path was given or the DB
          is not open.
*/
static int rocksdb_create_checkpoint(THD *const thd __attribute__((__unused__)),
                                     struct st_mysql_sys_var *const var
                                     __attribute__((__unused__)),
                                     void *const save
                                     __attribute__((__unused__)),
                                     struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      // Strip trailing slashes before handing the path to RocksDB.
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        if (status.ok()) {
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
        } else {
          my_printf_error(
              ER_UNKNOWN_ERROR,
              "RocksDB: Failed to create checkpoint directory. status %d %s",
              MYF(0), status.code(), status.ToString().c_str());
        }
        // The Checkpoint object is only allocated when Create() succeeded,
        // so it is deleted on this branch only.
        delete checkpoint;
      } else {
        const std::string err_text(status.ToString());
        my_printf_error(
            ER_UNKNOWN_ERROR,
            "RocksDB: failed to initialize checkpoint. status %d %s\n", MYF(0),
            status.code(), err_text.c_str());
      }
      // status.code() is 0 (kOk) on full success.
      return status.code();
    }
  }
  // No directory supplied, or the DB has not been opened yet.
  return HA_ERR_INTERNAL_ERROR;
}
271
272 /* This method is needed to indicate that the
273 ROCKSDB_CREATE_CHECKPOINT command is not read-only */
rocksdb_create_checkpoint_stub(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)274 static void rocksdb_create_checkpoint_stub(THD *const thd,
275 struct st_mysql_sys_var *const var,
276 void *const var_ptr,
277 const void *const save) {}
278
rocksdb_force_flush_memtable_now_stub(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)279 static void rocksdb_force_flush_memtable_now_stub(
280 THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
281 const void *const save) {}
282
rocksdb_force_flush_memtable_now(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,struct st_mysql_value * const value)283 static int rocksdb_force_flush_memtable_now(
284 THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
285 struct st_mysql_value *const value) {
286 sql_print_information("RocksDB: Manual memtable flush\n");
287 rocksdb_flush_all_memtables();
288 return HA_EXIT_SUCCESS;
289 }
290
// Update hook that wakes the drop-index background thread (defined later in
// this file).
static void rocksdb_drop_index_wakeup_thread(
    my_core::THD *const thd __attribute__((__unused__)),
    struct st_mysql_sys_var *const var __attribute__((__unused__)),
    void *const var_ptr __attribute__((__unused__)), const void *const save);

// Current pause/resume state of RocksDB background work (mirrors the last
// value applied through rocksdb_set_pause_background_work).
static my_bool rocksdb_pause_background_work = 0;
// Serializes sysvar update callbacks that touch global RocksDB state.
static mysql_mutex_t rdb_sysvars_mutex;
298
rocksdb_set_pause_background_work(my_core::THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)299 static void rocksdb_set_pause_background_work(
300 my_core::THD *const thd __attribute__((__unused__)),
301 struct st_mysql_sys_var *const var __attribute__((__unused__)),
302 void *const var_ptr __attribute__((__unused__)), const void *const save) {
303 mysql_mutex_lock(&rdb_sysvars_mutex);
304 const bool pause_requested = *static_cast<const bool *>(save);
305 if (rocksdb_pause_background_work != pause_requested) {
306 if (pause_requested) {
307 rdb->PauseBackgroundWork();
308 } else {
309 rdb->ContinueBackgroundWork();
310 }
311 rocksdb_pause_background_work = pause_requested;
312 }
313 mysql_mutex_unlock(&rdb_sysvars_mutex);
314 }
315
// Forward declarations of sysvar update/validation callbacks implemented
// later in this file.
static void rocksdb_set_compaction_options(THD *thd,
                                           struct st_mysql_sys_var *var,
                                           void *var_ptr, const void *save);

static void rocksdb_set_table_stats_sampling_pct(THD *thd,
                                                 struct st_mysql_sys_var *var,
                                                 void *var_ptr,
                                                 const void *save);

static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
                                                   struct st_mysql_sys_var *var,
                                                   void *var_ptr,
                                                   const void *save);

static void rdb_set_collation_exception_list(const char *exception_list);
static void rocksdb_set_collation_exception_list(THD *thd,
                                                 struct st_mysql_sys_var *var,
                                                 void *var_ptr,
                                                 const void *save);

static void rocksdb_set_bulk_load(THD *thd, struct st_mysql_sys_var *var
                                  __attribute__((__unused__)),
                                  void *var_ptr, const void *save);

static void rocksdb_set_max_background_compactions(
    THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save);
//////////////////////////////////////////////////////////////////////////////
// Options definitions
//////////////////////////////////////////////////////////////////////////////
// Backing storage for the MYSQL_SYSVAR definitions that follow.
static long long rocksdb_block_cache_size;
/* Use unsigned long long instead of uint64_t because of MySQL compatibility */
static unsigned long long // NOLINT(runtime/int)
    rocksdb_rate_limiter_bytes_per_sec;
static unsigned long // NOLINT(runtime/int)
    rocksdb_persistent_cache_size;
static uint64_t rocksdb_info_log_level;
static char *rocksdb_wal_dir;
static char *rocksdb_persistent_cache_path;
static uint64_t rocksdb_index_type;
static char rocksdb_background_sync;
static uint32_t rocksdb_debug_optimizer_n_rows;
static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
static uint32_t rocksdb_wal_recovery_mode;
static uint32_t rocksdb_access_hint_on_compaction_start;
static char *rocksdb_compact_cf_name;
static char *rocksdb_checkpoint_name;
static my_bool rocksdb_signal_drop_index_thread;
static my_bool rocksdb_strict_collation_check = 1;
static my_bool rocksdb_enable_2pc = 0;
static char *rocksdb_strict_collation_exceptions;
static my_bool rocksdb_collect_sst_properties = 1;
static my_bool rocksdb_force_flush_memtable_now_var = 0;
static uint64_t rocksdb_number_stat_computes = 0;
static uint32_t rocksdb_seconds_between_stat_computes = 3600;
static long long rocksdb_compaction_sequential_deletes = 0l;
static long long rocksdb_compaction_sequential_deletes_window = 0l;
static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
static uint32_t rocksdb_validate_tables = 1;
static char *rocksdb_datadir;
static uint32_t rocksdb_table_stats_sampling_pct;
static my_bool rocksdb_enable_bulk_load_api = 1;
static my_bool rpl_skip_tx_api_var = 0;
static my_bool rocksdb_print_snapshot_conflict_queries = 0;

// Status counters updated concurrently from many user threads, hence atomic.
std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
383
rdb_init_rocksdb_db_options(void)384 static rocksdb::DBOptions rdb_init_rocksdb_db_options(void) {
385 rocksdb::DBOptions o;
386
387 o.create_if_missing = true;
388 o.listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
389 o.info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
390 o.max_subcompactions = DEFAULT_SUBCOMPACTIONS;
391
392 return o;
393 }
394
// Process-wide DB and block-based-table options; rocksdb_db_options is
// seeded with MyRocks defaults and then overridden by the sysvars below.
static rocksdb::DBOptions rocksdb_db_options = rdb_init_rocksdb_db_options();
static rocksdb::BlockBasedTableOptions rocksdb_tbl_options;

static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;

/* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
static const char *info_log_level_names[] = {"debug_level", "info_level",
                                             "warn_level", "error_level",
                                             "fatal_level", NullS};

static TYPELIB info_log_level_typelib = {
    array_elements(info_log_level_names) - 1, "info_log_level_typelib",
    info_log_level_names, nullptr};
408
rocksdb_set_rocksdb_info_log_level(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)409 static void rocksdb_set_rocksdb_info_log_level(
410 THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
411 const void *const save) {
412 DBUG_ASSERT(save != nullptr);
413
414 mysql_mutex_lock(&rdb_sysvars_mutex);
415 rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
416 rocksdb_db_options.info_log->SetInfoLogLevel(
417 static_cast<const rocksdb::InfoLogLevel>(rocksdb_info_log_level));
418 mysql_mutex_unlock(&rdb_sysvars_mutex);
419 }
420
// Value names for the rocksdb_index_type enum sysvar.
static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};

static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
                                     "index_type_typelib", index_type_names,
                                     nullptr};

// Limits and defaults used by the sysvar definitions below.
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024;
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;
438
// Session-scoped (THDVAR) and global (SYSVAR) variable definitions.
// TODO: 0 means don't wait at all, and we don't support it yet?
static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
                          "Number of seconds to wait for lock", nullptr,
                          nullptr, /*default*/ 1, /*min*/ 1,
                          /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);

static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
                         "Enables deadlock detection", nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    trace_sst_api, PLUGIN_VAR_RQCMDARG,
    "Generate trace output in the log for each call to the SstFileWriter",
    nullptr, nullptr, FALSE);

// Uses an update hook (rocksdb_set_bulk_load) so toggling the variable can
// adjust the session state.
static MYSQL_THDVAR_BOOL(
    bulk_load, PLUGIN_VAR_RQCMDARG,
    "Use bulk-load mode for inserts. This disables "
    "unique_checks and enables rocksdb_commit_in_the_middle.",
    nullptr, rocksdb_set_bulk_load, FALSE);

static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables using SstFileWriter for bulk loading",
                         nullptr, nullptr, rocksdb_enable_bulk_load_api);

static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Directory for temporary files during DDL operations.",
                        nullptr, nullptr, "");

static MYSQL_THDVAR_BOOL(
    commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
    "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
    "update and delete",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_STR(
    read_free_rpl_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Regex that describes set of tables that will use read-free replication "
    "on the slave (i.e. not lookup a row during replication)",
    nullptr, nullptr, "");

static MYSQL_SYSVAR_BOOL(
    rpl_skip_tx_api, rpl_skip_tx_api_var, PLUGIN_VAR_RQCMDARG,
    "Use write batches for replication thread instead of tx api", nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
                         "Skip using bloom filter for reads", nullptr, nullptr,
                         FALSE);

static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have",
                          nullptr, nullptr,
                          /*default*/ RDB_MAX_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated", nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode", nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation.",
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of "
    "external sort during fast index creation.",
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);
526
// Global sysvars that map directly onto rocksdb::DBOptions fields; the
// PLUGIN_VAR_READONLY ones can only be set at server startup.
//
// NOTE(review): the *reinterpret_cast<my_bool *>(&bool_field) pattern below
// assumes my_bool and C++ bool share size and representation — TODO confirm
// this holds on all supported platforms/compilers.
static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
    rocksdb_db_options.create_if_missing);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
    rocksdb_db_options.create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
    rocksdb_db_options.error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
    rocksdb_db_options.paranoid_checks);

static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ENUM(
    info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
    "Filter level for info logs to be written mysqld error log. "
    "Valid values include 'debug_level', 'info_level', 'warn_level'"
    "'error_level' and 'fatal_level'.",
    nullptr, rocksdb_set_rocksdb_info_log_level,
    rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);

static MYSQL_THDVAR_INT(
    perf_context_level, PLUGIN_VAR_RQCMDARG,
    "Perf Context Level for rocksdb internal timer stat collection", nullptr,
    nullptr,
    /* default */ rocksdb::PerfLevel::kUninitialized,
    /* min */ rocksdb::PerfLevel::kUninitialized,
    /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

static MYSQL_SYSVAR_UINT(
    wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
    "DBOptions::wal_recovery_mode for RocksDB", nullptr, nullptr,
    /* default */ (uint)rocksdb::WALRecoveryMode::kPointInTimeRecovery,
    /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
    /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);

static MYSQL_SYSVAR_ULONG(compaction_readahead_size,
                          rocksdb_db_options.compaction_readahead_size,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::compaction_readahead_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.compaction_readahead_size,
                          /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    new_table_reader_for_compaction_inputs,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.new_table_reader_for_compaction_inputs),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
    nullptr, rocksdb_db_options.new_table_reader_for_compaction_inputs);

static MYSQL_SYSVAR_UINT(
    access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
    /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
    /* min */ (uint)rocksdb::Options::AccessHint::NONE,
    /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);

static MYSQL_SYSVAR_BOOL(
    allow_concurrent_memtable_write,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.allow_concurrent_memtable_write),
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(
    enable_write_thread_adaptive_yield,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options.enable_write_thread_adaptive_yield),
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
    nullptr, false);

static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options.max_open_files,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_open_files for RocksDB", nullptr,
                        nullptr, rocksdb_db_options.max_open_files,
                        /* min */ -1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_total_wal_size,
                          rocksdb_db_options.max_total_wal_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_total_wal_size for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.max_total_wal_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    disabledatasync,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.disableDataSync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::disableDataSync for RocksDB", nullptr, nullptr,
    rocksdb_db_options.disableDataSync);

static MYSQL_SYSVAR_BOOL(
    use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_fsync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_fsync);

static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
                        rocksdb_db_options.wal_dir.c_str());

static MYSQL_SYSVAR_STR(
    persistent_cache_path, rocksdb_persistent_cache_path,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
    nullptr, "");

static MYSQL_SYSVAR_ULONG(
    persistent_cache_size, rocksdb_persistent_cache_size,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Size of cache for BlockBasedTableOptions::persistent_cache for RocksDB",
    nullptr, nullptr, rocksdb_persistent_cache_size,
    /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(
    delete_obsolete_files_period_micros,
    rocksdb_db_options.delete_obsolete_files_period_micros,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
    nullptr, rocksdb_db_options.delete_obsolete_files_period_micros,
    /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(base_background_compactions,
                        rocksdb_db_options.base_background_compactions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::base_background_compactions for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options.base_background_compactions,
                        /* min */ -1, /* max */ MAX_BACKGROUND_COMPACTIONS, 0);

// Writable at runtime: changes are applied through
// rocksdb_set_max_background_compactions.
static MYSQL_SYSVAR_INT(max_background_compactions,
                        rocksdb_db_options.max_background_compactions,
                        PLUGIN_VAR_RQCMDARG,
                        "DBOptions::max_background_compactions for RocksDB",
                        nullptr, rocksdb_set_max_background_compactions,
                        rocksdb_db_options.max_background_compactions,
                        /* min */ 1, /* max */ MAX_BACKGROUND_COMPACTIONS, 0);

static MYSQL_SYSVAR_INT(max_background_flushes,
                        rocksdb_db_options.max_background_flushes,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_background_flushes for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options.max_background_flushes,
                        /* min */ 1, /* max */ MAX_BACKGROUND_FLUSHES, 0);

static MYSQL_SYSVAR_UINT(max_subcompactions,
                         rocksdb_db_options.max_subcompactions,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::max_subcompactions for RocksDB", nullptr,
                         nullptr, rocksdb_db_options.max_subcompactions,
                         /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

static MYSQL_SYSVAR_ULONG(max_log_file_size,
                          rocksdb_db_options.max_log_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_log_file_size for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.max_log_file_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(log_file_time_to_roll,
                          rocksdb_db_options.log_file_time_to_roll,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::log_file_time_to_roll for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.log_file_time_to_roll,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(keep_log_file_num,
                          rocksdb_db_options.keep_log_file_num,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::keep_log_file_num for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.keep_log_file_num,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_manifest_file_size,
                          rocksdb_db_options.max_manifest_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_manifest_file_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.max_manifest_file_size,
                          /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options.table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options.table_cache_numshardbits,
                        /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_ttl_seconds, rocksdb_db_options.WAL_ttl_seconds,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.WAL_ttl_seconds,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_size_limit_mb,
                          rocksdb_db_options.WAL_size_limit_MB,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.WAL_size_limit_MB,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(manifest_preallocation_size,
                          rocksdb_db_options.manifest_preallocation_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::manifest_preallocation_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.manifest_preallocation_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_direct_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_direct_writes);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options.allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options.allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
    rocksdb_db_options.is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options.stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB",
                         nullptr, nullptr,
                         rocksdb_db_options.stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
    rocksdb_db_options.advise_random_on_open);

static MYSQL_SYSVAR_ULONG(db_write_buffer_size,
                          rocksdb_db_options.db_write_buffer_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::db_write_buffer_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options.db_write_buffer_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options.use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
    rocksdb_db_options.use_adaptive_mutex);

static MYSQL_SYSVAR_ULONG(bytes_per_sync, rocksdb_db_options.bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::bytes_per_sync for RocksDB", nullptr,
                          nullptr, rocksdb_db_options.bytes_per_sync,
                          /* min */ 0L, /* max */ LONG_MAX, 0);
837
838 static MYSQL_SYSVAR_ULONG(wal_bytes_per_sync,
839 rocksdb_db_options.wal_bytes_per_sync,
840 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
841 "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
842 nullptr, rocksdb_db_options.wal_bytes_per_sync,
843 /* min */ 0L, /* max */ LONG_MAX, 0);
844
845 static MYSQL_SYSVAR_BOOL(
846 enable_thread_tracking,
847 *reinterpret_cast<my_bool *>(&rocksdb_db_options.enable_thread_tracking),
848 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
849 "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr,
850 rocksdb_db_options.enable_thread_tracking);
851
852 static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
853 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
854 "block_cache size for RocksDB", nullptr, nullptr,
855 /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
856 /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
857 /* max */ LONGLONG_MAX,
858 /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);
859
860 static MYSQL_SYSVAR_BOOL(
861 cache_index_and_filter_blocks,
862 *reinterpret_cast<my_bool *>(
863 &rocksdb_tbl_options.cache_index_and_filter_blocks),
864 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
865 "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
866 nullptr, nullptr, true);
867
// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index block's handle checked
// out (=won't call ShardedLRUCache::Release), plus the parsed out objects
// the LRU cache will never flush out, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options.pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);
883
/* BlockBasedTableOptions tunables (startup-only). */
static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB",
                         nullptr, nullptr,
                         (uint64_t)rocksdb_tbl_options.index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options.hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options.hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options.no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options.no_block_cache);

static MYSQL_SYSVAR_ULONG(block_size, rocksdb_tbl_options.block_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "BlockBasedTableOptions::block_size for RocksDB",
                          nullptr, nullptr, rocksdb_tbl_options.block_size,
                          /* min */ 1L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options.block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options.block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options.block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options.block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options.whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options.whole_key_filtering);

/* Column-family option strings, parsed at startup. */
static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");

static MYSQL_SYSVAR_BOOL(background_sync, rocksdb_background_sync,
                         PLUGIN_VAR_RQCMDARG,
                         "turns on background syncs for RocksDB", nullptr,
                         nullptr, FALSE);

/* Per-session write behavior; defaults mirror rocksdb::WriteOptions. */
static MYSQL_THDVAR_BOOL(write_sync, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::sync for RocksDB", nullptr, nullptr,
                         rocksdb::WriteOptions().sync);

static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB", nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests", nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency",
    nullptr, nullptr, FALSE);

/* Testing/debugging overrides for optimizer row estimates. */
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used.",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable",
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value", nullptr,
    nullptr, TRUE);

/*
  "Action" variables: setting them triggers the update callback (compaction,
  checkpoint creation, thread wakeup, ...) rather than storing a value.
*/
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family",
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread", nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations", nullptr,
                         rocksdb_set_pause_background_work, FALSE);

static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks", nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Regex that describes set of tables that are excluded "
                        "from the case sensitive collation enforcement",
                        nullptr, rocksdb_set_collation_exception_list, "");

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush",
                         nullptr, nullptr, rocksdb_collect_sst_properties);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful",
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);
1040
1041 static MYSQL_THDVAR_BOOL(
1042 flush_memtable_on_analyze, PLUGIN_VAR_RQCMDARG,
1043 "Forces memtable flush on ANALZYE table to get accurate cardinality",
1044 nullptr, nullptr, true);
1045
/* Optimizer-statistics refresh cadence. */
static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed.",
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

/*
  Sequential-delete triggered compaction knobs; changes are pushed into the
  running server through rocksdb_set_compaction_options().
*/
static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window",
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes",
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes", nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log", nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

/* Debug row checksums (session scope). */
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed",
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records",
                         nullptr, nullptr, false /* default value */);
1104
1105 static MYSQL_SYSVAR_UINT(
1106 validate_tables, rocksdb_validate_tables,
1107 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1108 "Verify all .frm files match all RocksDB tables (0 means no verification, "
1109 "1 means verify and fail on error, and 2 means verify but continue",
1110 nullptr, nullptr, 1 /* default value */, 0 /* min value */,
1111 2 /* max value */, 0);
1112
/* Location of the RocksDB data files, relative to the server datadir. */
static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./.rocksdb");
1117
1118 static MYSQL_SYSVAR_UINT(
1119 table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
1120 PLUGIN_VAR_RQCMDARG,
1121 "Percentage of entries to sample when collecting statistics about table "
1122 "properties. Specify either 0 to sample everything or percentage "
1123 "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG(
1124 RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
1125 "By default " STRINGIFY_ARG(
1126 RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% "
1127 "of"
1128 " e"
1129 "nt"
1130 "ri"
1131 "es"
1132 " a"
1133 "re"
1134 " "
1135 "sa"
1136 "mp"
1137 "le"
1138 "d"
1139 ".",
1140 nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
1141 RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
1142 /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);
1143
// Assumed on-disk size, in bytes, of one key/value pair (used at call sites
// outside this chunk for size/cost estimates — confirm at use sites).
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
1145
/*
  Registration table handed to the plugin framework. Every MYSQL_SYSVAR /
  MYSQL_THDVAR defined above must be listed here, and the array must stay
  nullptr-terminated.
*/
static struct st_mysql_sys_var *rocksdb_system_variables[] = {
    MYSQL_SYSVAR(lock_wait_timeout),
    MYSQL_SYSVAR(deadlock_detect),
    MYSQL_SYSVAR(max_row_locks),
    MYSQL_SYSVAR(lock_scanned_rows),
    MYSQL_SYSVAR(bulk_load),
    MYSQL_SYSVAR(trace_sst_api),
    MYSQL_SYSVAR(commit_in_the_middle),
    MYSQL_SYSVAR(read_free_rpl_tables),
    MYSQL_SYSVAR(rpl_skip_tx_api),
    MYSQL_SYSVAR(bulk_load_size),
    MYSQL_SYSVAR(merge_buf_size),
    MYSQL_SYSVAR(enable_bulk_load_api),
    MYSQL_SYSVAR(tmpdir),
    MYSQL_SYSVAR(merge_combine_read_size),
    MYSQL_SYSVAR(skip_bloom_filter_on_read),

    /* DBOptions passthroughs */
    MYSQL_SYSVAR(create_if_missing),
    MYSQL_SYSVAR(create_missing_column_families),
    MYSQL_SYSVAR(error_if_exists),
    MYSQL_SYSVAR(paranoid_checks),
    MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
    MYSQL_SYSVAR(info_log_level),
    MYSQL_SYSVAR(max_open_files),
    MYSQL_SYSVAR(max_total_wal_size),
    MYSQL_SYSVAR(disabledatasync),
    MYSQL_SYSVAR(use_fsync),
    MYSQL_SYSVAR(wal_dir),
    MYSQL_SYSVAR(persistent_cache_path),
    MYSQL_SYSVAR(persistent_cache_size),
    MYSQL_SYSVAR(delete_obsolete_files_period_micros),
    MYSQL_SYSVAR(base_background_compactions),
    MYSQL_SYSVAR(max_background_compactions),
    MYSQL_SYSVAR(max_background_flushes),
    MYSQL_SYSVAR(max_log_file_size),
    MYSQL_SYSVAR(max_subcompactions),
    MYSQL_SYSVAR(log_file_time_to_roll),
    MYSQL_SYSVAR(keep_log_file_num),
    MYSQL_SYSVAR(max_manifest_file_size),
    MYSQL_SYSVAR(table_cache_numshardbits),
    MYSQL_SYSVAR(wal_ttl_seconds),
    MYSQL_SYSVAR(wal_size_limit_mb),
    MYSQL_SYSVAR(manifest_preallocation_size),
    MYSQL_SYSVAR(use_direct_reads),
    MYSQL_SYSVAR(use_direct_writes),
    MYSQL_SYSVAR(allow_mmap_reads),
    MYSQL_SYSVAR(allow_mmap_writes),
    MYSQL_SYSVAR(is_fd_close_on_exec),
    MYSQL_SYSVAR(stats_dump_period_sec),
    MYSQL_SYSVAR(advise_random_on_open),
    MYSQL_SYSVAR(db_write_buffer_size),
    MYSQL_SYSVAR(use_adaptive_mutex),
    MYSQL_SYSVAR(bytes_per_sync),
    MYSQL_SYSVAR(wal_bytes_per_sync),
    MYSQL_SYSVAR(enable_thread_tracking),
    MYSQL_SYSVAR(perf_context_level),
    MYSQL_SYSVAR(wal_recovery_mode),
    MYSQL_SYSVAR(access_hint_on_compaction_start),
    MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
    MYSQL_SYSVAR(compaction_readahead_size),
    MYSQL_SYSVAR(allow_concurrent_memtable_write),
    MYSQL_SYSVAR(enable_write_thread_adaptive_yield),

    /* BlockBasedTableOptions passthroughs */
    MYSQL_SYSVAR(block_cache_size),
    MYSQL_SYSVAR(cache_index_and_filter_blocks),
    MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
    MYSQL_SYSVAR(index_type),
    MYSQL_SYSVAR(hash_index_allow_collision),
    MYSQL_SYSVAR(no_block_cache),
    MYSQL_SYSVAR(block_size),
    MYSQL_SYSVAR(block_size_deviation),
    MYSQL_SYSVAR(block_restart_interval),
    MYSQL_SYSVAR(whole_key_filtering),

    MYSQL_SYSVAR(default_cf_options),
    MYSQL_SYSVAR(override_cf_options),

    MYSQL_SYSVAR(background_sync),

    MYSQL_SYSVAR(write_sync),
    MYSQL_SYSVAR(write_disable_wal),
    MYSQL_SYSVAR(write_ignore_missing_column_families),

    MYSQL_SYSVAR(skip_fill_cache),
    MYSQL_SYSVAR(unsafe_for_binlog),

    MYSQL_SYSVAR(records_in_range),
    MYSQL_SYSVAR(force_index_records_in_range),
    MYSQL_SYSVAR(debug_optimizer_n_rows),
    MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),

    MYSQL_SYSVAR(compact_cf),
    MYSQL_SYSVAR(signal_drop_index_thread),
    MYSQL_SYSVAR(pause_background_work),
    MYSQL_SYSVAR(enable_2pc),
    MYSQL_SYSVAR(strict_collation_check),
    MYSQL_SYSVAR(strict_collation_exceptions),
    MYSQL_SYSVAR(collect_sst_properties),
    MYSQL_SYSVAR(force_flush_memtable_now),
    MYSQL_SYSVAR(flush_memtable_on_analyze),
    MYSQL_SYSVAR(seconds_between_stat_computes),

    MYSQL_SYSVAR(compaction_sequential_deletes),
    MYSQL_SYSVAR(compaction_sequential_deletes_window),
    MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
    MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
    MYSQL_SYSVAR(print_snapshot_conflict_queries),

    MYSQL_SYSVAR(datadir),
    MYSQL_SYSVAR(create_checkpoint),

    MYSQL_SYSVAR(checksums_pct),
    MYSQL_SYSVAR(store_row_debug_checksums),
    MYSQL_SYSVAR(verify_row_debug_checksums),

    MYSQL_SYSVAR(validate_tables),
    MYSQL_SYSVAR(table_stats_sampling_pct),
    nullptr};
1264
1265 static rocksdb::WriteOptions
rdb_get_rocksdb_write_options(my_core::THD * const thd)1266 rdb_get_rocksdb_write_options(my_core::THD *const thd) {
1267 rocksdb::WriteOptions opt;
1268
1269 opt.sync = THDVAR(thd, write_sync);
1270 opt.disableWAL = THDVAR(thd, write_disable_wal);
1271 opt.ignore_missing_column_families =
1272 THDVAR(thd, write_ignore_missing_column_families);
1273
1274 return opt;
1275 }
1276
1277 ///////////////////////////////////////////////////////////////////////////////////////////
1278
/**
  @brief
  Hash-table callback used when building the open-tables hash: returns the
  lookup key (the table name) of an element and stores its length in *length.
*/

uchar *Rdb_open_tables_map::get_hash_key(Rdb_table_handler *const table_handler,
                                         size_t *const length, my_bool not_used
                                         __attribute__((__unused__))) {
  *length = table_handler->m_table_name_length;
  return reinterpret_cast<uchar *>(table_handler->m_table_name);
}
1290
/*
  The following is needed as an argument for mysql_stage_register,
  irrespective of whether we're compiling with P_S or not.
*/
PSI_stage_info stage_waiting_on_row_lock = {0, "Waiting for row lock", 0};
1296
#ifdef HAVE_PSI_INTERFACE
/* Performance Schema instrumentation keys and registration tables. */
static PSI_thread_key rdb_background_psi_thread_key;
static PSI_thread_key rdb_drop_idx_psi_thread_key;

static PSI_stage_info *all_rocksdb_stages[] = {&stage_waiting_on_row_lock};

static my_core::PSI_mutex_key rdb_psi_open_tbls_mutex_key,
    rdb_signal_bg_psi_mutex_key, rdb_signal_drop_idx_psi_mutex_key,
    rdb_collation_data_mutex_key, rdb_mem_cmp_space_mutex_key,
    key_mutex_tx_list, rdb_sysvars_psi_mutex_key;

static PSI_mutex_info all_rocksdb_mutexes[] = {
    {&rdb_psi_open_tbls_mutex_key, "open tables", PSI_FLAG_GLOBAL},
    {&rdb_signal_bg_psi_mutex_key, "stop background", PSI_FLAG_GLOBAL},
    {&rdb_signal_drop_idx_psi_mutex_key, "signal drop index", PSI_FLAG_GLOBAL},
    {&rdb_collation_data_mutex_key, "collation data init", PSI_FLAG_GLOBAL},
    {&rdb_mem_cmp_space_mutex_key, "collation space char data init",
     PSI_FLAG_GLOBAL},
    {&key_mutex_tx_list, "tx_list", PSI_FLAG_GLOBAL},
    {&rdb_sysvars_psi_mutex_key, "setting sysvar", PSI_FLAG_GLOBAL},
};

static PSI_rwlock_key key_rwlock_collation_exception_list;
static PSI_rwlock_key key_rwlock_read_free_rpl_tables;

static PSI_rwlock_info all_rocksdb_rwlocks[] = {
    {&key_rwlock_collation_exception_list, "collation_exception_list",
     PSI_FLAG_GLOBAL},
    {&key_rwlock_read_free_rpl_tables, "read_free_rpl_tables", PSI_FLAG_GLOBAL}
};

// Non-static: these cond keys are shared with other translation units.
PSI_cond_key rdb_signal_bg_psi_cond_key, rdb_signal_drop_idx_psi_cond_key;

static PSI_cond_info all_rocksdb_conds[] = {
    {&rdb_signal_bg_psi_cond_key, "cond signal background", PSI_FLAG_GLOBAL},
    {&rdb_signal_drop_idx_psi_cond_key, "cond signal drop index",
     PSI_FLAG_GLOBAL},
};

static PSI_thread_info all_rocksdb_threads[] = {
    {&rdb_background_psi_thread_key, "background", PSI_FLAG_GLOBAL},
    {&rdb_drop_idx_psi_thread_key, "drop index", PSI_FLAG_GLOBAL},
};
1340
init_rocksdb_psi_keys()1341 static void init_rocksdb_psi_keys() {
1342 const char *const category = "rocksdb";
1343 int count;
1344
1345 if (PSI_server == nullptr)
1346 return;
1347
1348 count = array_elements(all_rocksdb_mutexes);
1349 PSI_server->register_mutex(category, all_rocksdb_mutexes, count);
1350
1351 count = array_elements(all_rocksdb_rwlocks);
1352 PSI_server->register_rwlock(category, all_rocksdb_rwlocks, count);
1353
1354 count = array_elements(all_rocksdb_conds);
1355 // TODO Disabling PFS for conditions due to the bug
1356 // https://github.com/MySQLOnRocksDB/mysql-5.6/issues/92
1357 // PSI_server->register_cond(category, all_rocksdb_conds, count);
1358
1359 count = array_elements(all_rocksdb_stages);
1360 mysql_stage_register(category, all_rocksdb_stages, count);
1361
1362 count = array_elements(all_rocksdb_threads);
1363 mysql_thread_register(category, all_rocksdb_threads, count);
1364 }
1365 #endif
1366
/*
  Drop index thread's control
*/

// Background thread that physically removes data for dropped indexes.
static Rdb_drop_index_thread rdb_drop_idx_thread;
1372
rocksdb_drop_index_wakeup_thread(my_core::THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)1373 static void rocksdb_drop_index_wakeup_thread(
1374 my_core::THD *const thd __attribute__((__unused__)),
1375 struct st_mysql_sys_var *const var __attribute__((__unused__)),
1376 void *const var_ptr __attribute__((__unused__)), const void *const save) {
1377 if (*static_cast<const bool *>(save)) {
1378 rdb_drop_idx_thread.signal();
1379 }
1380 }
1381
rocksdb_perf_context_level(THD * const thd)1382 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
1383 DBUG_ASSERT(thd != nullptr);
1384
1385 const int session_perf_context_level = THDVAR(thd, perf_context_level);
1386 if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
1387 return session_perf_context_level;
1388 }
1389
1390 /*
1391 Fallback to global thdvar, if session specific one was not set to a valid
1392 value.
1393 */
1394
1395 const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
1396 if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
1397 return global_perf_context_level;
1398 }
1399
1400 return rocksdb::PerfLevel::kDisable;
1401 }
1402
/*
  Very short (functor-like) interface to be passed to
  Rdb_transaction::walk_tx_list()
*/

interface Rdb_tx_list_walker {
  virtual ~Rdb_tx_list_walker() {}
  // Called once per live transaction while the tx-list mutex is held.
  virtual void process_tran(const Rdb_transaction *const) = 0;
};
1412
/*
  This is a helper class that is passed to RocksDB to get notifications when
  a snapshot gets created.
*/

class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
  // Back-pointer to the owning transaction; nullptr after detach().
  Rdb_transaction *m_owning_tx;

  void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;

 public:
  Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
  Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;

  explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
      : m_owning_tx(owning_tx) {}

  // If the owning Rdb_transaction gets destructed we need to not reference
  // it anymore.
  void detach() { m_owning_tx = nullptr; }
};
1434
/* This is the base class for transactions when interacting with rocksdb.
 */
class Rdb_transaction {
 protected:
  ulonglong m_write_count = 0;
  ulonglong m_lock_count = 0;

  bool m_is_delayed_snapshot = false;
  bool m_is_two_phase = false;

  THD *m_thd = nullptr;

  // Read options shared by all reads in this transaction; carries the
  // snapshot pointer once one is acquired.
  rocksdb::ReadOptions m_read_opts;

  // Registry of all live transactions, protected by s_tx_list_mutex.
  static std::multiset<Rdb_transaction *> s_tx_list;
  static mysql_mutex_t s_tx_list_mutex;

  // Non-owning; set/cleared by io_perf_start()/io_perf_end_and_record().
  Rdb_io_perf *m_tbl_io_perf;

  bool m_tx_read_only = false;

  int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */

  /* Maximum number of locks the transaction can have */
  ulonglong m_max_row_locks;

  bool m_is_tx_failed = false;
  bool m_rollback_only = false;

  std::shared_ptr<Rdb_snapshot_notifier> m_notifier;

  // This should be used only when updating binlog information.
  virtual bool commit_no_binlog() = 0;
  virtual rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *column_family) = 0;

 public:
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;
1481
  // One-time setup of the tx-list mutex (called at plugin init).
  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  // Teardown counterpart; all transactions must be gone by now.
  static void term_mutex() {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }
1490
walk_tx_list(Rdb_tx_list_walker * walker)1491 static void walk_tx_list(Rdb_tx_list_walker *walker) {
1492 DBUG_ASSERT(walker != nullptr);
1493
1494 mysql_mutex_lock(&s_tx_list_mutex);
1495 for (auto it : s_tx_list)
1496 walker->process_tran(it);
1497 mysql_mutex_unlock(&s_tx_list_mutex);
1498 }
1499
  /*
    Translate a non-OK rocksdb::Status into a handler error code, performing
    the MySQL-side side effects (statement rollback marking, counters,
    my_error) expected for each failure class.
  */
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def) {
    DBUG_ASSERT(!s.ok());
    DBUG_ASSERT(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because
        it's an IGNORE command!") but then will fail an assert, because
        "error code was returned, but no error happened". Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
      */
      my_core::thd_mark_transaction_to_rollback(thd, false /*just statement*/);

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      my_core::thd_mark_transaction_to_rollback(thd,
                                                false /* just statement */);
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      // Snapshot conflict: surfaced to the SQL layer as a deadlock so the
      // statement/transaction is retried.
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd, user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning("Got snapshot conflict errors: User: %s "
                          "Query: %s",
                          user_host_buff, thd->query());
      }
      return HA_ERR_LOCK_DEADLOCK;
    }

    if (s.IsLockLimit()) {
      return HA_ERR_ROCKSDB_TOO_MANY_LOCKS;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }
    // Anything unclassified becomes a generic internal error carrying the
    // RocksDB status text.
    my_error(ER_INTERNAL_ERROR, MYF(0), s.ToString().c_str());
    return HA_ERR_INTERNAL_ERROR;
  }
1546
  THD *get_thd() const { return m_thd; }

  /* Used for tracking io_perf counters */
  void io_perf_start(Rdb_io_perf *const io_perf) {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that hit
      multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd))) {
      m_tbl_io_perf = io_perf;
    }
  }

  // Flush the accumulated perf counters (if any) and stop tracking.
  void io_perf_end_and_record(void) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
      m_tbl_io_perf = nullptr;
    }
  }

  // Same as above, but only if `io_perf` is the tracker currently in use.
  void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
    if (m_tbl_io_perf == io_perf) {
      io_perf_end_and_record();
    }
  }
1584
  // Cache the session's lock-wait timeout and row-lock cap, pushing the
  // timeout down into the underlying RocksDB transaction.
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }

  virtual void set_lock_timeout(int timeout_sec_arg) = 0;

  ulonglong get_write_count() const { return m_write_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;
1605
commit_or_rollback()1606 bool commit_or_rollback() {
1607 bool res;
1608 if (m_is_tx_failed) {
1609 rollback();
1610 res = false;
1611 } else
1612 res = commit();
1613 return res;
1614 }
1615
commit()1616 bool commit() {
1617 if (get_write_count() == 0) {
1618 rollback();
1619 return false;
1620 } else if (m_rollback_only) {
1621 /*
1622 Transactions marked as rollback_only are expected to be rolled back at
1623 prepare(). But there are some exceptions like below that prepare() is
1624 never called and commit() is called instead.
1625 1. Binlog is disabled
1626 2. No modification exists in binlog cache for the transaction (#195)
1627 In both cases, rolling back transaction is safe. Nothing is written to
1628 binlog.
1629 */
1630 my_printf_error(ER_UNKNOWN_ERROR, ERRSTR_ROLLBACK_ONLY, MYF(0));
1631 rollback();
1632 return true;
1633 } else {
1634 return commit_no_binlog();
1635 }
1636 }
1637
  // Roll back the transaction, discarding buffered writes.
  virtual void rollback() = 0;
1639
snapshot_created(const rocksdb::Snapshot * const snapshot)1640 void snapshot_created(const rocksdb::Snapshot *const snapshot) {
1641 DBUG_ASSERT(snapshot != nullptr);
1642
1643 m_read_opts.snapshot = snapshot;
1644 rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
1645 m_is_delayed_snapshot = false;
1646 }
1647
  // Take a snapshot immediately, or lazily on the next operation.
  virtual void acquire_snapshot(bool acquire_now) = 0;
  virtual void release_snapshot() = 0;

  // True when a read snapshot is currently installed in m_read_opts.
  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }

 private:
  // The tables we are currently loading. In a partitioned table this can
  // have more than one entry
  std::vector<ha_rocksdb *> m_curr_bulk_load;

 public:
finish_bulk_load()1659 int finish_bulk_load() {
1660 int rc = 0;
1661
1662 std::vector<ha_rocksdb *>::iterator it;
1663 while ((it = m_curr_bulk_load.begin()) != m_curr_bulk_load.end()) {
1664 int rc2 = (*it)->finalize_bulk_load();
1665 if (rc2 != 0 && rc == 0) {
1666 rc = rc2;
1667 }
1668 }
1669
1670 DBUG_ASSERT(m_curr_bulk_load.size() == 0);
1671
1672 return rc;
1673 }
1674
start_bulk_load(ha_rocksdb * const bulk_load)1675 void start_bulk_load(ha_rocksdb *const bulk_load) {
1676 /*
1677 If we already have an open bulk load of a table and the name doesn't
1678 match the current one, close out the currently running one. This allows
1679 multiple bulk loads to occur on a partitioned table, but then closes
1680 them all out when we switch to another table.
1681 */
1682 DBUG_ASSERT(bulk_load != nullptr);
1683
1684 if (!m_curr_bulk_load.empty() &&
1685 !bulk_load->same_table(*m_curr_bulk_load[0])) {
1686 const auto res = finish_bulk_load();
1687 SHIP_ASSERT(res == 0);
1688 }
1689
1690 m_curr_bulk_load.push_back(bulk_load);
1691 }
1692
end_bulk_load(ha_rocksdb * const bulk_load)1693 void end_bulk_load(ha_rocksdb *const bulk_load) {
1694 for (auto it = m_curr_bulk_load.begin(); it != m_curr_bulk_load.end();
1695 it++) {
1696 if (*it == bulk_load) {
1697 m_curr_bulk_load.erase(it);
1698 return;
1699 }
1700 }
1701
1702 // Should not reach here
1703 SHIP_ASSERT(0);
1704 }
1705
num_ongoing_bulk_load() const1706 int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
1707
1708 /*
1709 Flush the data accumulated so far. This assumes we're doing a bulk insert.
1710
1711 @detail
1712 This should work like transaction commit, except that we don't
1713 synchronize with the binlog (there is no API that would allow to have
1714 binlog flush the changes accumulated so far and return its current
1715 position)
1716
1717 @todo
1718 Add test coverage for what happens when somebody attempts to do bulk
1719 inserts while inside a multi-statement transaction.
1720 */
flush_batch()1721 bool flush_batch() {
1722 if (get_write_count() == 0)
1723 return false;
1724
1725 /* Commit the current transaction */
1726 if (commit_no_binlog())
1727 return true;
1728
1729 /* Start another one */
1730 start_tx();
1731 return false;
1732 }
1733
  // Write a key/value pair through the transaction.
  virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value) = 0;
  // Delete a key through the transaction.
  virtual rocksdb::Status
  delete_key(rocksdb::ColumnFamilyHandle *const column_family,
             const rocksdb::Slice &key) = 0;
  // SingleDelete variant (see rocksdb::SingleDelete() usage constraints).
  virtual rocksdb::Status
  single_delete(rocksdb::ColumnFamilyHandle *const column_family,
                const rocksdb::Slice &key) = 0;

  // True when the transaction has buffered any writes.
  virtual bool has_modifications() const = 0;

  virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes will NOT be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_blind_write_batch() {
    return get_indexed_write_batch()->GetWriteBatch();
  }

  // Point lookup through the transaction's read options.
  virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              std::string *value) const = 0;
  // Locking read; `exclusive` selects write vs. read lock semantics.
  virtual rocksdb::Status
  get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
                 const rocksdb::Slice &key, std::string *const value,
                 bool exclusive) = 0;
1762
1763 rocksdb::Iterator *
get_iterator(rocksdb::ColumnFamilyHandle * const column_family,bool skip_bloom_filter,bool fill_cache,bool read_current=false,bool create_snapshot=true)1764 get_iterator(rocksdb::ColumnFamilyHandle *const column_family,
1765 bool skip_bloom_filter, bool fill_cache,
1766 bool read_current = false, bool create_snapshot = true) {
1767 // Make sure we are not doing both read_current (which implies we don't
1768 // want a snapshot) and create_snapshot which makes sure we create
1769 // a snapshot
1770 DBUG_ASSERT(column_family != nullptr);
1771 DBUG_ASSERT(!read_current || !create_snapshot);
1772
1773 if (create_snapshot)
1774 acquire_snapshot(true);
1775
1776 rocksdb::ReadOptions options = m_read_opts;
1777
1778 if (skip_bloom_filter) {
1779 options.total_order_seek = true;
1780 } else {
1781 // With this option, Iterator::Valid() returns false if key
1782 // is outside of the prefix bloom filter range set at Seek().
1783 // Must not be set to true if not using bloom filter.
1784 options.prefix_same_as_start = true;
1785 }
1786 options.fill_cache = fill_cache;
1787 if (read_current) {
1788 options.snapshot = nullptr;
1789 }
1790 return get_iterator(options, column_family);
1791 }
1792
  virtual bool is_tx_started() const = 0;
  virtual void start_tx() = 0;
  virtual void start_stmt() = 0;
  virtual void rollback_stmt() = 0;

  // Remember whether the transaction hit an error; consulted by
  // commit_or_rollback() to pick commit vs. rollback.
  void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
1799
can_prepare() const1800 bool can_prepare() const {
1801 if (m_rollback_only) {
1802 my_printf_error(ER_UNKNOWN_ERROR, ERRSTR_ROLLBACK_ONLY, MYF(0));
1803 return false;
1804 }
1805 return true;
1806 }
1807
rollback_to_savepoint(void * const savepoint)1808 int rollback_to_savepoint(void *const savepoint) {
1809 if (has_modifications()) {
1810 my_printf_error(ER_UNKNOWN_ERROR,
1811 "MyRocks currently does not support ROLLBACK TO "
1812 "SAVEPOINT if modifying rows.",
1813 MYF(0));
1814 m_rollback_only = true;
1815 return HA_EXIT_FAILURE;
1816 }
1817 return HA_EXIT_SUCCESS;
1818 }
1819
  /*
    This is used by transactions started with "START TRANSACTION WITH "
    "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
    snapshot has to be created via DB::GetSnapshot(), not via Transaction
    API.
  */
  bool is_tx_read_only() const { return m_tx_read_only; }

  // True when this transaction participates in two-phase commit.
  bool is_two_phase() const { return m_is_two_phase; }

  void set_tx_read_only(bool val) { m_tx_read_only = val; }
1831
  // Register this transaction in the global list (s_tx_list) under its
  // mutex, so diagnostic code can enumerate all live transactions.
  explicit Rdb_transaction(THD *const thd)
      : m_thd(thd), m_tbl_io_perf(nullptr) {
    mysql_mutex_lock(&s_tx_list_mutex);
    s_tx_list.insert(this);
    mysql_mutex_unlock(&s_tx_list_mutex);
  }

  // Unregister from the global transaction list.  Virtual so derived
  // destructors (which release RocksDB objects) run first.
  virtual ~Rdb_transaction() {
    mysql_mutex_lock(&s_tx_list_mutex);
    s_tx_list.erase(this);
    mysql_mutex_unlock(&s_tx_list_mutex);
  }
1844 };
1845
1846 /*
1847 This is a rocksdb transaction. Its members represent the current transaction,
1848 which consists of:
1849 - the snapshot
1850 - the changes we've made but are not seeing yet.
1851
1852 The changes are made to individual tables, which store them here and then
1853 this object commits them on commit.
1854 */
1855 class Rdb_transaction_impl : public Rdb_transaction {
1856 rocksdb::Transaction *m_rocksdb_tx = nullptr;
1857 rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;
1858
1859 public:
set_lock_timeout(int timeout_sec_arg)1860 void set_lock_timeout(int timeout_sec_arg) override {
1861 if (m_rocksdb_tx)
1862 m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
1863 }
1864
set_sync(bool sync)1865 void set_sync(bool sync) override {
1866 m_rocksdb_tx->GetWriteOptions()->sync = sync;
1867 }
1868
release_lock(rocksdb::ColumnFamilyHandle * const column_family,const std::string & rowkey)1869 void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
1870 const std::string &rowkey) override {
1871 if (!THDVAR(m_thd, lock_scanned_rows)) {
1872 m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
1873 }
1874 }
1875
is_writebatch_trx() const1876 virtual bool is_writebatch_trx() const override { return false; }
1877
1878 private:
release_tx(void)1879 void release_tx(void) {
1880 // We are done with the current active transaction object. Preserve it
1881 // for later reuse.
1882 DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
1883 m_rocksdb_reuse_tx = m_rocksdb_tx;
1884 m_rocksdb_tx = nullptr;
1885 }
1886
prepare(const rocksdb::TransactionName & name)1887 bool prepare(const rocksdb::TransactionName &name) override {
1888 rocksdb::Status s;
1889 s = m_rocksdb_tx->SetName(name);
1890 if (!s.ok()) {
1891 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
1892 return false;
1893 }
1894
1895 s = m_rocksdb_tx->Prepare();
1896 if (!s.ok()) {
1897 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
1898 return false;
1899 }
1900 return true;
1901 }
1902
commit_no_binlog()1903 bool commit_no_binlog() override {
1904 bool res = false;
1905 release_snapshot();
1906 const rocksdb::Status s = m_rocksdb_tx->Commit();
1907 if (!s.ok()) {
1908 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
1909 res = true;
1910 }
1911
1912 /* Save the transaction object to be reused */
1913 release_tx();
1914
1915 m_write_count = 0;
1916 m_lock_count = 0;
1917 set_tx_read_only(false);
1918 m_rollback_only = false;
1919 return res;
1920 }
1921
1922 public:
rollback()1923 void rollback() override {
1924 m_write_count = 0;
1925 m_lock_count = 0;
1926 m_ddl_transaction = false;
1927 if (m_rocksdb_tx) {
1928 release_snapshot();
1929 /* This will also release all of the locks: */
1930 m_rocksdb_tx->Rollback();
1931
1932 /* Save the transaction object to be reused */
1933 release_tx();
1934
1935 set_tx_read_only(false);
1936 m_rollback_only = false;
1937 }
1938 }
1939
acquire_snapshot(bool acquire_now)1940 void acquire_snapshot(bool acquire_now) override {
1941 if (m_read_opts.snapshot == nullptr) {
1942 if (is_tx_read_only()) {
1943 snapshot_created(rdb->GetSnapshot());
1944 } else if (acquire_now) {
1945 m_rocksdb_tx->SetSnapshot();
1946 snapshot_created(m_rocksdb_tx->GetSnapshot());
1947 } else if (!m_is_delayed_snapshot) {
1948 m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
1949 m_is_delayed_snapshot = true;
1950 }
1951 }
1952 }
1953
release_snapshot()1954 void release_snapshot() override {
1955 bool need_clear = m_is_delayed_snapshot;
1956
1957 if (m_read_opts.snapshot != nullptr) {
1958 m_snapshot_timestamp = 0;
1959 if (is_tx_read_only()) {
1960 rdb->ReleaseSnapshot(m_read_opts.snapshot);
1961 need_clear = false;
1962 } else {
1963 need_clear = true;
1964 }
1965 m_read_opts.snapshot = nullptr;
1966 }
1967
1968 if (need_clear && m_rocksdb_tx != nullptr)
1969 m_rocksdb_tx->ClearSnapshot();
1970 }
1971
has_snapshot()1972 bool has_snapshot() { return m_read_opts.snapshot != nullptr; }
1973
put(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const rocksdb::Slice & value)1974 rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
1975 const rocksdb::Slice &key,
1976 const rocksdb::Slice &value) override {
1977 ++m_write_count;
1978 ++m_lock_count;
1979 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
1980 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
1981 return m_rocksdb_tx->Put(column_family, key, value);
1982 }
1983
delete_key(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key)1984 rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
1985 const rocksdb::Slice &key) override {
1986 ++m_write_count;
1987 ++m_lock_count;
1988 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
1989 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
1990 return m_rocksdb_tx->Delete(column_family, key);
1991 }
1992
1993 rocksdb::Status
single_delete(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key)1994 single_delete(rocksdb::ColumnFamilyHandle *const column_family,
1995 const rocksdb::Slice &key) override {
1996 ++m_write_count;
1997 ++m_lock_count;
1998 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
1999 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
2000 return m_rocksdb_tx->SingleDelete(column_family, key);
2001 }
2002
has_modifications() const2003 bool has_modifications() const override {
2004 return m_rocksdb_tx->GetWriteBatch() &&
2005 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
2006 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
2007 }
2008
2009 /*
2010 Return a WriteBatch that one can write to. The writes will skip any
2011 transaction locking. The writes WILL be visible to the transaction.
2012 */
get_indexed_write_batch()2013 rocksdb::WriteBatchBase *get_indexed_write_batch() override {
2014 ++m_write_count;
2015 return m_rocksdb_tx->GetWriteBatch();
2016 }
2017
get(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,std::string * value) const2018 rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
2019 const rocksdb::Slice &key,
2020 std::string *value) const override {
2021 return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
2022 }
2023
2024 rocksdb::Status
get_for_update(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,std::string * const value,bool exclusive)2025 get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
2026 const rocksdb::Slice &key, std::string *const value,
2027 bool exclusive) override {
2028 if (++m_lock_count > m_max_row_locks)
2029 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
2030
2031 return m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
2032 exclusive);
2033 }
2034
2035 rocksdb::Iterator *
get_iterator(const rocksdb::ReadOptions & options,rocksdb::ColumnFamilyHandle * const column_family)2036 get_iterator(const rocksdb::ReadOptions &options,
2037 rocksdb::ColumnFamilyHandle *const column_family) override {
2038 return m_rocksdb_tx->GetIterator(options, column_family);
2039 }
2040
get_rdb_trx() const2041 const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
2042
is_tx_started() const2043 bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }
2044
start_tx()2045 void start_tx() override {
2046 rocksdb::TransactionOptions tx_opts;
2047 rocksdb::WriteOptions write_opts;
2048 tx_opts.set_snapshot = false;
2049 tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
2050 tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
2051
2052 write_opts.sync = THDVAR(m_thd, write_sync);
2053 write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
2054 write_opts.ignore_missing_column_families =
2055 THDVAR(m_thd, write_ignore_missing_column_families);
2056 m_is_two_phase = rocksdb_enable_2pc;
2057
2058 /*
2059 If m_rocksdb_reuse_tx is null this will create a new transaction object.
2060 Otherwise it will reuse the existing one.
2061 */
2062 m_rocksdb_tx =
2063 rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
2064 m_rocksdb_reuse_tx = nullptr;
2065
2066 m_read_opts = rocksdb::ReadOptions();
2067
2068 m_ddl_transaction = false;
2069 }
2070
2071 /*
2072 Start a statement inside a multi-statement transaction.
2073
2074 @todo: are we sure this is called once (and not several times) per
2075 statement start?
2076
2077 For hooking to start of statement that is its own transaction, see
2078 ha_rocksdb::external_lock().
2079 */
start_stmt()2080 void start_stmt() override {
2081 // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
2082 acquire_snapshot(false);
2083 m_rocksdb_tx->SetSavePoint();
2084 }
2085
2086 /*
2087 This must be called when last statement is rolled back, but the transaction
2088 continues
2089 */
rollback_stmt()2090 void rollback_stmt() override {
2091 /* TODO: here we must release the locks taken since the start_stmt() call */
2092 if (m_rocksdb_tx) {
2093 const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
2094 m_rocksdb_tx->RollbackToSavePoint();
2095
2096 const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
2097 if (org_snapshot != cur_snapshot) {
2098 if (org_snapshot != nullptr)
2099 m_snapshot_timestamp = 0;
2100
2101 m_read_opts.snapshot = cur_snapshot;
2102 if (cur_snapshot != nullptr)
2103 rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
2104 else
2105 m_is_delayed_snapshot = true;
2106 }
2107 }
2108 }
2109
Rdb_transaction_impl(THD * const thd)2110 explicit Rdb_transaction_impl(THD *const thd)
2111 : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
2112 // Create a notifier that can be called when a snapshot gets generated.
2113 m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
2114 }
2115
~Rdb_transaction_impl()2116 virtual ~Rdb_transaction_impl() {
2117 rollback();
2118
2119 // Theoretically the notifier could outlive the Rdb_transaction_impl
2120 // (because of the shared_ptr), so let it know it can't reference
2121 // the transaction anymore.
2122 m_notifier->detach();
2123
2124 // Free any transaction memory that is still hanging around.
2125 delete m_rocksdb_reuse_tx;
2126 DBUG_ASSERT(m_rocksdb_tx == nullptr);
2127 }
2128 };
2129
2130 /* This is a rocksdb write batch. This class doesn't hold or wait on any
2131 transaction locks (skips rocksdb transaction API) thus giving better
2132 performance. The commit is done through rdb->GetBaseDB()->Commit().
2133
   Currently this is only used for replication threads, which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through thoroughly.
2137 */
2138 class Rdb_writebatch_impl : public Rdb_transaction {
2139 rocksdb::WriteBatchWithIndex *m_batch;
2140 rocksdb::WriteOptions write_opts;
2141 // Called after commit/rollback.
reset()2142 void reset() {
2143 m_batch->Clear();
2144 m_read_opts = rocksdb::ReadOptions();
2145 m_ddl_transaction = false;
2146 }
2147
2148 private:
prepare(const rocksdb::TransactionName & name)2149 bool prepare(const rocksdb::TransactionName &name) override { return true; }
2150
commit_no_binlog()2151 bool commit_no_binlog() override {
2152 bool res = false;
2153 release_snapshot();
2154 const rocksdb::Status s =
2155 rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch());
2156 if (!s.ok()) {
2157 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
2158 res = true;
2159 }
2160 reset();
2161
2162 m_write_count = 0;
2163 set_tx_read_only(false);
2164 m_rollback_only = false;
2165 return res;
2166 }
2167
2168 public:
is_writebatch_trx() const2169 bool is_writebatch_trx() const override { return true; }
2170
set_lock_timeout(int timeout_sec_arg)2171 void set_lock_timeout(int timeout_sec_arg) override {
2172 // Nothing to do here.
2173 }
2174
set_sync(bool sync)2175 void set_sync(bool sync) override { write_opts.sync = sync; }
2176
release_lock(rocksdb::ColumnFamilyHandle * const column_family,const std::string & rowkey)2177 void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
2178 const std::string &rowkey) override {
2179 // Nothing to do here since we don't hold any row locks.
2180 }
2181
rollback()2182 void rollback() override {
2183 m_write_count = 0;
2184 m_lock_count = 0;
2185 release_snapshot();
2186
2187 reset();
2188 set_tx_read_only(false);
2189 m_rollback_only = false;
2190 }
2191
acquire_snapshot(bool acquire_now)2192 void acquire_snapshot(bool acquire_now) override {
2193 if (m_read_opts.snapshot == nullptr)
2194 snapshot_created(rdb->GetSnapshot());
2195 }
2196
release_snapshot()2197 void release_snapshot() override {
2198 if (m_read_opts.snapshot != nullptr) {
2199 rdb->ReleaseSnapshot(m_read_opts.snapshot);
2200 m_read_opts.snapshot = nullptr;
2201 }
2202 }
2203
put(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const rocksdb::Slice & value)2204 rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
2205 const rocksdb::Slice &key,
2206 const rocksdb::Slice &value) override {
2207 ++m_write_count;
2208 m_batch->Put(column_family, key, value);
2209 // Note Put/Delete in write batch doesn't return any error code. We simply
2210 // return OK here.
2211 return rocksdb::Status::OK();
2212 }
2213
delete_key(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key)2214 rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
2215 const rocksdb::Slice &key) override {
2216 ++m_write_count;
2217 m_batch->Delete(column_family, key);
2218 return rocksdb::Status::OK();
2219 }
2220
2221 rocksdb::Status
single_delete(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key)2222 single_delete(rocksdb::ColumnFamilyHandle *const column_family,
2223 const rocksdb::Slice &key) override {
2224 ++m_write_count;
2225 m_batch->SingleDelete(column_family, key);
2226 return rocksdb::Status::OK();
2227 }
2228
has_modifications() const2229 bool has_modifications() const override {
2230 return m_batch->GetWriteBatch()->Count() > 0;
2231 }
2232
get_indexed_write_batch()2233 rocksdb::WriteBatchBase *get_indexed_write_batch() override {
2234 ++m_write_count;
2235 return m_batch;
2236 }
2237
get(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,std::string * const value) const2238 rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
2239 const rocksdb::Slice &key,
2240 std::string *const value) const override {
2241 return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
2242 value);
2243 }
2244
2245 rocksdb::Status
get_for_update(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,std::string * const value,bool exclusive)2246 get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
2247 const rocksdb::Slice &key, std::string *const value,
2248 bool exclusive) override {
2249 return get(column_family, key, value);
2250 }
2251
2252 rocksdb::Iterator *
get_iterator(const rocksdb::ReadOptions & options,rocksdb::ColumnFamilyHandle * const column_family)2253 get_iterator(const rocksdb::ReadOptions &options,
2254 rocksdb::ColumnFamilyHandle *const column_family) override {
2255 const auto it = rdb->NewIterator(options);
2256 return m_batch->NewIteratorWithBase(it);
2257 }
2258
is_tx_started() const2259 bool is_tx_started() const override { return (m_batch != nullptr); }
2260
start_tx()2261 void start_tx() override {
2262 reset();
2263 write_opts.sync = THDVAR(m_thd, write_sync);
2264 write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
2265 write_opts.ignore_missing_column_families =
2266 THDVAR(m_thd, write_ignore_missing_column_families);
2267 }
2268
start_stmt()2269 void start_stmt() override { m_batch->SetSavePoint(); }
2270
rollback_stmt()2271 void rollback_stmt() override {
2272 if (m_batch)
2273 m_batch->RollbackToSavePoint();
2274 }
2275
Rdb_writebatch_impl(THD * const thd)2276 explicit Rdb_writebatch_impl(THD *const thd)
2277 : Rdb_transaction(thd), m_batch(nullptr) {
2278 m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
2279 true);
2280 }
2281
~Rdb_writebatch_impl()2282 virtual ~Rdb_writebatch_impl() {
2283 rollback();
2284 delete m_batch;
2285 }
2286 };
2287
SnapshotCreated(const rocksdb::Snapshot * const snapshot)2288 void Rdb_snapshot_notifier::SnapshotCreated(
2289 const rocksdb::Snapshot *const snapshot) {
2290 if (m_owning_tx != nullptr) {
2291 m_owning_tx->snapshot_created(snapshot);
2292 }
2293 }
2294
/* Definitions of Rdb_transaction's static transaction-tracking members. */
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;

// Return a reference to this connection's transaction slot in the THD's
// per-handlerton storage.  Holds nullptr until get_or_create_tx() runs.
static Rdb_transaction *&get_tx_from_thd(THD *const thd) {
  return *reinterpret_cast<Rdb_transaction **>(
      my_core::thd_ha_data(thd, rocksdb_hton));
}
2302
2303 namespace {
2304
// RAII guard that records RocksDB perf-context statistics for the duration
// of a scope, attributing them to the connection's current transaction.
class Rdb_perf_context_guard {
  Rdb_io_perf m_io_perf;  // counters accumulated while the guard is alive
  THD *m_thd;             // connection whose transaction receives the stats

 public:
  Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
  Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;

  explicit Rdb_perf_context_guard(THD *const thd) : m_thd(thd) {
    Rdb_transaction *&tx = get_tx_from_thd(m_thd);
    /*
      if perf_context information is already being recorded, this becomes a
      no-op
    */
    if (tx != nullptr) {
      tx->io_perf_start(&m_io_perf);
    }
  }

  // Stops recording and folds the counters into the transaction's stats.
  ~Rdb_perf_context_guard() {
    Rdb_transaction *&tx = get_tx_from_thd(m_thd);
    if (tx != nullptr) {
      tx->io_perf_end_and_record();
    }
  }
};
2331
2332 } // anonymous namespace
2333
2334 /*
2335 TODO: maybe, call this in external_lock() and store in ha_rocksdb..
2336 */
2337
get_or_create_tx(THD * const thd)2338 static Rdb_transaction *get_or_create_tx(THD *const thd) {
2339 Rdb_transaction *&tx = get_tx_from_thd(thd);
2340 // TODO: this is called too many times.. O(#rows)
2341 if (tx == nullptr) {
2342 if (rpl_skip_tx_api_var && thd->rli_slave)
2343 tx = new Rdb_writebatch_impl(thd);
2344 else
2345 tx = new Rdb_transaction_impl(thd);
2346 tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
2347 tx->start_tx();
2348 } else {
2349 tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
2350 if (!tx->is_tx_started()) {
2351 tx->start_tx();
2352 }
2353 }
2354
2355 return tx;
2356 }
2357
rocksdb_close_connection(handlerton * const hton,THD * const thd)2358 static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
2359 Rdb_transaction *&tx = get_tx_from_thd(thd);
2360 if (tx != nullptr) {
2361 int rc = tx->finish_bulk_load();
2362 if (rc != 0) {
2363 // NO_LINT_DEBUG
2364 sql_print_error("RocksDB: Error %d finalizing last SST file while "
2365 "disconnecting",
2366 rc);
2367 abort_with_stack_traces();
2368 }
2369
2370 delete tx;
2371 tx = nullptr;
2372 }
2373 return HA_EXIT_SUCCESS;
2374 }
2375
2376 /*
2377 * Serializes an xid to a string so that it can
2378 * be used as a rocksdb transaction name
2379 */
rdb_xid_to_string(const XID & src)2380 static std::string rdb_xid_to_string(const XID &src) {
2381 DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
2382 DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);
2383
2384 std::string buf;
2385 buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);
2386
2387 /*
2388 * expand formatID to fill 8 bytes if it doesn't already
2389 * then reinterpret bit pattern as unsigned and store in network order
2390 */
2391 uchar fidbuf[RDB_FORMATID_SZ];
2392 int64 signed_fid8 = src.formatID;
2393 const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
2394 rdb_netbuf_store_uint64(fidbuf, raw_fid8);
2395 buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);
2396
2397 buf.push_back(src.gtrid_length);
2398 buf.push_back(src.bqual_length);
2399 buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
2400 return buf;
2401 }
2402
2403 /**
2404 Called by hton->flush_logs after MySQL group commit prepares a set of
2405 transactions.
2406 */
rocksdb_flush_wal(handlerton * const hton)2407 static bool rocksdb_flush_wal(handlerton *const hton
2408 __attribute__((__unused__))) {
2409 DBUG_ASSERT(rdb != nullptr);
2410 rocksdb_wal_group_syncs++;
2411 const rocksdb::Status s = rdb->SyncWAL();
2412 if (!s.ok()) {
2413 return HA_EXIT_FAILURE;
2414 }
2415 return HA_EXIT_SUCCESS;
2416 }
2417
2418 /**
2419 For a slave, prepare() updates the slave_gtid_info table which tracks the
2420 replication progress.
2421 */
static int rocksdb_prepare(handlerton *const hton, THD *const thd,
                           bool prepare_tx) {
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  // A rollback-only transaction refuses to prepare (reports its own error).
  if (!tx->can_prepare()) {
    return HA_EXIT_FAILURE;
  }
  // Prepare when this is the real transaction prepare (prepare_tx), or when
  // the statement is autocommit (no enclosing multi-statement transaction).
  if (prepare_tx ||
      (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {

    if (tx->is_two_phase()) {
      // NOTE(review): sync is disabled under HA_IGNORE_DURABILITY —
      // presumably the server's group commit handles durability; confirm.
      if (thd->durability_property == HA_IGNORE_DURABILITY) {
        tx->set_sync(false);
      }
      XID xid;
      thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
      // The serialized XID doubles as the RocksDB transaction name.
      if (!tx->prepare(rdb_xid_to_string(xid))) {
        return HA_EXIT_FAILURE;
      }
    }

    DEBUG_SYNC(thd, "rocksdb.prepared");
  }

  return HA_EXIT_SUCCESS;
}
2447
2448 /**
2449 do nothing for prepare/commit by xid
2450 this is needed to avoid crashes in XA scenarios
2451 */
rocksdb_commit_by_xid(handlerton * const hton,XID * const xid)2452 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
2453 const auto name = rdb_xid_to_string(*xid);
2454 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
2455 if (trx == nullptr) {
2456 return HA_EXIT_FAILURE;
2457 }
2458 const rocksdb::Status s = trx->Commit();
2459 if (!s.ok()) {
2460 return HA_EXIT_FAILURE;
2461 }
2462 delete trx;
2463 return HA_EXIT_SUCCESS;
2464 }
2465
rocksdb_rollback_by_xid(handlerton * const hton,XID * const xid)2466 static int rocksdb_rollback_by_xid(handlerton *const hton
2467 __attribute__((__unused__)),
2468 XID *const xid) {
2469 const auto name = rdb_xid_to_string(*xid);
2470 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
2471 if (trx == nullptr) {
2472 return HA_EXIT_FAILURE;
2473 }
2474 const rocksdb::Status s = trx->Rollback();
2475 if (!s.ok()) {
2476 return HA_EXIT_FAILURE;
2477 }
2478 delete trx;
2479 return HA_EXIT_SUCCESS;
2480 }
2481
2482 /**
2483 Rebuilds an XID from a serialized version stored in a string.
2484 */
rdb_xid_from_string(const std::string & src,XID * const dst)2485 static void rdb_xid_from_string(const std::string &src, XID *const dst) {
2486 DBUG_ASSERT(dst != nullptr);
2487 uint offset = 0;
2488 uint64 raw_fid8 =
2489 rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
2490 const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
2491 dst->formatID = signed_fid8;
2492 offset += RDB_FORMATID_SZ;
2493 dst->gtrid_length = src.at(offset);
2494 offset += RDB_GTRID_SZ;
2495 dst->bqual_length = src.at(offset);
2496 offset += RDB_BQUAL_SZ;
2497
2498 DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
2499 DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);
2500
2501 src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length),
2502 RDB_XIDHDR_LEN);
2503 }
2504
2505 /**
2506 Reading last committed binary log info from RocksDB system row.
2507 The info is needed for crash safe slave/master to work.
2508 */
rocksdb_recover(handlerton * const hton,XID * const xid_list,uint len)2509 static int rocksdb_recover(handlerton *const hton, XID *const xid_list,
2510 uint len) {
2511 if (len == 0 || xid_list == nullptr) {
2512 return HA_EXIT_SUCCESS;
2513 }
2514
2515 std::vector<rocksdb::Transaction *> trans_list;
2516 rdb->GetAllPreparedTransactions(&trans_list);
2517
2518 uint count = 0;
2519 for (auto &trans : trans_list) {
2520 if (count >= len) {
2521 break;
2522 }
2523 auto name = trans->GetName();
2524 rdb_xid_from_string(name, &xid_list[count]);
2525 count++;
2526 }
2527 return count;
2528 }
2529
rocksdb_commit(handlerton * const hton,THD * const thd,bool commit_tx)2530 static int rocksdb_commit(handlerton *const hton, THD *const thd,
2531 bool commit_tx) {
2532 DBUG_ENTER_FUNC();
2533
2534 DBUG_ASSERT(hton != nullptr);
2535 DBUG_ASSERT(thd != nullptr);
2536
2537 /* this will trigger saving of perf_context information */
2538 Rdb_perf_context_guard guard(thd);
2539
2540 /* note: h->external_lock(F_UNLCK) is called after this function is called) */
2541 Rdb_transaction *&tx = get_tx_from_thd(thd);
2542
2543 if (tx != nullptr) {
2544 if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
2545 OPTION_BEGIN))) {
2546 /*
2547 We get here
2548 - For a COMMIT statement that finishes a multi-statement transaction
2549 - For a statement that has its own transaction
2550 */
2551 if (tx->commit())
2552 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
2553 } else {
2554 /*
2555 We get here when committing a statement within a transaction.
2556
2557 We don't need to do anything here. tx->start_stmt() will notify
2558 Rdb_transaction_impl that another statement has started.
2559 */
2560 tx->set_tx_failed(false);
2561 }
2562
2563 if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
2564 // For READ_COMMITTED, we release any existing snapshot so that we will
2565 // see any changes that occurred since the last statement.
2566 tx->release_snapshot();
2567 }
2568 }
2569
2570 DBUG_RETURN(HA_EXIT_SUCCESS);
2571 }
2572
rocksdb_rollback(handlerton * const hton,THD * const thd,bool rollback_tx)2573 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
2574 bool rollback_tx) {
2575 Rdb_perf_context_guard guard(thd);
2576 Rdb_transaction *&tx = get_tx_from_thd(thd);
2577
2578 if (tx != nullptr) {
2579 if (rollback_tx) {
2580 /*
2581 We get here, when
2582 - ROLLBACK statement is issued.
2583
2584 Discard the changes made by the transaction
2585 */
2586 tx->rollback();
2587 } else {
2588 /*
2589 We get here when
2590 - a statement with AUTOCOMMIT=1 is being rolled back (because of some
2591 error)
2592 - a statement inside a transaction is rolled back
2593 */
2594
2595 tx->rollback_stmt();
2596 tx->set_tx_failed(true);
2597 }
2598
2599 if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
2600 // For READ_COMMITTED, we release any existing snapshot so that we will
2601 // see any changes that occurred since the last statement.
2602 tx->release_snapshot();
2603 }
2604 }
2605 return HA_EXIT_SUCCESS;
2606 }
2607
print_stats(THD * const thd,std::string const & type,std::string const & name,std::string const & status,stat_print_fn * stat_print)2608 static bool print_stats(THD *const thd, std::string const &type,
2609 std::string const &name, std::string const &status,
2610 stat_print_fn *stat_print) {
2611 return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
2612 status.c_str(), status.size());
2613 }
2614
format_string(const char * const format,...)2615 static std::string format_string(const char *const format, ...) {
2616 std::string res;
2617 va_list args;
2618 va_list args_copy;
2619 char static_buff[256];
2620
2621 DBUG_ASSERT(format != nullptr);
2622
2623 va_start(args, format);
2624 va_copy(args_copy, args);
2625
2626 // Calculate how much space we will need
2627 int len = vsnprintf(nullptr, 0, format, args);
2628 va_end(args);
2629
2630 if (len < 0) {
2631 res = std::string("<format error>");
2632 } else if (len == 0) {
2633 // Shortcut for an empty string
2634 res = std::string("");
2635 } else {
2636 // For short enough output use a static buffer
2637 char *buff = static_buff;
2638 std::unique_ptr<char[]> dynamic_buff = nullptr;
2639
2640 len++; // Add one for null terminator
2641
2642 // for longer output use an allocated buffer
2643 if (static_cast<uint>(len) > sizeof(static_buff)) {
2644 dynamic_buff.reset(new char[len]);
2645 buff = dynamic_buff.get();
2646 }
2647
2648 // Now re-do the vsnprintf with the buffer which is now large enough
2649 (void)vsnprintf(buff, len, format, args_copy);
2650
2651 // Convert to a std::string. Note we could have created a std::string
2652 // large enough and then converted the buffer to a 'char*' and created
2653 // the output in place. This would probably work but feels like a hack.
2654 // Since this isn't code that needs to be super-performant we are going
2655 // with this 'safer' method.
2656 res = std::string(buff);
2657 }
2658
2659 va_end(args_copy);
2660
2661 return res;
2662 }
2663
2664 class Rdb_snapshot_status : public Rdb_tx_list_walker {
2665 private:
2666 std::string m_data;
2667
current_timestamp(void)2668 static std::string current_timestamp(void) {
2669 static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
2670 time_t currtime;
2671 struct tm currtm;
2672
2673 time(&currtime);
2674
2675 localtime_r(&currtime, &currtm);
2676
2677 return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
2678 currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
2679 currtm.tm_sec);
2680 }
2681
get_header(void)2682 static std::string get_header(void) {
2683 return "\n============================================================\n" +
2684 current_timestamp() +
2685 " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
2686 "============================================================\n"
2687 "---------\n"
2688 "SNAPSHOTS\n"
2689 "---------\n"
2690 "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
2691 }
2692
get_footer(void)2693 static std::string get_footer(void) {
2694 return "-----------------------------------------\n"
2695 "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
2696 "=========================================\n";
2697 }
2698
2699 public:
Rdb_snapshot_status()2700 Rdb_snapshot_status() : m_data(get_header()) {}
2701
getResult()2702 std::string getResult() { return m_data + get_footer(); }
2703
2704 /* Implement Rdb_transaction interface */
2705 /* Create one row in the snapshot status table */
process_tran(const Rdb_transaction * const tx)2706 void process_tran(const Rdb_transaction *const tx) override {
2707 DBUG_ASSERT(tx != nullptr);
2708
2709 /* Calculate the duration the snapshot has existed */
2710 int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
2711 if (snapshot_timestamp != 0) {
2712 int64_t curr_time;
2713 rdb->GetEnv()->GetCurrentTime(&curr_time);
2714
2715 THD *thd = tx->get_thd();
2716 char buffer[1024];
2717 thd_security_context(thd, buffer, sizeof buffer, 0);
2718 m_data += format_string("---SNAPSHOT, ACTIVE %lld sec\n"
2719 "%s\n"
2720 "lock count %llu, write count %llu\n",
2721 curr_time - snapshot_timestamp, buffer,
2722 tx->get_lock_count(), tx->get_write_count());
2723 }
2724 }
2725 };
2726
2727 /**
2728 * @brief
2729 * walks through all non-replication transactions and copies
2730 * out relevant information for information_schema.rocksdb_trx
2731 */
2732 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
2733 private:
2734 std::vector<Rdb_trx_info> *m_trx_info;
2735
2736 public:
Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> * const trx_info)2737 explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
2738 : m_trx_info(trx_info) {}
2739
process_tran(const Rdb_transaction * const tx)2740 void process_tran(const Rdb_transaction *const tx) override {
2741 static const std::map<int, std::string> state_map = {
2742 {rocksdb::Transaction::STARTED, "STARTED"},
2743 {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
2744 {rocksdb::Transaction::PREPARED, "PREPARED"},
2745 {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
2746 {rocksdb::Transaction::COMMITED, "COMMITED"},
2747 {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
2748 {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
2749 };
2750
2751 DBUG_ASSERT(tx != nullptr);
2752
2753 THD *const thd = tx->get_thd();
2754 ulong thread_id = thd->thread_id;
2755
2756 if (tx->is_writebatch_trx()) {
2757 const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
2758 DBUG_ASSERT(wb_impl);
2759 m_trx_info->push_back(
2760 {"", /* name */
2761 0, /* trx_id */
2762 wb_impl->get_write_count(), 0, /* lock_count */
2763 0, /* timeout_sec */
2764 "", /* state */
2765 "", /* waiting_key */
2766 0, /* waiting_cf_id */
2767 1, /*is_replication */
2768 1, /* skip_trx_api */
2769 wb_impl->is_tx_read_only(), 0, /* deadlock detection */
2770 wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
2771 } else {
2772 const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
2773 DBUG_ASSERT(tx_impl);
2774 const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
2775
2776 if (rdb_trx == nullptr) {
2777 return;
2778 }
2779
2780 std::string query_str;
2781 LEX_STRING *const lex_str = thd_query_string(thd);
2782 if (lex_str != nullptr && lex_str->str != nullptr) {
2783 query_str = std::string(lex_str->str);
2784 }
2785
2786 const auto state_it = state_map.find(rdb_trx->GetState());
2787 DBUG_ASSERT(state_it != state_map.end());
2788 const int is_replication = (thd->rli_slave != nullptr);
2789 uint32_t waiting_cf_id;
2790 std::string waiting_key;
2791 rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key),
2792
2793 m_trx_info->push_back(
2794 {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
2795 tx_impl->get_lock_count(), tx_impl->get_timeout_sec(),
2796 state_it->second, waiting_key, waiting_cf_id, is_replication,
2797 0, /* skip_trx_api */
2798 tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
2799 tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
2800 }
2801 }
2802 };
2803
2804 /*
2805 returns a vector of info for all non-replication threads
2806 for use by information_schema.rocksdb_trx
2807 */
rdb_get_all_trx_info()2808 std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
2809 std::vector<Rdb_trx_info> trx_info;
2810 Rdb_trx_info_aggregator trx_info_agg(&trx_info);
2811 Rdb_transaction::walk_tx_list(&trx_info_agg);
2812 return trx_info;
2813 }
2814
2815 /* Generate the snapshot status table */
rocksdb_show_snapshot_status(handlerton * const hton,THD * const thd,stat_print_fn * const stat_print)2816 static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd,
2817 stat_print_fn *const stat_print) {
2818 Rdb_snapshot_status showStatus;
2819
2820 Rdb_transaction::walk_tx_list(&showStatus);
2821
2822 /* Send the result data back to MySQL */
2823 return print_stats(thd, "SNAPSHOTS", "rocksdb", showStatus.getResult(),
2824 stat_print);
2825 }
2826
2827 /*
2828 This is called for SHOW ENGINE ROCKSDB STATUS|LOGS|etc.
2829
2830 For now, produce info about live files (which gives an imprecise idea about
2831 what column families are there)
2832 */
2833
rocksdb_show_status(handlerton * const hton,THD * const thd,stat_print_fn * const stat_print,enum ha_stat_type stat_type)2834 static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
2835 stat_print_fn *const stat_print,
2836 enum ha_stat_type stat_type) {
2837 bool res = false;
2838 if (stat_type == HA_ENGINE_STATUS) {
2839 std::string str;
2840
2841 /* Per DB stats */
2842 if (rdb->GetProperty("rocksdb.dbstats", &str)) {
2843 res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
2844 }
2845
2846 /* Per column family stats */
2847 for (const auto &cf_name : cf_manager.get_cf_names()) {
2848 rocksdb::ColumnFamilyHandle *cfh;
2849 bool is_automatic;
2850
2851 /*
2852 Only the cf name is important. Whether it was generated automatically
2853 does not matter, so is_automatic is ignored.
2854 */
2855 cfh = cf_manager.get_cf(cf_name.c_str(), "", nullptr, &is_automatic);
2856 if (cfh == nullptr)
2857 continue;
2858
2859 if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str))
2860 continue;
2861
2862 res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
2863 }
2864
2865 /* Memory Statistics */
2866 std::vector<rocksdb::DB *> dbs;
2867 std::unordered_set<const rocksdb::Cache *> cache_set;
2868 size_t internal_cache_count = 0;
2869 size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
2870 char buf[100];
2871
2872 dbs.push_back(rdb);
2873 cache_set.insert(rocksdb_tbl_options.block_cache.get());
2874 for (const auto &cf_handle : cf_manager.get_all_cf()) {
2875 rocksdb::ColumnFamilyDescriptor cf_desc;
2876 cf_handle->GetDescriptor(&cf_desc);
2877 auto *const table_factory = cf_desc.options.table_factory.get();
2878 if (table_factory != nullptr) {
2879 std::string tf_name = table_factory->Name();
2880 if (tf_name.find("BlockBasedTable") != std::string::npos) {
2881 const rocksdb::BlockBasedTableOptions *const bbt_opt =
2882 reinterpret_cast<rocksdb::BlockBasedTableOptions *>(
2883 table_factory->GetOptions());
2884 if (bbt_opt != nullptr) {
2885 if (bbt_opt->block_cache.get() != nullptr) {
2886 cache_set.insert(bbt_opt->block_cache.get());
2887 } else {
2888 internal_cache_count++;
2889 }
2890 cache_set.insert(bbt_opt->block_cache_compressed.get());
2891 }
2892 }
2893 }
2894 }
2895
2896 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
2897 str.clear();
2898 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
2899 &temp_usage_by_type);
2900 snprintf(buf, sizeof(buf), "\nMemTable Total: %lu",
2901 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
2902 str.append(buf);
2903 snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %lu",
2904 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
2905 str.append(buf);
2906 snprintf(buf, sizeof(buf), "\nTable Readers Total: %lu",
2907 temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
2908 str.append(buf);
2909 snprintf(buf, sizeof(buf), "\nCache Total: %lu",
2910 temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
2911 str.append(buf);
2912 snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %lu",
2913 internal_cache_count * kDefaultInternalCacheSize);
2914 str.append(buf);
2915 res |= print_stats(thd, "Memory_Stats", "rocksdb", str, stat_print);
2916 }
2917
2918 return res;
2919 }
2920
rocksdb_register_tx(handlerton * const hton,THD * const thd,Rdb_transaction * const tx)2921 static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
2922 Rdb_transaction *const tx) {
2923 DBUG_ASSERT(tx != nullptr);
2924
2925 trans_register_ha(thd, FALSE, rocksdb_hton);
2926 if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
2927 tx->start_stmt();
2928 trans_register_ha(thd, TRUE, rocksdb_hton);
2929 }
2930 }
2931
2932 /*
2933 Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
2934
2935 - START TRANSACTION WITH CONSISTENT SNAPSHOT
2936 takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
2937 participate in transaction. When executing COMMIT, both InnoDB and
2938 RocksDB modifications are committed. Remember that XA is not supported yet,
2939 so mixing engines is not recommended anyway.
2940 */
rocksdb_start_tx_and_assign_read_view(handlerton * const hton,THD * const thd)2941 static int rocksdb_start_tx_and_assign_read_view(
2942 handlerton *const hton, /*!< in: RocksDB handlerton */
2943 THD *const thd) /*!< in: MySQL thread handle of the
2944 user for whom the transaction should
2945 be committed */
2946 {
2947 Rdb_perf_context_guard guard(thd);
2948
2949 ulong const tx_isolation = my_core::thd_tx_isolation(thd);
2950
2951 Rdb_transaction* tx= get_or_create_tx(thd);
2952 DBUG_ASSERT(!tx->has_snapshot());
2953 tx->set_tx_read_only(true);
2954 rocksdb_register_tx(hton, thd, tx);
2955
2956 if (tx_isolation == ISO_REPEATABLE_READ) {
2957 tx->acquire_snapshot(true);
2958 } else {
2959 push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
2960 "Only REPEATABLE READ isolation level is supported "
2961 "for START TRANSACTION WITH CONSISTENT SNAPSHOT "
2962 "in RocksDB Storage Engine. Snapshot has not been "
2963 "taken.");
2964 } return HA_EXIT_SUCCESS;
2965 }
2966
2967 /* Dummy SAVEPOINT support. This is needed for long running transactions
2968 * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
2969 * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
2970 * errors. This needs to be addressed in future versions (Issue#96).
2971 */
rocksdb_savepoint(handlerton * const hton,THD * const thd,void * const savepoint)2972 static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
2973 void *const savepoint) {
2974 return HA_EXIT_SUCCESS;
2975 }
2976
rocksdb_rollback_to_savepoint(handlerton * const hton,THD * const thd,void * const savepoint)2977 static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
2978 void *const savepoint) {
2979 Rdb_transaction *&tx = get_tx_from_thd(thd);
2980 return tx->rollback_to_savepoint(savepoint);
2981 }
2982
2983 static bool
rocksdb_rollback_to_savepoint_can_release_mdl(handlerton * const hton,THD * const thd)2984 rocksdb_rollback_to_savepoint_can_release_mdl(handlerton *const hton,
2985 THD *const thd) {
2986 return true;
2987 }
2988
/*
  Verify that the options we are about to open the database with are
  compatible with the options persisted by the previous run, to avoid
  data corruption from an incompatible configuration change.
*/
static rocksdb::Status check_rocksdb_options_compatibility(
    const char *const dbpath, const rocksdb::Options &main_opts,
    const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
  DBUG_ASSERT(rocksdb_datadir != nullptr);

  // Load whatever options the previous run persisted, if any.
  rocksdb::DBOptions loaded_db_opt;
  std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
  rocksdb::Status status = LoadLatestOptions(dbpath, rocksdb::Env::Default(),
                                             &loaded_db_opt, &loaded_cf_descs);

  // A missing options file means we're starting from scratch; there is
  // nothing to compare against, so this is a valid (compatible) case.
  if (status.IsNotFound()) {
    return rocksdb::Status::OK();
  }

  if (!status.ok()) {
    return status;
  }

  if (loaded_cf_descs.size() != cf_descr.size()) {
    return rocksdb::Status::NotSupported("Mismatched size of column family "
                                         "descriptors.");
  }

  // Please see RocksDB documentation for more context about why we need to
  // set user-defined functions and pointer-typed options manually: they are
  // not serialized, so copy them in from the current configuration.
  for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
    auto &loaded = loaded_cf_descs[i].options;
    const auto &current = cf_descr[i].options;

    loaded.compaction_filter = current.compaction_filter;
    loaded.compaction_filter_factory = current.compaction_filter_factory;
    loaded.comparator = current.comparator;
    loaded.memtable_factory = current.memtable_factory;
    loaded.merge_operator = current.merge_operator;
    loaded.prefix_extractor = current.prefix_extractor;
    loaded.table_factory = current.table_factory;
  }

  // The essence of the function: determine whether it is safe to open the
  // database with main_opts.
  return CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
                                   loaded_cf_descs);
}
3042
3043 /*
3044 Storage Engine initialization function, invoked when plugin is loaded.
3045 */
3046
/*
  Plugin initialization entry point.  Runs, in order:
    1. PSI keys, mutexes, background-thread handles, open-table hash.
    2. Handlerton callback registration.
    3. Global DB/table/CF option construction and validation.
    4. Opening the TransactionDB and the data-dictionary/DDL managers.
    5. Re-enabling compaction and starting the background threads.
  The ordering matters: e.g. compaction is kept disabled until the
  compaction filter's dependencies (dict/ddl managers) are initialized.
  Returns HA_EXIT_SUCCESS or HA_EXIT_FAILURE.
*/
static int rocksdb_init_func(void *const p) {
  DBUG_ENTER_FUNC();

  // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
  static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");

#ifdef HAVE_PSI_INTERFACE
  init_rocksdb_psi_keys();
#endif

  /* Phase 1: mutexes, thread handles and in-memory structures. */
  rocksdb_hton = (handlerton *)p;
  mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &rdb_open_tables.m_mutex,
                   MY_MUTEX_INIT_FAST);
#ifdef HAVE_PSI_INTERFACE
  rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
  rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
                           rdb_signal_drop_idx_psi_cond_key);
#else
  rdb_bg_thread.init();
  rdb_drop_idx_thread.init();
#endif
  mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
                   MY_MUTEX_INIT_FAST);
  mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
                   MY_MUTEX_INIT_FAST);

#if defined(HAVE_PSI_INTERFACE)
  rdb_collation_exceptions = new Regex(key_rwlock_collation_exception_list);
#else
  rdb_collation_exceptions = new Regex();
#endif

  mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
                   MY_MUTEX_INIT_FAST);
  rdb_open_tables.init_hash();
  Rdb_transaction::init_mutex();

  /* Phase 2: register the handlerton callbacks with the server. */
  rocksdb_hton->state = SHOW_OPTION_YES;
  rocksdb_hton->create = rocksdb_create_handler;
  rocksdb_hton->close_connection = rocksdb_close_connection;
  rocksdb_hton->prepare = rocksdb_prepare;
  rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
  rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
  rocksdb_hton->recover = rocksdb_recover;
  rocksdb_hton->commit = rocksdb_commit;
  rocksdb_hton->rollback = rocksdb_rollback;
  rocksdb_hton->db_type = DB_TYPE_ROCKSDB;
  rocksdb_hton->show_status = rocksdb_show_status;
  rocksdb_hton->start_consistent_snapshot =
      rocksdb_start_tx_and_assign_read_view;
  rocksdb_hton->savepoint_set = rocksdb_savepoint;
  rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
  rocksdb_hton->savepoint_rollback_can_release_mdl =
      rocksdb_rollback_to_savepoint_can_release_mdl;
  rocksdb_hton->flush_logs = rocksdb_flush_wal;

  rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
                        HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;

  DBUG_ASSERT(!mysqld_embedded);

  /* Phase 3: build the global DB options. */
  rocksdb_stats = rocksdb::CreateDBStatistics();
  rocksdb_db_options.statistics = rocksdb_stats;

  if (rocksdb_rate_limiter_bytes_per_sec != 0) {
    rocksdb_rate_limiter.reset(
        rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
    rocksdb_db_options.rate_limiter = rocksdb_rate_limiter;
  }

  // Route RocksDB's internal logging through our own logger; if RocksDB
  // managed to create its own logger, chain it behind ours.
  std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
  rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
      rocksdb_datadir, rocksdb_db_options, &rocksdb_db_options.info_log);
  if (s.ok()) {
    myrocks_logger->SetRocksDBLogger(rocksdb_db_options.info_log);
  }

  rocksdb_db_options.info_log = myrocks_logger;
  myrocks_logger->SetInfoLogLevel(
      static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  rocksdb_db_options.wal_dir = rocksdb_wal_dir;

  rocksdb_db_options.wal_recovery_mode =
      static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);

  rocksdb_db_options.access_hint_on_compaction_start =
      static_cast<rocksdb::Options::AccessHint>(
          rocksdb_access_hint_on_compaction_start);

  if (rocksdb_db_options.allow_mmap_reads &&
      rocksdb_db_options.use_direct_reads) {
    // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
    // mmap_reads and direct_reads are both on.   (NO_LINT_DEBUG)
    sql_print_error("RocksDB: Can't enable both use_direct_reads "
                    "and allow_mmap_reads\n");
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (rocksdb_db_options.allow_mmap_writes &&
      rocksdb_db_options.use_direct_writes) {
    // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
    sql_print_error("RocksDB: Can't enable both use_direct_writes "
                    "and allow_mmap_writes\n");
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Discover which column families already exist in the datadir.
  std::vector<std::string> cf_names;
  rocksdb::Status status;
  status = rocksdb::DB::ListColumnFamilies(rocksdb_db_options, rocksdb_datadir,
                                           &cf_names);
  if (!status.ok()) {
    /*
      When we start on an empty datadir, ListColumnFamilies returns IOError,
      and RocksDB doesn't provide any way to check what kind of error it was.
      Checking system errno happens to work right now.
    */
    if (status.IsIOError() && errno == ENOENT) {
      sql_print_information("RocksDB: Got ENOENT when listing column families");
      sql_print_information(
          "RocksDB:   assuming that we're creating a new database");
    } else {
      std::string err_text = status.ToString();
      sql_print_error("RocksDB: Error listing column families: %s",
                      err_text.c_str());
      rdb_open_tables.free_hash();
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  } else
    // NOTE(review): cf_names.size() is size_t but is printed with %ld;
    // harmless on LP64, technically mismatched elsewhere.
    sql_print_information("RocksDB: %ld column families found",
                          cf_names.size());

  std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
  std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;

  rocksdb_tbl_options.index_type =
      (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;

  if (!rocksdb_tbl_options.no_block_cache) {
    rocksdb_tbl_options.block_cache =
        rocksdb::NewLRUCache(rocksdb_block_cache_size);
  }
  // Using newer BlockBasedTable format version for better compression
  // and better memory allocation.
  // See:
  // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
  rocksdb_tbl_options.format_version = 2;

  if (rocksdb_collect_sst_properties) {
    properties_collector_factory =
        std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);

    rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);

    mysql_mutex_lock(&rdb_sysvars_mutex);

    DBUG_ASSERT(rocksdb_table_stats_sampling_pct <=
                RDB_TBL_STATS_SAMPLE_PCT_MAX);
    properties_collector_factory->SetTableStatsSamplingPct(
        rocksdb_table_stats_sampling_pct);

    mysql_mutex_unlock(&rdb_sysvars_mutex);
  }

  if (rocksdb_persistent_cache_size > 0) {
    std::shared_ptr<rocksdb::PersistentCache> pcache;
    rocksdb::NewPersistentCache(
        rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
        rocksdb_persistent_cache_size, myrocks_logger, true, &pcache);
    rocksdb_tbl_options.persistent_cache = pcache;
  } else if (strlen(rocksdb_persistent_cache_path)) {
    // A cache path without a cache size is a configuration error.
    // NOTE(review): returns 1 rather than HA_EXIT_FAILURE and skips
    // rdb_open_tables.free_hash(), unlike the other error paths.
    sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size");
    DBUG_RETURN(1);
  }

  if (!rocksdb_cf_options_map.init(
          rocksdb_tbl_options, properties_collector_factory,
          rocksdb_default_cf_options, rocksdb_override_cf_options)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize CF options map.");
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  /*
    If there are no column families, we're creating the new database.
    Create one column family named "default".
  */
  if (cf_names.size() == 0)
    cf_names.push_back(DEFAULT_CF_NAME);

  std::vector<int> compaction_enabled_cf_indices;
  sql_print_information("RocksDB: Column Families at start:");
  for (size_t i = 0; i < cf_names.size(); ++i) {
    rocksdb::ColumnFamilyOptions opts;
    rocksdb_cf_options_map.get_cf_options(cf_names[i], &opts);

    sql_print_information("  cf=%s", cf_names[i].c_str());
    sql_print_information("    write_buffer_size=%ld", opts.write_buffer_size);
    sql_print_information("    target_file_size_base=%" PRIu64,
                          opts.target_file_size_base);

    /*
      Temporarily disable compactions to prevent a race condition where
      compaction starts before compaction filter is ready.
    */
    if (!opts.disable_auto_compactions) {
      compaction_enabled_cf_indices.push_back(i);
      opts.disable_auto_compactions = true;
    }
    cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
  }

  rocksdb::Options main_opts(rocksdb_db_options,
                             rocksdb_cf_options_map.get_defaults());

  // Size the RocksDB env thread pools after the configured flush/compaction
  // parallelism.
  main_opts.env->SetBackgroundThreads(main_opts.max_background_flushes,
                                      rocksdb::Env::Priority::HIGH);
  main_opts.env->SetBackgroundThreads(main_opts.max_background_compactions,
                                      rocksdb::Env::Priority::LOW);
  rocksdb::TransactionDBOptions tx_db_options;
  tx_db_options.transaction_lock_timeout = 2; // 2 seconds
  tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();

  status =
      check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);

  // We won't start if we'll determine that there's a chance of data corruption
  // because of incompatible options.
  if (!status.ok()) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: compatibility check against existing database "
                    "options failed. %s",
                    status.ToString().c_str());
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  /* Phase 4: open the database and initialize the dictionary managers. */
  status = rocksdb::TransactionDB::Open(
      main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);

  if (!status.ok()) {
    std::string err_text = status.ToString();
    sql_print_error("RocksDB: Error opening instance: %s", err_text.c_str());
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }
  cf_manager.init(&rocksdb_cf_options_map, &cf_handles);

  if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize data dictionary.");
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize DDL manager.");
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Ensure every column family has a flags entry in the data dictionary,
  // defaulting to 0 for CFs that don't have one yet.
  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    uint flags;
    if (!dict_manager.get_cf_flags(cf_handle->GetID(), &flags)) {
      const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
      rocksdb::WriteBatch *const batch = wb.get();
      dict_manager.add_cf_flags(batch, cf_handle->GetID(), 0);
      dict_manager.commit(batch);
    }
  }


  Rdb_sst_info::init(rdb);

  /*
    Phase 5: enable auto compaction, things needed for compaction filter are
    finished initializing
  */
  std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
  for (const auto &index : compaction_enabled_cf_indices) {
    compaction_enabled_cf_handles.push_back(cf_handles[index]);
  }

  status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);

  if (!status.ok()) {
    const std::string err_text = status.ToString();
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Error enabling compaction: %s", err_text.c_str());
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Start the statistics/background-maintenance thread.
  auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME
#ifdef HAVE_PSI_INTERFACE
                                         ,
                                         rdb_background_psi_thread_key
#endif
                                         );
  if (err != 0) {
    sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
                    err);
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Start the thread that physically removes data of dropped indexes.
  err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME
#ifdef HAVE_PSI_INTERFACE
                                          ,
                                          rdb_drop_idx_psi_thread_key
#endif
                                          );
  if (err != 0) {
    sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
                    err);
    rdb_open_tables.free_hash();
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);

  if (rocksdb_pause_background_work) {
    rdb->PauseBackgroundWork();
  }

  sql_print_information("RocksDB instance opened");
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
3379
3380 /*
3381 Storage Engine deinitialization function, invoked when plugin is unloaded.
3382 */
3383
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  int error = 0;

  // Signal the drop-index thread to stop.
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables so we do not lose data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
  }

  if (rdb_open_tables.m_hash.records) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind. Report it via the return value but keep tearing down.
    error = 1;
  }

  rdb_open_tables.free_hash();
  mysql_mutex_destroy(&rdb_open_tables.m_mutex);
  mysql_mutex_destroy(&rdb_sysvars_mutex);

  delete rdb_collation_exceptions;
  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  // Free the per-collation decoding data caches.
  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  // Tear down the metadata managers; the DB handle must be deleted last.
  ddl_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  // Disown the cache data since we're shutting down.
  // This results in memory leaks but it improved the shutdown time.
  // Don't disown when running under valgrind
#ifndef HAVE_purify
  if (rocksdb_tbl_options.block_cache) {
    rocksdb_tbl_options.block_cache->DisownData();
  }
#endif /* HAVE_purify */

  DBUG_RETURN(error);
}
3461
3462 /**
3463 @brief
3464 Example of simple lock controls. The "table_handler" it creates is a
3465 structure we will pass to each ha_rocksdb handler. Do you have to have
3466 one of these? Well, you have pieces that are used for locking, and
3467 they are needed to function.
3468 */
3469
Rdb_table_handler *
Rdb_open_tables_map::get_table_handler(const char *const table_name) {
  Rdb_table_handler *table_handler;
  uint length;
  char *tmp_name;

  DBUG_ASSERT(table_name != nullptr);
  length = (uint)strlen(table_name);

  // First, look up the table in the hash map.
  mysql_mutex_lock(&m_mutex);
  if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_hash_search(
            &m_hash, reinterpret_cast<const uchar *>(table_name), length)))) {
    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    // my_multi_malloc carves the handler struct and the name copy out of a
    // single allocation, so one my_free() later releases both.
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
              MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
              &tmp_name, length + 1, NullS)))) {
      // Allocating a new Rdb_table_handler and a new table name failed.
      mysql_mutex_unlock(&m_mutex);
      return nullptr;
    }

    table_handler->m_ref_count = 0;
    table_handler->m_table_name_length = length;
    table_handler->m_table_name = tmp_name;
    strmov(table_handler->m_table_name, table_name);

    if (my_hash_insert(&m_hash, reinterpret_cast<uchar *>(table_handler))) {
      // Inserting into the hash map failed.
      mysql_mutex_unlock(&m_mutex);
      my_free(table_handler);
      return nullptr;
    }

    thr_lock_init(&table_handler->m_thr_lock);
    table_handler->m_io_perf_read.init();
  }
  // Take a reference for the caller; released in release_table_handler().
  DBUG_ASSERT(table_handler->m_ref_count >= 0);
  table_handler->m_ref_count++;

  mysql_mutex_unlock(&m_mutex);

  return table_handler;
}
3515
/* Return the names of all tables MyRocks currently tracks as open. */
std::vector<std::string> rdb_get_open_table_names(void) {
  return rdb_open_tables.get_table_names();
}
3519
get_table_names(void) const3520 std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
3521 ulong i;
3522 const Rdb_table_handler *table_handler;
3523 std::vector<std::string> names;
3524
3525 mysql_mutex_lock(&m_mutex);
3526 for (i = 0; (table_handler = reinterpret_cast<const Rdb_table_handler *>(
3527 my_hash_const_element(&m_hash, i)));
3528 i++) {
3529 DBUG_ASSERT(table_handler != nullptr);
3530 names.push_back(table_handler->m_table_name);
3531 }
3532 DBUG_ASSERT(i == m_hash.records);
3533 mysql_mutex_unlock(&m_mutex);
3534
3535 return names;
3536 }
3537
/*
  Initialize the table's auto-increment counter from the largest value
  currently stored, using an index-only scan to the last row of the
  auto-increment index.
*/
void ha_rocksdb::load_auto_incr_value() {
  // Save and temporarily redirect the handler's scan state.
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;

  /*
    load_auto_incr_value() may be called by statements that
    do not execute implicit commits (i.e. SHOW CREATE TABLE).
    index_last() creates a snapshot. When a snapshot is created
    here, it has to be released as well. (GitHub issue#189)
  */
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is
  // not set appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;

  // Only bump the counter if the table actually has a last row.
  if (!index_last(table->record[0]))
    update_auto_incr_val();

  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  // Restore the handler's scan state.
  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();
}
3576
/* Read the auto_increment column's value from table->record[0] and raise the
   table's auto-increment counter up to it. */
3578 /*
3579 TODO(alexyang): No existing support for auto_increment on non-pk columns, see
3580 end of ha_rocksdb::create. Also see opened issue here:
3581 https://github.com/facebook/mysql-5.6/issues/153
3582 */
update_auto_incr_val()3583 void ha_rocksdb::update_auto_incr_val() {
3584 Field *field;
3585 longlong new_val;
3586 field = table->key_info[table->s->next_number_index].key_part[0].field;
3587
3588 my_bitmap_map *const old_map =
3589 dbug_tmp_use_all_columns(table, table->read_set);
3590 new_val = field->val_int() + 1;
3591 dbug_tmp_restore_column_map(table->read_set, old_map);
3592
3593 longlong auto_incr_val = m_tbl_def->m_auto_incr_val;
3594 while (auto_incr_val < new_val &&
3595 !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val,
3596 new_val)) {
3597 // Do nothing - just loop until auto_incr_val is >= new_val or
3598 // we successfully set it
3599 }
3600 }
3601
/*
  Initialize the table's hidden-PK counter from the largest hidden-PK id
  currently stored (largest key + 1).

  @return HA_EXIT_SUCCESS, or HA_ERR_INTERNAL_ERROR if the last key cannot
          be decoded.
*/
int ha_rocksdb::load_hidden_pk_value() {
  const int save_active_index = active_index;
  // The hidden PK is always the last key in the table definition.
  active_index = m_tbl_def->m_key_count - 1;
  const uint8 save_table_status = table->status;

  // index_last() may create a snapshot; if it was created here it must be
  // released before returning (same reasoning as load_auto_incr_value()).
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();

  // Do a lookup.
  if (!index_last(table->record[0])) {
    /*
      Decode PK field from the key
    */
    longlong hidden_pk_id = 0;
    if (read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
      if (is_new_snapshot) {
        tx->release_snapshot();
      }
      return HA_ERR_INTERNAL_ERROR;
    }

    hidden_pk_id++;
    // Raise the counter to hidden_pk_id, racing safely against concurrent
    // bumps via CAS.
    longlong old = m_tbl_def->m_hidden_pk_val;
    while (
        old < hidden_pk_id &&
        !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
    }
  }

  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  release_scan_iterator();

  return HA_EXIT_SUCCESS;
}
3642
/* Allocate and return the next hidden-PK id from m_tbl_def->m_hidden_pk_val. */
update_hidden_pk_val()3644 longlong ha_rocksdb::update_hidden_pk_val() {
3645 DBUG_ASSERT(has_hidden_pk(table));
3646 const longlong new_val = m_tbl_def->m_hidden_pk_val++;
3647 return new_val;
3648 }
3649
3650 /* Get the id of the hidden pk id from m_last_rowkey */
read_hidden_pk_id_from_rowkey(longlong * const hidden_pk_id)3651 int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
3652 DBUG_ASSERT(hidden_pk_id != nullptr);
3653 DBUG_ASSERT(table != nullptr);
3654 DBUG_ASSERT(has_hidden_pk(table));
3655
3656 rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
3657
3658 // Get hidden primary key from old key slice
3659 Rdb_string_reader reader(&rowkey_slice);
3660 if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE)))
3661 return HA_EXIT_FAILURE;
3662
3663 const int length = Field_longlong::PACK_LENGTH;
3664 const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
3665 if (from == nullptr) {
3666 return HA_EXIT_FAILURE; /* Mem-comparable image doesn't have enough bytes */
3667 }
3668
3669 *hidden_pk_id = rdb_netbuf_read_uint64(&from);
3670 return HA_EXIT_SUCCESS;
3671 }
3672
3673 /**
3674 @brief
3675 Free lock controls. We call this whenever we close a table. If the table had
3676 the last reference to the table_handler, then we free the memory associated
3677 with it.
3678 */
3679
void Rdb_open_tables_map::release_table_handler(
    Rdb_table_handler *const table_handler) {
  mysql_mutex_lock(&m_mutex);

  DBUG_ASSERT(table_handler != nullptr);
  DBUG_ASSERT(table_handler->m_ref_count > 0);
  if (!--table_handler->m_ref_count) {
    // Last reference was released. Tear down the hash entry.
    const auto ret __attribute__((__unused__)) =
        my_hash_delete(&m_hash, reinterpret_cast<uchar *>(table_handler));
    DBUG_ASSERT(!ret); // the hash entry must actually be found and deleted
    my_core::thr_lock_delete(&table_handler->m_thr_lock);
    // The name string was allocated in the same my_multi_malloc block as the
    // handler, so this single my_free releases both.
    my_free(table_handler);
  }

  mysql_mutex_unlock(&m_mutex);
}
3697
/* Handlerton "create" callback: allocate a new ha_rocksdb on mem_root. */
static handler *rocksdb_create_handler(my_core::handlerton *const hton,
                                       my_core::TABLE_SHARE *const table_arg,
                                       my_core::MEM_ROOT *const mem_root) {
  return new (mem_root) ha_rocksdb(hton, table_arg);
}
3703
/*
  Construct a handler instance. All key/row buffer pointers start out null;
  they are allocated later when the table is opened (see alloc_key_buffers()
  and setup_field_converters()).
*/
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr),
      m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr),
      m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false), m_maybe_unpack_info(false),
      m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr), m_pack_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE),
      m_bulk_load_tx(nullptr), m_encoder_arr(nullptr),
      m_row_checksums_checked(0), m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false) {
  // TODO(alexyang): create a valid PSI_mutex_key for this mutex
  mysql_mutex_init(0, &m_bulk_load_mutex, MY_MUTEX_INIT_FAST);
}
3722
/* MyRocks stores no per-table files, so the extension list is empty. */
static const char *ha_rocksdb_exts[] = {NullS};

const char **ha_rocksdb::bas_ext() const {
  DBUG_ENTER_FUNC();

  DBUG_RETURN(ha_rocksdb_exts);
}
3730
/* Two handlers refer to the same table iff their base table names match. */
bool ha_rocksdb::same_table(const ha_rocksdb &other) const {
  return m_tbl_def->base_tablename() == other.m_tbl_def->base_tablename();
}
3734
init_with_fields()3735 bool ha_rocksdb::init_with_fields() {
3736 DBUG_ENTER_FUNC();
3737
3738 const uint pk = table_share->primary_key;
3739 if (pk != MAX_KEY) {
3740 const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
3741 check_keyread_allowed(pk /*PK*/, key_parts - 1, true);
3742 } else
3743 m_pk_can_be_decoded = false;
3744
3745 cached_table_flags = table_flags();
3746
3747 DBUG_RETURN(false); /* Ok */
3748 }
3749
3750 /**
3751 Convert record from table->record[0] form into a form that can be written
3752 into rocksdb.
3753
3754 @param pk_packed_slice Packed PK tuple. We need it in order to compute
3755 and store its CRC.
3756 @param packed_rec OUT Data slice with record data.
3757 */
3758
void ha_rocksdb::convert_record_to_storage_format(
    const rocksdb::Slice &pk_packed_slice,
    Rdb_string_writer *const pk_unpack_info, rocksdb::Slice *const packed_rec) {
  m_storage_record.length(0);

  /* All NULL bits are initially 0 */
  m_storage_record.fill(m_null_bytes_in_rec, 0);

  // If a primary key may have non-empty unpack_info for certain values,
  // (m_maybe_unpack_info=TRUE), we write the unpack_info block. The block
  // itself was prepared in Rdb_key_def::pack_record.
  if (m_maybe_unpack_info) {
    m_storage_record.append(reinterpret_cast<char *>(pk_unpack_info->ptr()),
                            pk_unpack_info->get_current_pos());
  }

  for (uint i = 0; i < table->s->fields; i++) {
    /* Don't pack decodable PK key parts */
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
      continue;
    }

    Field *const field = table->field[i];
    if (m_encoder_arr[i].maybe_null()) {
      char *const data = (char *)m_storage_record.ptr();
      if (field->is_null()) {
        // Set this field's bit in the null-bytes header.
        data[m_encoder_arr[i].m_null_offset] |= m_encoder_arr[i].m_null_mask;
        /* Don't write anything for NULL values */
        continue;
      }
    }

    if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_BLOB) {
      my_core::Field_blob *blob = (my_core::Field_blob *)field;
      /* Get the number of bytes needed to store length*/
      const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;

      /* Store the length of the value */
      m_storage_record.append(reinterpret_cast<char *>(blob->ptr),
                              length_bytes);

      /* Store the blob value itself */
      char *data_ptr;
      memcpy(&data_ptr, blob->ptr + length_bytes, sizeof(uchar **));
      m_storage_record.append(data_ptr, blob->get_length());
    } else if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_VARCHAR) {
      Field_varstring *const field_var = (Field_varstring *)field;
      uint data_len;
      /* field_var->length_bytes is 1 or 2 */
      if (field_var->length_bytes == 1) {
        data_len = field_var->ptr[0];
      } else {
        DBUG_ASSERT(field_var->length_bytes == 2);
        data_len = uint2korr(field_var->ptr);
      }
      // Store the length byte(s) followed by the actual data.
      m_storage_record.append(reinterpret_cast<char *>(field_var->ptr),
                              field_var->length_bytes + data_len);
    } else {
      /* Copy the field data */
      const uint len = field->pack_length_in_rec();
      m_storage_record.append(reinterpret_cast<char *>(field->ptr), len);
    }
  }

  if (should_store_row_debug_checksums()) {
    // Optionally append a debug checksum chunk: a tag byte followed by the
    // CRC32 of the key and the CRC32 of the value written so far.
    const uint32_t key_crc32 = my_core::crc32(
        0, rdb_slice_to_uchar_ptr(&pk_packed_slice), pk_packed_slice.size());
    const uint32_t val_crc32 =
        my_core::crc32(0, rdb_mysql_str_to_uchar_str(&m_storage_record),
                       m_storage_record.length());
    uchar key_crc_buf[RDB_CHECKSUM_SIZE];
    uchar val_crc_buf[RDB_CHECKSUM_SIZE];
    rdb_netbuf_store_uint32(key_crc_buf, key_crc32);
    rdb_netbuf_store_uint32(val_crc_buf, val_crc32);
    m_storage_record.append((const char *)&RDB_CHECKSUM_DATA_TAG, 1);
    m_storage_record.append((const char *)key_crc_buf, RDB_CHECKSUM_SIZE);
    m_storage_record.append((const char *)val_crc_buf, RDB_CHECKSUM_SIZE);
  }

  *packed_rec =
      rocksdb::Slice(m_storage_record.ptr(), m_storage_record.length());
}
3841
3842 /*
3843 @brief
3844 Setup which fields will be unpacked when reading rows
3845
3846 @detail
3847 Two special cases when we still unpack all fields:
3848 - When this table is being updated (m_lock_rows==RDB_LOCK_WRITE).
3849 - When @@rocksdb_verify_row_debug_checksums is ON (In this mode, we need to
3850 read all
3851 fields to find whether there is a row checksum at the end. We could skip
3852 the fields instead of decoding them, but currently we do decoding.)
3853
3854 @seealso
3855 ha_rocksdb::setup_field_converters()
3856 ha_rocksdb::convert_record_from_storage_format()
3857 */
void ha_rocksdb::setup_read_decoders() {
  m_decoders_vect.clear();

  // One past the index of the last decoder that actually decodes a value;
  // trailing skip-only decoders are trimmed at the end.
  int last_useful = 0;
  // Accumulated byte count of fixed-width fields that can be skipped in one
  // read; attached to the next decoder that is pushed.
  int skip_size = 0;

  for (uint i = 0; i < table->s->fields; i++) {
    // We only need the decoder if the whole record is stored.
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
      continue;
    }

    if (m_lock_rows == RDB_LOCK_WRITE || m_verify_row_debug_checksums ||
        bitmap_is_set(table->read_set, table->field[i]->field_index)) {
      // We will need to decode this field
      m_decoders_vect.push_back({&m_encoder_arr[i], true, skip_size});
      last_useful = m_decoders_vect.size();
      skip_size = 0;
    } else {
      if (m_encoder_arr[i].uses_variable_len_encoding() ||
          m_encoder_arr[i].maybe_null()) {
        // For variable-length field, we need to read the data and skip it
        m_decoders_vect.push_back({&m_encoder_arr[i], false, skip_size});
        skip_size = 0;
      } else {
        // Fixed-width field can be skipped without looking at it.
        // Add appropriate skip_size to the next field.
        skip_size += m_encoder_arr[i].m_pack_length_in_rec;
      }
    }
  }

  // It could be that the last few elements are varchars that just do
  // skipping. Remove them.
  m_decoders_vect.erase(m_decoders_vect.begin() + last_useful,
                        m_decoders_vect.end());
}
3895
3896 #ifndef NDEBUG
/* Debug helper: corrupt a record image by appending trailing garbage. */
void dbug_append_garbage_at_end(std::string &on_disk_rec) {
  on_disk_rec += "abc";
}
3900
/* Debug helper: truncate a record image to zero length. */
void dbug_truncate_record(std::string &on_disk_rec) { on_disk_rec.clear(); }
3902
/* Debug helper: overwrite the record image with a NULL-bits byte followed by
   a valid VARCHAR(12) value. */
void dbug_modify_rec_varchar12(std::string &on_disk_rec) {
  std::string replacement;
  // NULL-bits byte (no fields are NULL).
  replacement.push_back('\0');
  // Length byte: 12 bytes of data follow.
  replacement.push_back('\x0C');
  // 11 visible characters plus the literal's trailing NUL = 12 bytes.
  replacement.append("123456789ab", 12);
  on_disk_rec = replacement;
}
3914
/* Debug helper: replace the key with its index-number prefix followed by the
   mem-comparable form of a varchar(8) value ("ABCDE" padded). */
void dbug_modify_key_varchar8(String &on_disk_rec) {
  std::string res;
  // The key starts with index number
  res.append(on_disk_rec.ptr(), Rdb_key_def::INDEX_NUMBER_SIZE);

  // Then, a mem-comparable form of a varchar(8) value.
  res.append("ABCDE\0\0\0\xFC", 9);
  on_disk_rec.length(0);
  on_disk_rec.append(res.data(), res.size());
}
3925
/* Debug helper: raise an intentional error to exercise inplace-alter failure
   handling. */
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred.", MYF(0));
}
3930 #endif
3931
/*
  Unpack this->m_retrieved_record into buf. Thin wrapper over the 3-argument
  overload; also hosts the debug hooks that corrupt the record image to
  exercise error paths.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(m_retrieved_record););

  const rocksdb::Slice retrieved_rec_slice(&m_retrieved_record.front(),
                                           m_retrieved_record.size());
  return convert_record_from_storage_format(key, &retrieved_rec_slice, buf);
}
3945
3946 /*
3947 @brief
3948 Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
3949 storage format into buf (which can be table->record[0] or table->record[1]).
3950
3951 @param key Table record's key in mem-comparable form.
3952 @param buf Store record in table->record[0] format here
3953
3954 @detail
3955 If the table has blobs, the unpacked data in buf may keep pointers to the
3956 data in this->m_retrieved_record.
3957
3958 The key is only needed to check its checksum value (the checksum is in
3959 m_retrieved_record).
3960
3961 @seealso
3962 ha_rocksdb::setup_read_decoders() Sets up data structures which tell which
3963 columns to decode.
3964
3965 @return
3966 0 OK
    other Error unpacking the data
3968 */
3969
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, const rocksdb::Slice *const value,
    uchar *const buf) {
  DBUG_ASSERT(key != nullptr);
  DBUG_ASSERT(buf != nullptr);

  Rdb_string_reader reader(value);
  // Offset of the target buffer relative to table->record[0]; Field objects
  // address record[0], so all writes are shifted by this amount.
  const my_ptrdiff_t ptr_diff = buf - table->record[0];

  /*
    Decode PK fields from the key
  */
  DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_read1",
                  dbug_modify_key_varchar8(m_last_rowkey););

  const rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(),
                                    m_last_rowkey.length());
  const char *unpack_info = nullptr;
  uint16 unpack_info_len = 0;
  rocksdb::Slice unpack_slice;

  /* Other fields are decoded from the value */
  const char *null_bytes = nullptr;
  if (m_null_bytes_in_rec && !(null_bytes = reader.read(m_null_bytes_in_rec))) {
    return HA_ERR_INTERNAL_ERROR;
  }

  if (m_maybe_unpack_info) {
    // Header: tag byte followed by a 2-byte total length.
    unpack_info = reader.read(RDB_UNPACK_HEADER_SIZE);

    if (!unpack_info || unpack_info[0] != RDB_UNPACK_DATA_TAG) {
      return HA_ERR_INTERNAL_ERROR;
    }

    unpack_info_len =
        rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(unpack_info + 1));
    unpack_slice = rocksdb::Slice(unpack_info, unpack_info_len);

    // Advance past the unpack_info payload (header was already consumed).
    reader.read(unpack_info_len - RDB_UNPACK_HEADER_SIZE);
  }

  if (m_pk_descr->unpack_record(table, buf, &rowkey_slice,
                                unpack_info ? &unpack_slice : nullptr,
                                false /* verify_checksum */)) {
    return HA_ERR_INTERNAL_ERROR;
  }

  // Decode (or skip) each non-PK field per the plan from setup_read_decoders.
  for (auto it = m_decoders_vect.begin(); it != m_decoders_vect.end(); it++) {
    const Rdb_field_encoder *const field_dec = it->m_field_enc;
    const bool decode = it->m_decode;
    const bool isNull =
        field_dec->maybe_null() &&
        ((null_bytes[field_dec->m_null_offset] & field_dec->m_null_mask) != 0);

    Field *const field = table->field[field_dec->m_field_index];

    /* Skip the bytes we need to skip */
    if (it->m_skip && !reader.read(it->m_skip))
      return HA_ERR_INTERNAL_ERROR;

    if (isNull) {
      if (decode) {
        /* This sets the NULL-bit of this record */
        field->set_null(ptr_diff);
        /*
          Besides that, set the field value to default value. CHECKSUM TABLE
          depends on this.
        */
        uint field_offset = field->ptr - table->record[0];
        memcpy(buf + field_offset, table->s->default_values + field_offset,
               field->pack_length());
      }
      // NULL fields have no bytes in the value; nothing to read or skip.
      continue;
    } else {
      if (decode)
        field->set_notnull(ptr_diff);
    }

    if (field_dec->m_field_type == MYSQL_TYPE_BLOB) {
      my_core::Field_blob *const blob = (my_core::Field_blob *)field;
      /* Get the number of bytes needed to store length*/
      const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;

      blob->move_field_offset(ptr_diff);

      const char *data_len_str;
      if (!(data_len_str = reader.read(length_bytes))) {
        // Undo the offset shift before bailing out.
        blob->move_field_offset(-ptr_diff);
        return HA_ERR_INTERNAL_ERROR;
      }

      memcpy(blob->ptr, data_len_str, length_bytes);

      const uint32 data_len = blob->get_length(
          (uchar *)data_len_str, length_bytes, table->s->db_low_byte_first);
      const char *blob_ptr;
      if (!(blob_ptr = reader.read(data_len))) {
        blob->move_field_offset(-ptr_diff);
        return HA_ERR_INTERNAL_ERROR;
      }

      if (decode) {
        // set 8-byte pointer to 0, like innodb does (relevant for 32-bit
        // platforms)
        memset(blob->ptr + length_bytes, 0, 8);
        memcpy(blob->ptr + length_bytes, &blob_ptr, sizeof(uchar **));
        blob->move_field_offset(-ptr_diff);
      }
    } else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR) {
      Field_varstring *const field_var = (Field_varstring *)field;
      const char *data_len_str;
      if (!(data_len_str = reader.read(field_var->length_bytes)))
        return HA_ERR_INTERNAL_ERROR;

      uint data_len;
      /* field_var->length_bytes is 1 or 2 */
      if (field_var->length_bytes == 1) {
        data_len = (uchar)data_len_str[0];
      } else {
        DBUG_ASSERT(field_var->length_bytes == 2);
        data_len = uint2korr(data_len_str);
      }
      if (data_len > field->field_length) {
        /* The data on disk is longer than table DDL allows? */
        return HA_ERR_INTERNAL_ERROR;
      }
      if (!reader.read(data_len))
        return HA_ERR_INTERNAL_ERROR;

      if (decode) {
        // Length byte(s) and data are contiguous on disk; copy both at once.
        memcpy(field_var->ptr + ptr_diff, data_len_str,
               field_var->length_bytes + data_len);
      }
    } else {
      const char *data_bytes;
      const uint len = field_dec->m_pack_length_in_rec;
      if (len > 0) {
        if ((data_bytes = reader.read(len)) == nullptr) {
          return HA_ERR_INTERNAL_ERROR;
        }
        if (decode)
          memcpy(field->ptr + ptr_diff, data_bytes, len);
      }
    }
  }

  if (m_verify_row_debug_checksums) {
    // A checksum chunk is present iff exactly RDB_CHECKSUM_CHUNK_SIZE bytes
    // remain and they begin with the checksum tag byte.
    if (reader.remaining_bytes() == RDB_CHECKSUM_CHUNK_SIZE &&
        reader.read(1)[0] == RDB_CHECKSUM_DATA_TAG) {
      uint32_t stored_key_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));
      uint32_t stored_val_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));

      const uint32_t computed_key_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size());
      const uint32_t computed_val_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(value),
                         value->size() - RDB_CHECKSUM_CHUNK_SIZE);

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1",
                      stored_key_chksum++;);

      if (stored_key_chksum != computed_key_chksum) {
        m_pk_descr->report_checksum_mismatch(true, key->data(), key->size());
        return HA_ERR_INTERNAL_ERROR;
      }

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum2",
                      stored_val_chksum++;);
      if (stored_val_chksum != computed_val_chksum) {
        m_pk_descr->report_checksum_mismatch(false, value->data(),
                                             value->size());
        return HA_ERR_INTERNAL_ERROR;
      }

      m_row_checksums_checked++;
    }
    // Any leftover bytes mean the record was not fully consumed: corruption.
    if (reader.remaining_bytes())
      return HA_ERR_INTERNAL_ERROR;
  }

  return HA_EXIT_SUCCESS;
}
4154
get_storage_type(Rdb_field_encoder * const encoder,const uint & kp)4155 void ha_rocksdb::get_storage_type(Rdb_field_encoder *const encoder,
4156 const uint &kp) {
4157 // STORE_SOME uses unpack_info.
4158 if (m_pk_descr->has_unpack_info(kp)) {
4159 DBUG_ASSERT(m_pk_descr->can_unpack(kp));
4160 encoder->m_storage_type = Rdb_field_encoder::STORE_SOME;
4161 m_maybe_unpack_info = true;
4162 } else if (m_pk_descr->can_unpack(kp)) {
4163 encoder->m_storage_type = Rdb_field_encoder::STORE_NONE;
4164 }
4165 }
4166
4167 /*
4168 Setup data needed to convert table->record[] to and from record storage
4169 format.
4170
4171 @seealso
4172 ha_rocksdb::convert_record_to_storage_format,
4173 ha_rocksdb::convert_record_from_storage_format
4174 */
4175
void ha_rocksdb::setup_field_converters() {
  uint i;
  uint null_bytes = 0;
  uchar cur_null_mask = 0x1;

  DBUG_ASSERT(m_encoder_arr == nullptr);
  m_encoder_arr = static_cast<Rdb_field_encoder *>(
      my_malloc(table->s->fields * sizeof(Rdb_field_encoder), MYF(0)));
  if (m_encoder_arr == nullptr) {
    // NOTE(review): OOM leaves m_encoder_arr nullptr with no error reported
    // here — presumably a caller checks for this; confirm.
    return;
  }

  for (i = 0; i < table->s->fields; i++) {
    Field *const field = table->field[i];
    m_encoder_arr[i].m_storage_type = Rdb_field_encoder::STORE_ALL;

    /*
      Check if this field is
      - a part of primary key, and
      - it can be decoded back from its key image.
      If both hold, we don't need to store this field in the value part of
      RocksDB's key-value pair.

      If hidden pk exists, we skip this check since the field will never be
      part of the hidden pk.
    */
    if (!has_hidden_pk(table) &&
        field->part_of_key.is_set(table->s->primary_key)) {
      KEY *const pk_info = &table->key_info[table->s->primary_key];
      for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
        /* key_part->fieldnr is counted from 1 */
        if (field->field_index + 1 == pk_info->key_part[kp].fieldnr) {
          get_storage_type(&m_encoder_arr[i], kp);
          break;
        }
      }
    }

    m_encoder_arr[i].m_field_type = field->real_type();
    m_encoder_arr[i].m_field_index = i;
    m_encoder_arr[i].m_pack_length_in_rec = field->pack_length_in_rec();

    // Assign a (byte offset, bit mask) pair to each nullable field; eight
    // nullable fields share one null byte.
    if (field->real_maybe_null()) {
      m_encoder_arr[i].m_null_mask = cur_null_mask;
      m_encoder_arr[i].m_null_offset = null_bytes;
      if (cur_null_mask == 0x80) {
        cur_null_mask = 0x1;
        null_bytes++;
      } else
        cur_null_mask = cur_null_mask << 1;
    } else {
      m_encoder_arr[i].m_null_mask = 0;
    }
  }

  /* Count the last, unfinished NULL-bits byte */
  if (cur_null_mask != 0x1)
    null_bytes++;

  m_null_bytes_in_rec = null_bytes;
}
4237
alloc_key_buffers(const TABLE * const table_arg,const Rdb_tbl_def * const tbl_def_arg,bool alloc_alter_buffers)4238 int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
4239 const Rdb_tbl_def *const tbl_def_arg,
4240 bool alloc_alter_buffers) {
4241 DBUG_ENTER_FUNC();
4242
4243 DBUG_ASSERT(m_pk_tuple == nullptr);
4244 DBUG_ASSERT(tbl_def_arg != nullptr);
4245
4246 std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;
4247
4248 uint key_len = 0;
4249 uint max_packed_sk_len = 0;
4250 uint pack_key_len = 0;
4251
4252 m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
4253 if (has_hidden_pk(table_arg)) {
4254 m_pk_key_parts = 1;
4255 } else {
4256 m_pk_key_parts =
4257 table->key_info[table->s->primary_key].user_defined_key_parts;
4258 key_len = table->key_info[table->s->primary_key].key_length;
4259 }
4260
4261 // move this into get_table_handler() ??
4262 m_pk_descr->setup(table_arg, tbl_def_arg);
4263
4264 m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0)));
4265 if (m_pk_tuple == nullptr) {
4266 goto error;
4267 }
4268
4269 pack_key_len = m_pk_descr->max_storage_fmt_length();
4270 m_pk_packed_tuple =
4271 reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0)));
4272 if (m_pk_packed_tuple == nullptr) {
4273 goto error;
4274 }
4275
4276 /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
4277 max_packed_sk_len = pack_key_len;
4278 for (uint i = 0; i < table_arg->s->keys; i++) {
4279 if (i == table_arg->s->primary_key) /* Primary key was processed above */
4280 continue;
4281
4282 // TODO: move this into get_table_handler() ??
4283 kd_arr[i]->setup(table_arg, tbl_def_arg);
4284
4285 const uint packed_len = kd_arr[i]->max_storage_fmt_length();
4286 if (packed_len > max_packed_sk_len) {
4287 max_packed_sk_len = packed_len;
4288 }
4289 }
4290
4291 if (!(m_sk_packed_tuple =
4292 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4293 !(m_sk_match_prefix_buf =
4294 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4295 !(m_sk_packed_tuple_old =
4296 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4297 !(m_end_key_packed_tuple =
4298 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4299 !((m_pack_buffer = reinterpret_cast<uchar *>(
4300 my_malloc(max_packed_sk_len, MYF(0)))))) {
4301 goto error;
4302 }
4303
4304 /*
4305 If inplace alter is happening, allocate special buffers for unique
4306 secondary index duplicate checking.
4307 */
4308 if (alloc_alter_buffers &&
4309 (!(m_dup_sk_packed_tuple =
4310 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)))) ||
4311 !(m_dup_sk_packed_tuple_old = reinterpret_cast<uchar *>(
4312 my_malloc(max_packed_sk_len, MYF(0)))))) {
4313 goto error;
4314 }
4315
4316 DBUG_RETURN(HA_EXIT_SUCCESS);
4317
4318 error:
4319 // If we're here then this means that at some point above an allocation may
4320 // have failed. To avoid any resource leaks and maintain a clear contract
4321 // we'll clean up before returning the error code.
4322 free_key_buffers();
4323
4324 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4325 }
4326
free_key_buffers()4327 void ha_rocksdb::free_key_buffers() {
4328 my_free(m_pk_tuple);
4329 m_pk_tuple = nullptr;
4330
4331 my_free(m_pk_packed_tuple);
4332 m_pk_packed_tuple = nullptr;
4333
4334 my_free(m_sk_packed_tuple);
4335 m_sk_packed_tuple = nullptr;
4336
4337 my_free(m_sk_match_prefix_buf);
4338 m_sk_match_prefix_buf = nullptr;
4339
4340 my_free(m_sk_packed_tuple_old);
4341 m_sk_packed_tuple_old = nullptr;
4342
4343 my_free(m_end_key_packed_tuple);
4344 m_end_key_packed_tuple = nullptr;
4345
4346 my_free(m_pack_buffer);
4347 m_pack_buffer = nullptr;
4348
4349 my_free(m_dup_sk_packed_tuple);
4350 m_dup_sk_packed_tuple = nullptr;
4351
4352 my_free(m_dup_sk_packed_tuple_old);
4353 m_dup_sk_packed_tuple_old = nullptr;
4354 }
4355
set_use_read_free_rpl(const char * const whitelist)4356 void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) {
4357 DBUG_ASSERT(whitelist != nullptr);
4358
4359 #if defined(HAVE_PSI_INTERFACE)
4360 Regex regex_handler(key_rwlock_read_free_rpl_tables);
4361 #else
4362 Regex regex_handler;
4363 #endif
4364
4365 int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
4366 if (lower_case_table_names)
4367 flags |= MY_REG_ICASE;
4368
4369 if (!regex_handler.compile(whitelist,
4370 flags,
4371 table_alias_charset)) {
4372 warn_about_bad_patterns(regex_handler, "read_free_rpl_tables");
4373 }
4374
4375 m_use_read_free_rpl = regex_handler.match(m_tbl_def->base_tablename());
4376 }
4377
/**
  Open a RocksDB table: locate its definition in the MyRocks data dictionary,
  allocate per-handler key buffers, and initialize cached per-table state.

  @param name            table path of the form "./db/table" (possibly with a
                         "#P#partition" suffix)
  @param mode            handler API argument; not used by this implementation
  @param test_if_locked  handler API argument; not used by this implementation

  @return HA_EXIT_SUCCESS on success, a handler error code otherwise
*/
int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
  DBUG_ENTER_FUNC();

  /* Release anything held from a previous open of this handler. */
  if (close()) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  m_table_handler = rdb_open_tables.get_table_handler(name);

  if (m_table_handler == nullptr) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
                              nullptr);
  m_io_perf.init(&m_table_handler->m_table_perf_context,
                 &m_table_handler->m_io_perf_read, &stats);

  /* Convert "./db/table" into "db.table" for the dictionary lookup. */
  std::string fullname;
  if (rdb_normalize_tablename(name, &fullname)) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  m_tbl_def = ddl_manager.find(fullname);
  if (m_tbl_def == nullptr) {
    my_error(ER_INTERNAL_ERROR, MYF(0),
             "Attempt to open a table that is not present in RocksDB-SE data "
             "dictionary");
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }
  m_lock_rows = RDB_LOCK_NONE;

  m_key_descr_arr = m_tbl_def->m_key_descr_arr;

  /*
    Full table scan actually uses primary key
    (UPDATE needs to know this, otherwise it will go into infinite loop on
    queries like "UPDATE tbl SET pk=pk+100")
  */
  key_used_on_scan = table->s->primary_key;

  // close() above has already called free_key_buffers(). No need to do it here.
  int err = alloc_key_buffers(table, m_tbl_def);

  if (err) {
    DBUG_RETURN(err);
  }

  /*
    init_with_fields() is used to initialize table flags based on the field
    definitions in table->field[].
    It is called by open_binary_frm(), but that function calls the method for
    a temporary ha_rocksdb object which is later destroyed.

    If we are here in ::open(), then init_with_fields() has not been called
    for this object. Call it ourselves, we want all member variables to be
    properly initialized.
  */
  init_with_fields();

  /* Build m_encoder_arr describing how each field is stored in a record. */
  setup_field_converters();

  info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

  /*
    The following load_XXX code calls row decode functions, and they do
    that without having done ::external_lock() or index_init()/rnd_init().
    (Note: this also means we're doing a read when there was no
    setup_field_converters() call)

    Initialize the necessary variables for them:
  */
  m_verify_row_debug_checksums = false;

  /* TODO: move the following to where TABLE_SHARE is opened: */
  if (table->found_next_number_field)
    load_auto_incr_value();

  /* A hidden PK needs its counter primed; failure here aborts the open. */
  if (has_hidden_pk(table) && load_hidden_pk_value()) {
    free_key_buffers();
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  /* Index block size in MyRocks: used by MySQL in query optimization */
  stats.block_size = rocksdb_tbl_options.block_size;

  /* Determine at open whether we can use Read Free Replication or not */
  set_use_read_free_rpl(THDVAR(ha_thd(), read_free_rpl_tables));

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
4469
close(void)4470 int ha_rocksdb::close(void) {
4471 DBUG_ENTER_FUNC();
4472
4473 m_pk_descr = nullptr;
4474 m_key_descr_arr = nullptr;
4475
4476 free_key_buffers();
4477
4478 my_free(m_encoder_arr);
4479 m_encoder_arr = nullptr;
4480
4481 if (m_table_handler != nullptr) {
4482 rdb_open_tables.release_table_handler(m_table_handler);
4483 m_table_handler = nullptr;
4484 }
4485
4486 DBUG_RETURN(HA_EXIT_SUCCESS);
4487 }
4488
get_error_message(const int error,String * const buf)4489 bool ha_rocksdb::get_error_message(const int error, String *const buf) {
4490 DBUG_ENTER_FUNC();
4491
4492 static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
4493 "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
4494 static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
4495 "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");
4496
4497 DBUG_ASSERT(error > 0);
4498 DBUG_ASSERT(error <= HA_ERR_ROCKSDB_LAST);
4499 DBUG_ASSERT(buf != nullptr);
4500
4501 switch (error) {
4502 case HA_ERR_ROCKSDB_PK_REQUIRED:
4503 buf->append("Table must have a PRIMARY KEY.");
4504 break;
4505 case HA_ERR_ROCKSDB_UNIQUE_NOT_SUPPORTED:
4506 buf->append("Unique indexes are not supported.");
4507 break;
4508 case HA_ERR_ROCKSDB_TOO_MANY_LOCKS:
4509 buf->append("Number of locks held reached @@rocksdb_max_row_locks.");
4510 break;
4511 default:
4512 // We can be called with the values which are < HA_ERR_FIRST because most
4513 // MySQL internal functions will just return HA_EXIT_FAILURE in case of
4514 // an error.
4515 break;
4516 }
4517
4518 DBUG_RETURN(false);
4519 }
4520
/*
  MyRocks supports only the following collations for indexed columns.
  All three are binary (*_bin) collations; the restriction is enforced in
  rdb_is_index_collation_supported() / create_cfs() below.
*/
static const std::set<const my_core::CHARSET_INFO *> RDB_INDEX_COLLATIONS = {
    &my_charset_bin, &my_charset_utf8_bin, &my_charset_latin1_bin};
4524
4525 static bool
rdb_is_index_collation_supported(const my_core::Field * const field)4526 rdb_is_index_collation_supported(const my_core::Field *const field) {
4527 const my_core::enum_field_types type = field->real_type();
4528 /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
4529 if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
4530 type == MYSQL_TYPE_BLOB) {
4531 return RDB_INDEX_COLLATIONS.find(field->charset()) !=
4532 RDB_INDEX_COLLATIONS.end();
4533 }
4534 return true;
4535 }
4536
4537 /*
4538 Create structures needed for storing data in rocksdb. This is called when the
4539 table is created. The structures will be shared by all TABLE* objects.
4540
4541 @param
4542 table_arg Table with definition
4543 db_table "dbname.tablename"
4544 len strlen of the above
4545 tbl_def_arg tbl_def whose key_descr is being created/populated
4546 old_tbl_def_arg tbl_def from which keys are being copied over from
4547 (for use during inplace alter)
4548
4549 @return
4550 0 - Ok
4551 other - error, either given table ddl is not supported by rocksdb or OOM.
4552 */
create_key_defs(const TABLE * const table_arg,Rdb_tbl_def * const tbl_def_arg,const TABLE * const old_table_arg,const Rdb_tbl_def * const old_tbl_def_arg) const4553 int ha_rocksdb::create_key_defs(
4554 const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
4555 const TABLE *const old_table_arg /* = nullptr */,
4556 const Rdb_tbl_def *const old_tbl_def_arg
4557 /* = nullptr */) const {
4558 DBUG_ENTER_FUNC();
4559
4560 DBUG_ASSERT(table_arg != nullptr);
4561 DBUG_ASSERT(table_arg->s != nullptr);
4562
4563 uint i;
4564
4565 /*
4566 These need to be one greater than MAX_INDEXES since the user can create
4567 MAX_INDEXES secondary keys and no primary key which would cause us
4568 to generate a hidden one.
4569 */
4570 std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;
4571
4572 /*
4573 NOTE: All new column families must be created before new index numbers are
4574 allocated to each key definition. See below for more details.
4575 http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
4576 */
4577 if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
4578 DBUG_RETURN(HA_EXIT_FAILURE);
4579 };
4580
4581 if (!old_tbl_def_arg) {
4582 /*
4583 old_tbl_def doesn't exist. this means we are in the process of creating
4584 a new table.
4585
4586 Get the index numbers (this will update the next_index_number)
4587 and create Rdb_key_def structures.
4588 */
4589 for (i = 0; i < tbl_def_arg->m_key_count; i++) {
4590 if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i],
4591 cfs[i])) {
4592 DBUG_RETURN(HA_EXIT_FAILURE);
4593 }
4594 }
4595 } else {
4596 /*
4597 old_tbl_def exists. This means we are creating a new tbl_def as part of
4598 in-place alter table. Copy over existing keys from the old_tbl_def and
4599 generate the necessary new key definitions if any.
4600 */
4601 if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
4602 old_tbl_def_arg, cfs)) {
4603 DBUG_RETURN(HA_EXIT_FAILURE);
4604 }
4605 }
4606
4607 DBUG_RETURN(HA_EXIT_SUCCESS);
4608 }
4609
4610 /*
4611 Checks index parameters and creates column families needed for storing data
4612 in rocksdb if necessary.
4613
4614 @param in
4615 table_arg Table with definition
4616 db_table Table name
4617 tbl_def_arg Table def structure being populated
4618
4619 @param out
4620 cfs CF info for each key definition in 'key_info' order
4621
4622 @return
4623 0 - Ok
4624 other - error
4625 */
create_cfs(const TABLE * const table_arg,Rdb_tbl_def * const tbl_def_arg,std::array<struct key_def_cf_info,MAX_INDEXES+1> * const cfs) const4626 int ha_rocksdb::create_cfs(
4627 const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
4628 std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
4629 DBUG_ENTER_FUNC();
4630
4631 DBUG_ASSERT(table_arg != nullptr);
4632 DBUG_ASSERT(table_arg->s != nullptr);
4633
4634 char tablename_sys[NAME_LEN + 1];
4635
4636 my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
4637 tablename_sys, sizeof(tablename_sys));
4638
4639 /*
4640 The first loop checks the index parameters and creates
4641 column families if necessary.
4642 */
4643 for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
4644 rocksdb::ColumnFamilyHandle *cf_handle;
4645
4646 if (rocksdb_strict_collation_check &&
4647 !is_hidden_pk(i, table_arg, tbl_def_arg) &&
4648 tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
4649 for (uint part = 0; part < table_arg->key_info[i].actual_key_parts;
4650 part++) {
4651 if (!rdb_is_index_collation_supported(
4652 table_arg->key_info[i].key_part[part].field) &&
4653 !rdb_collation_exceptions->match(tablename_sys)) {
4654 std::string collation_err;
4655 for (const auto &coll : RDB_INDEX_COLLATIONS) {
4656 if (collation_err != "") {
4657 collation_err += ", ";
4658 }
4659 collation_err += coll->name;
4660 }
4661 my_printf_error(
4662 ER_UNKNOWN_ERROR, "Unsupported collation on string indexed "
4663 "column %s.%s Use binary collation (%s).",
4664 MYF(0), tbl_def_arg->full_tablename().c_str(),
4665 table_arg->key_info[i].key_part[part].field->field_name,
4666 collation_err.c_str());
4667 DBUG_RETURN(HA_EXIT_FAILURE);
4668 }
4669 }
4670 }
4671
4672 /*
4673 index comment has Column Family name. If there was no comment, we get
4674 NULL, and it means use the default column family.
4675 */
4676 const char *const comment = get_key_comment(i, table_arg, tbl_def_arg);
4677 const char *const key_name = get_key_name(i, table_arg, tbl_def_arg);
4678
4679 if (looks_like_per_index_cf_typo(comment)) {
4680 my_error(ER_NOT_SUPPORTED_YET, MYF(0),
4681 "column family name looks like a typo of $per_index_cf");
4682 DBUG_RETURN(HA_EXIT_FAILURE);
4683 }
4684 /* Prevent create from using the system column family */
4685 if (comment && strcmp(DEFAULT_SYSTEM_CF_NAME, comment) == 0) {
4686 my_error(ER_WRONG_ARGUMENTS, MYF(0),
4687 "column family not valid for storing index data");
4688 DBUG_RETURN(HA_EXIT_FAILURE);
4689 }
4690 bool is_auto_cf_flag;
4691 cf_handle =
4692 cf_manager.get_or_create_cf(rdb, comment, tbl_def_arg->full_tablename(),
4693 key_name, &is_auto_cf_flag);
4694 if (!cf_handle)
4695 DBUG_RETURN(HA_EXIT_FAILURE);
4696
4697 auto &cf = (*cfs)[i];
4698 cf.cf_handle = cf_handle;
4699 cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(comment);
4700 cf.is_auto_cf = is_auto_cf_flag;
4701 }
4702
4703 DBUG_RETURN(HA_EXIT_SUCCESS);
4704 }
4705
/*
  Create key definition needed for storing data in rocksdb during ADD index
  inplace operations.

  @param in
    table_arg         Table with definition
    tbl_def_arg       New table def structure being populated
    old_tbl_def_arg   Old(current) table def structure
    cfs               Struct array which contains column family information

  @return
    0      - Ok
    other  - error, either given table ddl is not supported by rocksdb or OOM.
*/
int ha_rocksdb::create_inplace_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
    const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);
  DBUG_ASSERT(old_tbl_def_arg != nullptr);

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::shared_ptr<Rdb_key_def> *const new_key_descr =
      tbl_def_arg->m_key_descr_arr;
  /* Maps key name -> position of a matching (name + key parts) old key. */
  const std::unordered_map<std::string, uint> old_key_pos =
      get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
                            old_tbl_def_arg);

  uint i;
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));
    if (it != old_key_pos.end()) {
      /*
        Found matching index in old table definition, so copy it over to the
        new one created.
      */
      const Rdb_key_def &okd = *old_key_descr[it->second];

      /* Re-read the on-disk index metadata for the key being carried over. */
      uint16 index_dict_version = 0;
      uchar index_type = 0;
      uint16 kv_version = 0;
      const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
      if (!dict_manager.get_index_info(gl_index_id, &index_dict_version,
                                       &index_type, &kv_version)) {
        // NO_LINT_DEBUG
        sql_print_error("RocksDB: Could not get index information "
                        "for Index Number (%u,%u), table %s",
                        gl_index_id.cf_id, gl_index_id.index_id,
                        old_tbl_def_arg->full_tablename().c_str());
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      /*
        We can't use the copy constructor because we need to update the
        keynr within the pack_info for each field and the keyno of the keydef
        itself.
      */
      new_key_descr[i] = std::make_shared<Rdb_key_def>(
          okd.get_index_number(), i, okd.get_cf(), index_dict_version,
          index_type, kv_version, okd.m_is_reverse_cf, okd.m_is_auto_cf,
          okd.m_name.c_str(), dict_manager.get_stats(gl_index_id));
    } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
                              cfs[i])) {
      /* A genuinely new key: allocate a fresh index number/definition. */
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    DBUG_ASSERT(new_key_descr[i] != nullptr);
    new_key_descr[i]->setup(table_arg, tbl_def_arg);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
4782
get_old_key_positions(const TABLE * const table_arg,const Rdb_tbl_def * const tbl_def_arg,const TABLE * const old_table_arg,const Rdb_tbl_def * const old_tbl_def_arg) const4783 std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
4784 const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
4785 const TABLE *const old_table_arg,
4786 const Rdb_tbl_def *const old_tbl_def_arg) const {
4787 DBUG_ENTER_FUNC();
4788
4789 DBUG_ASSERT(table_arg != nullptr);
4790 DBUG_ASSERT(old_table_arg != nullptr);
4791 DBUG_ASSERT(tbl_def_arg != nullptr);
4792 DBUG_ASSERT(old_tbl_def_arg != nullptr);
4793
4794 std::shared_ptr<Rdb_key_def> *const old_key_descr =
4795 old_tbl_def_arg->m_key_descr_arr;
4796 std::unordered_map<std::string, uint> old_key_pos;
4797 std::unordered_map<std::string, uint> new_key_pos;
4798 uint i;
4799
4800 for (i = 0; i < tbl_def_arg->m_key_count; i++) {
4801 new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
4802 }
4803
4804 for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
4805 if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
4806 old_key_pos[old_key_descr[i]->m_name] = i;
4807 continue;
4808 }
4809
4810 /*
4811 In case of matching key name, need to check key parts of keys as well,
4812 in case a simultaneous drop + add is performed, where the key name is the
4813 same but the key parts are different.
4814
4815 Example:
4816 CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
4817 ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
4818 */
4819 const KEY *const old_key = &old_table_arg->key_info[i];
4820 const auto &it = new_key_pos.find(old_key->name);
4821 if (it == new_key_pos.end()) {
4822 continue;
4823 }
4824
4825 KEY *const new_key = &table_arg->key_info[it->second];
4826
4827 if (!compare_key_parts(old_key, new_key)) {
4828 old_key_pos[old_key->name] = i;
4829 }
4830 }
4831
4832 DBUG_RETURN(old_key_pos);
4833 }
4834
4835 /* Check two keys to ensure that key parts within keys match */
compare_key_parts(const KEY * const old_key,const KEY * const new_key) const4836 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
4837 const KEY *const new_key) const {
4838 DBUG_ENTER_FUNC();
4839
4840 DBUG_ASSERT(old_key != nullptr);
4841 DBUG_ASSERT(new_key != nullptr);
4842
4843 /* Skip if key parts do not match, as it is a different key */
4844 if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
4845 DBUG_RETURN(HA_EXIT_FAILURE);
4846 }
4847
4848 /* Check to see that key parts themselves match */
4849 for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
4850 if (strcmp(old_key->key_part[i].field->field_name,
4851 new_key->key_part[i].field->field_name) != 0) {
4852 DBUG_RETURN(HA_EXIT_FAILURE);
4853 }
4854 }
4855
4856 DBUG_RETURN(HA_EXIT_SUCCESS);
4857 }
4858
4859 /*
4860 Create key definition needed for storing data in rocksdb.
4861 This can be called either during CREATE table or doing ADD index operations.
4862
4863 @param in
4864 table_arg Table with definition
4865 i Position of index being created inside table_arg->key_info
4866 tbl_def_arg Table def structure being populated
4867 cf_info Struct which contains column family information
4868
4869 @param out
4870 new_key_def Newly created index definition.
4871
4872 @return
4873 0 - Ok
4874 other - error, either given table ddl is not supported by rocksdb or OOM.
4875 */
create_key_def(const TABLE * const table_arg,const uint & i,const Rdb_tbl_def * const tbl_def_arg,std::shared_ptr<Rdb_key_def> * const new_key_def,const struct key_def_cf_info & cf_info) const4876 int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i,
4877 const Rdb_tbl_def *const tbl_def_arg,
4878 std::shared_ptr<Rdb_key_def> *const new_key_def,
4879 const struct key_def_cf_info &cf_info) const {
4880 DBUG_ENTER_FUNC();
4881
4882 DBUG_ASSERT(new_key_def != nullptr);
4883 DBUG_ASSERT(*new_key_def == nullptr);
4884
4885 const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
4886 const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
4887 uchar index_type;
4888 uint16_t kv_version;
4889
4890 if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
4891 index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
4892 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
4893 } else if (i == table_arg->s->primary_key) {
4894 index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
4895 uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
4896 kv_version = pk_latest_version;
4897 } else {
4898 index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
4899 uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
4900 kv_version = sk_latest_version;
4901 }
4902
4903 const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
4904 *new_key_def = std::make_shared<Rdb_key_def>(
4905 index_id, i, cf_info.cf_handle, index_dict_version, index_type,
4906 kv_version, cf_info.is_reverse_cf, cf_info.is_auto_cf, key_name);
4907
4908 DBUG_RETURN(HA_EXIT_SUCCESS);
4909 }
4910
rdb_normalize_tablename(const std::string & tablename,std::string * const strbuf)4911 int rdb_normalize_tablename(const std::string &tablename,
4912 std::string *const strbuf) {
4913 DBUG_ASSERT(strbuf != nullptr);
4914
4915 if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/') {
4916 DBUG_ASSERT(0); // We were not passed table name?
4917 return HA_ERR_INTERNAL_ERROR;
4918 }
4919
4920 size_t pos = tablename.find_first_of('/', 2);
4921 if (pos == std::string::npos) {
4922 DBUG_ASSERT(0); // We were not passed table name?
4923 return HA_ERR_INTERNAL_ERROR;
4924 }
4925
4926 *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
4927
4928 return HA_EXIT_SUCCESS;
4929 }
4930
/*
  Check to see if the user's original statement includes foreign key
  references.

  This scans the raw query text for a "FOREIGN KEY [<identifier>] ("
  sequence rather than consulting the parse tree.

  @param thd  connection whose current query text is examined
  @return true if the statement appears to contain a foreign key clause
*/
bool ha_rocksdb::contains_foreign_key(THD *const thd) {
  bool success;
  const char *str = thd_query_string(thd)->str;

  DBUG_ASSERT(str != nullptr);

  while (*str != '\0') {
    // Scan from our current pos looking for 'FOREIGN'
    str = rdb_find_in_string(str, "FOREIGN", &success);
    if (!success) {
      return false;
    }

    // Skip past the found "FOREIGN"
    str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
    DBUG_ASSERT(success);

    // "FOREIGN" must be followed by whitespace to be the keyword.
    // NOTE(review): a non-space here ends the whole scan rather than
    // continuing the search -- presumably intentional; confirm against
    // rdb_find_in_string()'s matching semantics.
    if (!my_isspace(&my_charset_bin, *str)) {
      return false;
    }

    // See if the next token is 'KEY'
    str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
    if (!success) {
      continue;
    }

    // See if the next token is '('
    str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    if (!success) {
      // There is an optional index id after 'FOREIGN KEY', skip it
      str = rdb_skip_id(&my_charset_bin, str);

      // Now check for '(' again
      str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    }

    // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
    // a foreign key clause.
    return success;
  }

  // We never found a valid foreign key clause
  return false;
}
4980
4981 /**
4982 @brief
4983 splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
4984 the <dbname>, <tablename> and <part_no> components.
4985
4986 @param dbbuf returns database name/table_schema
4987 @param tablebuf returns tablename
4988 @param partitionbuf returns partition suffix if there is one
4989 @return HA_EXIT_SUCCESS on success, non-zero on failure to split
4990 */
rdb_split_normalized_tablename(const std::string & fullname,std::string * const db,std::string * const table,std::string * const partition)4991 int rdb_split_normalized_tablename(const std::string &fullname,
4992 std::string *const db,
4993 std::string *const table,
4994 std::string *const partition) {
4995 DBUG_ASSERT(!fullname.empty());
4996
4997 #define RDB_PARTITION_STR "#P#"
4998
4999 /* Normalize returns dbname.tablename. */
5000 size_t dotpos = fullname.find('.');
5001
5002 /* Invalid table name? */
5003 if (dotpos == std::string::npos) {
5004 return HA_ERR_INTERNAL_ERROR;
5005 }
5006
5007 // Table must have a database name associated with it.
5008 DBUG_ASSERT(dotpos > 0);
5009
5010 if (db != nullptr) {
5011 *db = fullname.substr(0, dotpos);
5012 }
5013
5014 dotpos++;
5015
5016 const size_t partpos =
5017 fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
5018
5019 if (partpos != std::string::npos) {
5020 DBUG_ASSERT(partpos >= dotpos);
5021
5022 if (table != nullptr) {
5023 *table = fullname.substr(dotpos, partpos - dotpos);
5024 }
5025
5026 if (partition != nullptr) {
5027 *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
5028 }
5029 } else if (table != nullptr) {
5030 *table = fullname.substr(dotpos);
5031 }
5032
5033 return HA_EXIT_SUCCESS;
5034 }
5035
/**
  @brief
  create() is called to create a database. The variable name will have the name
  of the table.

  @details
  When create() is called you do not need to worry about
  opening the table. Also, the .frm file will have already been
  created so adjusting create_info is not necessary. You can overwrite
  the .frm file at this point if you wish to change the table
  definition, but there are no methods currently provided for doing
  so.

  Called from handle.cc by ha_create_table().

  @see
  ha_create_table() in handle.cc
*/

int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
                       HA_CREATE_INFO *const create_info) {
  DBUG_ENTER_FUNC();

  int res;
  std::string str;

  THD *const thd = my_core::thd_get_current_thd();

  /*
    Stale dictionary entry handling: TRUNCATE is implemented as
    delete + re-create, so an existing entry is expected there; any other
    statement finding an existing entry indicates .frm/dictionary drift.
  */
  if (get_table_if_exists(name)) {
    if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
      if (delete_table(name)) {
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
    } else {
      if (rdb_normalize_tablename(name, &str)) {
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
      my_printf_error(
          ER_UNKNOWN_ERROR,
          "Table '%s' does not exist, but metadata information exists inside "
          "MyRocks. This is a sign of data inconsistency. Please check if "
          "'%s.frm' exists, and try to restore it if it does not exist.",
          MYF(0), str.c_str(), name);
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
    }
  }

  /*
    Construct dbname.tablename ourselves, because parititioning
    passes strings like "./test/t14#P#p0" for individual partitions,
    while table_arg->s->table_name has none of that.
  */
  if (rdb_normalize_tablename(name, &str)) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  if (contains_foreign_key(thd)) {
    my_printf_error(ER_NOT_SUPPORTED_YET, "MyRocks does not currently support "
                                          "foreign key constraints",
                    MYF(0));
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  /*
    TODO(alexyang): Temporarily disable unique indexes support when there is no
    Primary Key
  */
  for (uint keyno = 0; keyno < table_arg->s->keys; keyno++) {
    if ((table_arg->key_info[keyno].flags & HA_NOSAME) &&
        has_hidden_pk(table_arg)) {
      my_printf_error(ER_UNKNOWN_ERROR, "Unique index support is disabled when "
                                        "the table has no primary key. ",
                      MYF(0));
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
    }
  }

  bool write_err = false;
  const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
  rocksdb::WriteBatch *const batch = wb.get();

  /* Create table/key descriptions and put them into the data dictionary */
  m_tbl_def = new Rdb_tbl_def(str);

  uint n_keys = table_arg->s->keys;

  /*
    If no primary key found, create a hidden PK and place it inside table
    definition
  */
  if (has_hidden_pk(table_arg)) {
    n_keys += 1;
  }

  m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
  m_tbl_def->m_key_count = n_keys;
  m_tbl_def->m_key_descr_arr = m_key_descr_arr;

  if ((res = create_key_defs(table_arg, m_tbl_def))) {
    goto error;
  }

  m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];

  /* Persist the new definition atomically via the dictionary write batch. */
  dict_manager.lock();
  write_err =
      ddl_manager.put_and_write(m_tbl_def, batch) || dict_manager.commit(batch);
  dict_manager.unlock();

  if (write_err) {
    goto error;
  }

  if (create_info->auto_increment_value)
    m_tbl_def->m_auto_incr_val = create_info->auto_increment_value;

  /*
    We only support auto_increment at start of the PRIMARY KEY.
  */
  // Field *field;
  // if ((field= table_arg->next_number_field))
  /* TODO mdcallag: disable this for now to let UNIQUE indexes kind of work
  if ((field= table_arg->found_next_number_field))
  {
    int pk= table_arg->s->primary_key;
    Field *pk_field= table_arg->key_info[pk].key_part[0].field;
    if (field->field_index != pk_field->field_index)
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }
  */
  DBUG_RETURN(HA_EXIT_SUCCESS);

error:
  /* Delete what we have allocated so far */
  /* NOTE(review): presumably ~Rdb_tbl_def releases m_key_descr_arr, since
     only the raw pointers are reset here -- confirm in rdb_datadic. */
  delete m_tbl_def;
  m_tbl_def = nullptr;
  m_key_descr_arr = nullptr;

  DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
}
5176
5177 /**
5178 @note
5179 This function is used only when the table has not yet been opened, and
5180 keyread_allowed bitmap doesn't have the correct values yet.
5181
5182 See comment in ha_rocksdb::index_flags() for details.
5183 */
5184
bool ha_rocksdb::check_keyread_allowed(uint inx, uint part,
                                       bool all_parts) const {
  bool res = true;
  KEY *const key_info = &table_share->key_info[inx];

  Rdb_field_packing dummy1;
  // A keypart supports index-only reads iff its value can be decoded back
  // from the mem-comparable key image; Rdb_field_packing::setup() reports
  // exactly that.
  res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
                     key_info->key_part[part].length);

  if (res && all_parts) {
    // Also verify every preceding keypart. Keyparts with field==nullptr are
    // not yet initialized by the SQL layer (see note above) and are skipped.
    for (uint i = 0; i < part; i++) {
      Field *field;
      if ((field = key_info->key_part[i].field)) {
        Rdb_field_packing dummy;
        if (!dummy.setup(nullptr, field, inx, i,
                         key_info->key_part[i].length)) {
          /* Cannot do index-only reads for this column */
          res = false;
          break;
        }
      }
    }
  }

  const uint pk = table_share->primary_key;
  // Once the last user-defined keypart of the primary key has been checked,
  // cache the verdict for later use (member is mutated inside a const
  // function, so it is presumably declared mutable — side effect is
  // intentional).
  if (inx == pk && all_parts &&
      part + 1 == table_share->key_info[pk].user_defined_key_parts) {
    m_pk_can_be_decoded = res;
  }

  return res;
}
5217
read_key_exact(const Rdb_key_def & kd,rocksdb::Iterator * const iter,const bool & full_key_match,const rocksdb::Slice & key_slice) const5218 int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
5219 rocksdb::Iterator *const iter,
5220 const bool &full_key_match,
5221 const rocksdb::Slice &key_slice) const {
5222 DBUG_ASSERT(iter != nullptr);
5223
5224 /*
5225 We are looking for the first record such that
5226 index_tuple= lookup_tuple.
5227 lookup_tuple may be a prefix of the index.
5228 */
5229 if (kd.m_is_reverse_cf) {
5230 if (!full_key_match) {
5231 if (!iter->Valid())
5232 iter->SeekToLast();
5233 else
5234 iter->Prev();
5235 }
5236 }
5237
5238 if (!iter->Valid() || !kd.value_matches_prefix(iter->key(), key_slice)) {
5239 /*
5240 Got a record that is not equal to the lookup value, or even a record
5241 from another table.index.
5242 */
5243 return HA_ERR_KEY_NOT_FOUND;
5244 }
5245 return HA_EXIT_SUCCESS;
5246 }
5247
read_before_key(const Rdb_key_def & kd,const bool & full_key_match,const rocksdb::Slice & key_slice)5248 int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
5249 const bool &full_key_match,
5250 const rocksdb::Slice &key_slice) {
5251 /*
5252 We are looking for record with the biggest t.key such that
5253 t.key < lookup_tuple.
5254 */
5255 if (kd.m_is_reverse_cf) {
5256 if (m_scan_it->Valid() && full_key_match &&
5257 kd.value_matches_prefix(m_scan_it->key(), key_slice)) {
5258 /* We are using full key and we've hit an exact match */
5259 m_scan_it->Next();
5260 }
5261 } else {
5262 if (m_scan_it->Valid())
5263 m_scan_it->Prev();
5264 else
5265 m_scan_it->SeekToLast();
5266 }
5267
5268 return m_scan_it->Valid() ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
5269 }
5270
read_after_key(const Rdb_key_def & kd,const bool & full_key_match,const rocksdb::Slice & key_slice)5271 int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
5272 const bool &full_key_match,
5273 const rocksdb::Slice &key_slice) {
5274 /*
5275 We are looking for the first record such that
5276
5277 index_tuple $GT lookup_tuple
5278
5279 with HA_READ_AFTER_KEY, $GT = '>',
5280 with HA_READ_KEY_OR_NEXT, $GT = '>='
5281 */
5282 if (kd.m_is_reverse_cf) {
5283 if (!m_scan_it->Valid()) {
5284 m_scan_it->SeekToLast();
5285 } else {
5286 /*
5287 We should step back
5288 - when not using full extended key
5289 - when using full extended key and when we've got an exact match
5290 */
5291 if (!full_key_match ||
5292 !kd.value_matches_prefix(m_scan_it->key(), key_slice)) {
5293 m_scan_it->Prev();
5294 }
5295 }
5296 }
5297
5298 return m_scan_it->Valid() ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
5299 }
5300
/*
  Move m_scan_it from its initial Seek() position to the record that the
  given find_flag actually asks for.

  @param kd              definition of the index being scanned
  @param find_flag       SQL-layer lookup mode
  @param full_key_match  true if the lookup tuple covers the whole key
  @param key             original lookup key (KeyTupleFormat)
  @param keypart_map     which keyparts are present in 'key'
  @param key_slice       packed (mem-comparable) lookup tuple
  @param move_forward    OUT: scan direction implied by find_flag

  @return 0 on success, HA_ERR_KEY_NOT_FOUND, or HA_ERR_UNSUPPORTED
*/
int ha_rocksdb::position_to_correct_key(const Rdb_key_def &kd,
                                        const enum ha_rkey_function &find_flag,
                                        const bool &full_key_match,
                                        const uchar *const key,
                                        const key_part_map &keypart_map,
                                        const rocksdb::Slice &key_slice,
                                        bool *const move_forward) {
  int rc = 0;

  *move_forward = true;

  switch (find_flag) {
  case HA_READ_KEY_EXACT:
    rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice);
    break;
  case HA_READ_BEFORE_KEY:
    *move_forward = false;
    rc = read_before_key(kd, full_key_match, key_slice);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_AFTER_KEY:
  case HA_READ_KEY_OR_NEXT:
    rc = read_after_key(kd, full_key_match, key_slice);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_KEY_OR_PREV:
  case HA_READ_PREFIX:
    /* This flag is not used by the SQL layer, so we don't support it yet. */
    rc = HA_ERR_UNSUPPORTED;
    break;
  case HA_READ_PREFIX_LAST:
  case HA_READ_PREFIX_LAST_OR_PREV:
    *move_forward = false;
    /*
      Find the last record with the specified index prefix lookup.
      - HA_READ_PREFIX_LAST requires that the record has the
        prefix=lookup (if there are no such records,
        HA_ERR_KEY_NOT_FOUND should be returned).
      - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
        records with prefix=lookup, we should return the last record
        before that.

      Note: the caller has already applied kd.successor() to the lookup
      tuple for these flags, so read_before_key() lands us on the last
      record with the wanted prefix (or an earlier one).
    */
    rc = read_before_key(kd, full_key_match, key_slice);
    if (rc == 0) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      if (!kd.covers_key(rkey)) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      } else if (find_flag == HA_READ_PREFIX_LAST) {
        // Re-pack the *original* (pre-successor) lookup tuple so we can
        // verify the prefix actually matches.
        uint size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                        key, keypart_map);
        rocksdb::Slice lookup_tuple(reinterpret_cast<char *>(m_sk_packed_tuple),
                                    size);

        // We need to compare the key we've got with the original search prefix.
        if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
          rc = HA_ERR_KEY_NOT_FOUND;
        }
      }
    }
    break;
  default:
    DBUG_ASSERT(0);
    break;
  }

  return rc;
}
5375
calc_eq_cond_len(const Rdb_key_def & kd,const enum ha_rkey_function & find_flag,const rocksdb::Slice & slice,const int & bytes_changed_by_succ,const key_range * const end_key,uint * const end_key_packed_size)5376 int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
5377 const enum ha_rkey_function &find_flag,
5378 const rocksdb::Slice &slice,
5379 const int &bytes_changed_by_succ,
5380 const key_range *const end_key,
5381 uint *const end_key_packed_size) {
5382 if (find_flag == HA_READ_KEY_EXACT)
5383 return slice.size();
5384
5385 if (find_flag == HA_READ_PREFIX_LAST) {
5386 /*
5387 We have made the kd.successor(m_sk_packed_tuple) call above.
5388
5389 The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
5390 */
5391 return slice.size() - bytes_changed_by_succ;
5392 }
5393
5394 if (end_key) {
5395 *end_key_packed_size =
5396 kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
5397 end_key->key, end_key->keypart_map);
5398
5399 /*
5400 Calculating length of the equal conditions here. 4 byte index id is
5401 included.
5402 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
5403 WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
5404 WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
5405 Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
5406 WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
5407 */
5408 rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
5409 *end_key_packed_size);
5410 return slice.difference_offset(end_slice);
5411 }
5412
5413 /*
5414 On range scan without any end key condition, there is no
5415 eq cond, and eq cond length is the same as index_id size (4 bytes).
5416 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
5417 WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
5418 */
5419 return Rdb_key_def::INDEX_NUMBER_SIZE;
5420 }
5421
read_row_from_primary_key(uchar * const buf)5422 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
5423 DBUG_ASSERT(buf != nullptr);
5424
5425 int rc;
5426 const rocksdb::Slice &rkey = m_scan_it->key();
5427 const uint pk_size = rkey.size();
5428 const char *pk_data = rkey.data();
5429
5430 memcpy(m_pk_packed_tuple, pk_data, pk_size);
5431 m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
5432
5433 if (m_lock_rows != RDB_LOCK_NONE) {
5434 /* We need to put a lock and re-read */
5435 rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
5436 } else {
5437 /* Unpack from the row we've read */
5438 const rocksdb::Slice &value = m_scan_it->value();
5439 rc = convert_record_from_storage_format(&rkey, &value, buf);
5440 }
5441
5442 return rc;
5443 }
5444
/*
  Read into 'buf' the row whose secondary-key entry m_scan_it is
  positioned on.

  @param buf           OUT record buffer (table->record[0] format)
  @param kd            definition of the secondary index being scanned
  @param move_forward  logical scan direction (used for ICP skipping)
  @return 0 on success, or an HA_ERR error code
*/
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  DBUG_ASSERT(buf != nullptr);

  int rc = 0;
  uint pk_size;  // set on every success path before the final copy below

  if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE && !has_hidden_pk(table)) {
    /* Get the key columns and primary key value */
    // Index-only path: everything we need is in the SK entry itself.
    const rocksdb::Slice &rkey = m_scan_it->key();
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    const rocksdb::Slice &value = m_scan_it->value();
    if (pk_size == RDB_INVALID_KEY_LEN ||
        kd.unpack_record(table, buf, &rkey, &value,
                         m_verify_row_debug_checksums)) {
      rc = HA_ERR_INTERNAL_ERROR;
    }
  } else {
    // In a reverse column family the physical iteration direction is the
    // opposite of the logical one.
    if (kd.m_is_reverse_cf)
      move_forward = !move_forward;

    // Advance to the first entry satisfying the pushed index condition
    // (no-op when no ICP is active).
    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_INTERNAL_ERROR;
      } else {
        // Fetch the full row by its PK (also takes the row lock if needed).
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    // Remember the rowkey for position()/rnd_pos().
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
5488
5489 /**
5490 @note
5491 The problem with this function is that SQL layer calls it, when
5492 - the table has not been yet opened (no ::open() call done)
5493 - this->table_share already exists, but it is in the process of being
5494 filled, so some of fields are still NULL.
5495 - In particular, table_share->key_info[inx].key_part[] is filled only up
5496 to part #part. Subsequent key parts are not yet filled.
5497
5498 To complicate things further, SQL layer will call index_flags() with
5499 all_parts=TRUE. Essentially, we're asked to provide flags for reading
5500 keyparts whose datatype is not yet known.
5501
5502 We walk around this problem by using check_keyread_allowed(), which uses
  table_share object and is careful not to step on uninitialized data.
5504
5505 When we get a call with all_parts=TRUE, we try to analyze all parts but
5506 ignore those that have key_part->field==nullptr (these are not initialized
5507 yet).
5508 */
5509
index_flags(uint inx,uint part,bool all_parts) const5510 ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
5511 DBUG_ENTER_FUNC();
5512
5513 ulong base_flags = HA_READ_NEXT | // doesn't seem to be used
5514 HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
5515
5516 if (check_keyread_allowed(inx, part, all_parts))
5517 base_flags |= HA_KEYREAD_ONLY;
5518
5519 if (inx == table_share->primary_key) {
5520 /*
5521 Index-only reads on primary key are the same as table scan for us. Still,
5522 we need to explicitly "allow" them, otherwise SQL layer will miss some
5523 plans.
5524 */
5525 base_flags |= HA_KEYREAD_ONLY;
5526 } else {
5527 /*
5528 We can Index Condition Pushdown any key except the primary. With primary
5529 key, we get (pk, record) pair immediately, there is no place to put the
5530 ICP check.
5531 */
5532 base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
5533 }
5534
5535 DBUG_RETURN(base_flags);
5536 }
5537
5538 /**
5539 @brief
5540 Read next index tuple through the secondary index.
5541
5542 @details
5543 m_scan_it points at the index key-value pair that we should read the (pk,row)
5544 pair for.
5545 */
secondary_index_read(const int keyno,uchar * const buf)5546 int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
5547 DBUG_ASSERT(buf != nullptr);
5548 DBUG_ASSERT(table != nullptr);
5549
5550 /* Use STATUS_NOT_FOUND when record not found or some error occurred */
5551 table->status = STATUS_NOT_FOUND;
5552
5553 if (m_scan_it->Valid()) {
5554 rocksdb::Slice key = m_scan_it->key();
5555
5556 /* Check if we've ran out of records of this index */
5557 if (m_key_descr_arr[keyno]->covers_key(key)) {
5558 int rc = 0;
5559
5560 // TODO: We could here check if we have ran out of range we're scanning
5561 const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
5562 table, *m_pk_descr, &key, m_pk_packed_tuple);
5563 if (size == RDB_INVALID_KEY_LEN) {
5564 return HA_ERR_INTERNAL_ERROR;
5565 }
5566
5567 m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
5568 &my_charset_bin);
5569
5570 if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE &&
5571 !has_hidden_pk(table)) {
5572 rocksdb::Slice value = m_scan_it->value();
5573 if (m_key_descr_arr[keyno]->unpack_record(
5574 table, buf, &key, &value, m_verify_row_debug_checksums)) {
5575 return HA_ERR_INTERNAL_ERROR;
5576 }
5577 } else {
5578 rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
5579 }
5580
5581 if (!rc) {
5582 table->status = 0;
5583 update_row_stats(ROWS_READ);
5584 }
5585 return rc;
5586 }
5587 }
5588 return HA_ERR_END_OF_FILE;
5589 }
5590
5591 /*
5592 ha_rocksdb::read_range_first overrides handler::read_range_first.
5593 The only difference from handler::read_range_first is that
5594 ha_rocksdb::read_range_first passes end_key to
5595 ha_rocksdb::index_read_map_impl function.
5596 */
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  int result;

  // Stash the range bounds in handler members; compare_key() below uses
  // end_range set here.
  eq_range = eq_range_arg;
  set_end_range(end_key, RANGE_SCAN_ASC);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key) // Read first record
    result = ha_index_first(table->record[0]);
  else {
    // Gap-lock safety check: reject scans that would need prohibited gap
    // locks (full unique-key equality lookups are exempt).
    if (is_using_prohibited_gap_locks(
            table,
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }

    // Unlike handler::read_range_first(), pass end_key down so the impl
    // can compute the equal-condition length for bloom-filter use.
    MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
      result =
          index_read_map_impl(table->record[0], start_key->key,
                              start_key->keypart_map, start_key->flag, end_key);
    })
  }
  if (result)
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);

  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
5639
/*
  Positioned index read (handler API). Thin wrapper: delegates to
  index_read_map_impl() with no end key (end_range, if any, is picked up
  inside the impl).
*/
int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
                               key_part_map keypart_map,
                               enum ha_rkey_function find_flag) {
  DBUG_ENTER_FUNC();

  DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
}
5647
5648 /*
5649 See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
5650 index navigation commands are converted into RocksDB lookup commands.
5651
5652 This function takes end_key as an argument, and it is set on range scan.
5653 MyRocks needs to decide whether prefix bloom filter can be used or not.
5654 To decide to use prefix bloom filter or not, calculating equal condition
5655 length
5656 is needed. On equal lookups (find_flag == HA_READ_KEY_EXACT), equal
5657 condition length is the same as rocksdb::Slice.size() of the start key.
5658 On range scan, equal condition length is MIN(start_key, end_key) of the
5659 rocksdb::Slice expression.
5660 */
int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map,
                                    enum ha_rkey_function find_flag,
                                    const key_range *end_key) {
  DBUG_ENTER_FUNC();

  int rc = 0;

  ha_statistic_increment(&SSV::ha_read_key_count);
  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  const uint actual_key_parts = kd.get_key_parts();
  bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);

  // When no explicit end key is passed (plain index_read_map()), fall back
  // to the end of the current range scan, if any.
  if (!end_key)
    end_key = end_range;

  /* By default, we don't need the retrieved records to match the prefix */
  m_sk_match_prefix = nullptr;

  if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
      using_full_key) {
    /*
      Equality lookup over primary key, using full tuple.
      This is a special case, use DB::Get.
    */
    const uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_pk_packed_tuple, key, keypart_map);
    rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
    if (!rc) {
      update_row_stats(ROWS_READ);
    }
    DBUG_RETURN(rc);
  }

  /*
    Unique secondary index performs lookups without the extended key fields
  */
  uint packed_size;
  if (active_index != table->s->primary_key &&
      table->key_info[active_index].flags & HA_NOSAME &&
      find_flag == HA_READ_KEY_EXACT && using_full_key) {
    // Pack only the user-defined keyparts; the hidden PK extension is not
    // part of a unique lookup.
    key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
                                .user_defined_key_parts) -
                           1;
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, tmp_map);
    if (table->key_info[active_index].user_defined_key_parts !=
        kd.get_key_parts())
      using_full_key = false;
  } else {
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, keypart_map);
  }

  if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
      (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
    /*
      We are doing a point index lookup, and ICP is enabled. It is possible
      that this call will be followed by ha_rocksdb->index_next_same() call.

      Do what InnoDB does: save the lookup tuple now. We will need it in
      index_next_same/find_icp_matching_index_rec in order to stop scanning
      as soon as index record doesn't match the lookup tuple.

      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
    m_sk_match_prefix = m_sk_match_prefix_buf;
    m_sk_match_length = packed_size;
    memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
  }

  int bytes_changed_by_succ = 0;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
    /* See below */
    // Seek to the successor of the lookup tuple so the iterator lands
    // just past all records with this prefix/key.
    bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
  }

  rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
                       packed_size);

  uint end_key_packed_size = 0;
  // The equal-condition length decides whether the prefix bloom filter can
  // be used for this lookup (see the comment above this function).
  const uint eq_cond_len =
      calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, end_key,
                       &end_key_packed_size);

  bool use_all_keys = false;
  if (find_flag == HA_READ_KEY_EXACT &&
      my_count_bits(keypart_map) == kd.get_key_parts())
    use_all_keys = true;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    /*
      This will open the iterator and position it at a record that's equal or
      greater than the lookup tuple.
    */
    setup_scan_iterator(kd, &slice, use_all_keys, is_ascending(kd, find_flag),
                        eq_cond_len);

    /*
      Once we are positioned on from above, move to the position we really
      want: See storage/rocksdb/rocksdb-range-access.txt
    */
    bool move_forward;
    rc = position_to_correct_key(kd, find_flag, using_full_key, key,
                                 keypart_map, slice, &move_forward);

    if (rc) {
      /* This status is returned on any error */
      table->status = STATUS_NOT_FOUND;
      DBUG_RETURN(rc);
    }

    m_skip_scan_it_next_call = false;

    /*
      Now get the data for the row into 'buf'. If we were using a primary key
      then we have all the rows we need. For a secondary key we now need to
      lookup the primary key.
    */
    if (active_index == table->s->primary_key)
      rc = read_row_from_primary_key(buf);
    else
      rc = read_row_from_secondary_key(buf, kd, move_forward);

    // Retry only on deadlock, and only if the snapshot was created by this
    // call (otherwise the caller owns it and must see the error).
    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* Exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (rc) {
    /* the only possible error condition is record-not-found */
    table->status = STATUS_NOT_FOUND;
  } else {
    table->status = 0;
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
5810
5811 /*
5812 @brief
5813 Scan the secondary index until we find an index record that satisfies ICP
5814
5815 @param move_forward TRUE <=> move m_scan_it forward
5816 FALSE <=> move m_scan_it backward
5817 @param buf Record buffer (must be the same buffer that
5818 pushed index condition points to, in practice
5819 it is table->record[0])
5820
5821 @detail
5822 Move the current iterator m_scan_it until we get an index tuple that
5823 satisfies the pushed Index Condition.
5824 (if there is no pushed index condition, return right away)
5825
5826 @return
5827 0 - Index tuple satisfies ICP, can do index read.
5828 other - error code
5829 */
5830
int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward,
                                            uchar *const buf) {
  DBUG_ASSERT(buf != nullptr);

  // No pushed condition for this index: nothing to filter, report success.
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];

    while (1) {
      // End of data: no more entries at all.
      if (!m_scan_it->Valid()) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      // Walked off this index into another index/table's key space.
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      // Point-lookup mode: stop as soon as the entry no longer matches the
      // saved lookup prefix (set up by index_read_map_impl()).
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      // Unpack the index columns into 'buf' so the pushed condition (which
      // reads table->record[0]) can be evaluated.
      const rocksdb::Slice value = m_scan_it->value();
      if (kd.unpack_record(table, buf, &rkey, &value,
                           m_verify_row_debug_checksums)) {
        return HA_ERR_INTERNAL_ERROR;
      }

      const enum icp_result icp_status = check_index_cond();
      if (icp_status == ICP_NO_MATCH) {
        if (move_forward)
          m_scan_it->Next();
        else
          m_scan_it->Prev();
        continue; /* Get the next (or prev) index tuple */
      } else if (icp_status == ICP_OUT_OF_RANGE) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      } else /* icp_status == ICP_MATCH */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
      /*
        TODO: should we have this here, or RocksDB handles this internally?
        if (my_core::thd_killed(current_thd))
        {
          rc= HA_ERR_INTERNAL_ERROR; // doesn't matter
          break;
        }
      */
    }
  }
  return HA_EXIT_SUCCESS;
}
5894
/*
  Read the last record matching the key prefix (handler API). Implemented
  as an index_read_map() with HA_READ_PREFIX_LAST.
*/
int ha_rocksdb::index_read_last_map(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map) {
  DBUG_ENTER_FUNC();

  DBUG_RETURN(index_read_map(buf, key, keypart_map, HA_READ_PREFIX_LAST));
}
5901
/*
  CHECK TABLE implementation. For every secondary-index entry, verify that
  (a) the row it points to can be fetched through the primary key, and
  (b) re-packing the fetched row reproduces both the PK value and the
  secondary-key entry byte-for-byte.

  @param thd        current connection
  @param check_opt  CHECK TABLE options (unused here)
  @return HA_ADMIN_OK if all checks pass, HA_ADMIN_CORRUPT otherwise
*/
int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(check_opt != nullptr);

  const uint pk = pk_index(table, m_tbl_def);
  String rowkey_copy;
  String sec_key_copy;
  const char *const table_name = table->s->table_name.str;

  // Only when debugging: don't use snapshot when reading
  // Rdb_transaction *tx= get_or_create_tx(table->in_use);
  // tx->snapshot= nullptr;

  // Force row checksum verification for the duration of the check;
  // restored before returning (both on success and on the error path).
  const bool save_verify_row_debug_checksums = m_verify_row_debug_checksums;
  m_verify_row_debug_checksums = true;

  /* For each secondary index, check that we can get a PK value from it */
  // NO_LINT_DEBUG
  sql_print_information("CHECKTABLE %s: Checking table %s", table_name,
                        table_name);
  ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true
  ha_rows row_checksums = ha_rows(-1);
  bool first_index = true;

  for (uint keyno = 0; keyno < table->s->keys; keyno++) {
    if (keyno != pk) {
      extra(HA_EXTRA_KEYREAD);
      ha_index_init(keyno, true);
      ha_rows rows = 0;
      ha_rows checksums = 0;
      if (first_index)
        row_checksums_at_start = m_row_checksums_checked;
      int res;
      // NO_LINT_DEBUG
      sql_print_information("CHECKTABLE %s: Checking index %s", table_name,
                            table->key_info[keyno].name);
      while (1) {
        if (!rows)
          res = index_first(table->record[0]);
        else
          res = index_next(table->record[0]);

        if (res == HA_ERR_END_OF_FILE)
          break;
        if (res) {
          // error
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
                          table_name, rows, res);
          goto error;
        }
        // Save copies of the SK entry and rowkey: the get_row_by_rowid()
        // call below overwrites the handler's buffers.
        rocksdb::Slice key = m_scan_it->key();
        sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
        rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
                         &my_charset_bin);

        if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
                m_scan_it->value())) {
          checksums++;
        }

        if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
                                    rowkey_copy.length()))) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: "
                          "failed to fetch row by rowid",
                          table_name, rows);
          goto error;
        }

        longlong hidden_pk_id = 0;
        if (has_hidden_pk(table) &&
            read_hidden_pk_id_from_rowkey(&hidden_pk_id))
          goto error;

        /* Check if we get the same PK value */
        uint packed_size = m_pk_descr->pack_record(
            table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
            false, hidden_pk_id);
        if (packed_size != rowkey_copy.length() ||
            memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
                          table_name, rows);
          goto print_and_error;
        }

        /* Check if we get the same secondary key value */
        packed_size = m_key_descr_arr[keyno]->pack_record(
            table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
            &m_sk_tails, false, hidden_pk_id);
        if (packed_size != sec_key_copy.length() ||
            memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: "
                          "secondary index value mismatch",
                          table_name, rows);
          goto print_and_error;
        }
        rows++;
        continue;

      // Dump the offending rowkey / record / index entry, then fail.
      print_and_error : {
        std::string buf;
        buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: rowkey: %s", table_name, buf.c_str());

        buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: record: %s", table_name, buf.c_str());

        buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: index: %s", table_name, buf.c_str());

        goto error;
      }
      }
      // NO_LINT_DEBUG
      sql_print_information("CHECKTABLE %s: ... %lld index entries checked "
                            "(%lld had checksums)",
                            table_name, rows, checksums);

      if (first_index) {
        row_checksums = m_row_checksums_checked - row_checksums_at_start;
        first_index = false;
      }
      ha_index_end();
    }
  }
  if (row_checksums != ha_rows(-1)) {
    // NO_LINT_DEBUG
    sql_print_information("CHECKTABLE %s: %lld table records had checksums",
                          table_name, row_checksums);
  }
  extra(HA_EXTRA_NO_KEYREAD);

  m_verify_row_debug_checksums = save_verify_row_debug_checksums;
  /*
    TODO: we should check also for PK records that are missing in the secondary
    indexes.
    For that, need to walk through the PK and check that every PK record has a
    proper counterpart in each secondary index.
  */
  DBUG_RETURN(HA_ADMIN_OK);
error:
  m_verify_row_debug_checksums = save_verify_row_debug_checksums;
  ha_index_or_rnd_end();
  extra(HA_EXTRA_NO_KEYREAD);

  DBUG_RETURN(HA_ADMIN_CORRUPT);
}
6060
dbug_dump_str(FILE * const out,const char * const str,int len)6061 static void dbug_dump_str(FILE *const out, const char *const str, int len) {
6062 fprintf(out, "\"");
6063 for (int i = 0; i < len; i++) {
6064 if (str[i] > 32)
6065 fprintf(out, "%c", str[i]);
6066 else
6067 fprintf(out, "\\%d", str[i]);
6068 }
6069 fprintf(out, "\"");
6070 }
6071
6072 /*
6073 Debugging help: dump the whole database into a human-readable file.
6074 Usage:
6075 dbug_dump_database(rdb);
6076 */
6077
dbug_dump_database(rocksdb::DB * const db)6078 void dbug_dump_database(rocksdb::DB *const db) {
6079 FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
6080 if (!out)
6081 return;
6082
6083 rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
6084 for (it->SeekToFirst(); it->Valid(); it->Next()) {
6085 rocksdb::Slice key = it->key();
6086 rocksdb::Slice val = it->value();
6087 dbug_dump_str(out, key.data(), key.size());
6088 fprintf(out, " -> ");
6089 dbug_dump_str(out, val.data(), val.size());
6090 fprintf(out, "\n");
6091 }
6092
6093 delete it;
6094 fclose(out);
6095 }
6096
/*
  Read a row under the given transaction and acquire a row lock on it
  (shared or exclusive depending on the handler's row-lock mode).

  @param tx             transaction to read and lock under
  @param column_family  column family holding the key
  @param key            packed key to look up
  @param value          OUT fetched value on success

  @return rocksdb::Status of the lookup/lock attempt
*/
rocksdb::Status ha_rocksdb::get_for_update(
    Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family,
    const rocksdb::Slice &key, std::string *const value) const {
  DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
  // RDB_LOCK_READ takes a shared lock; any other locking mode is exclusive.
  const bool exclusive = m_lock_rows != RDB_LOCK_READ;

  rocksdb::Status s = tx->get_for_update(column_family, key, value, exclusive);

  // If we have a lock conflict and we are running in READ COMMITTED mode,
  // release and reacquire the snapshot and then retry the get_for_update().
  if (s.IsBusy() && !s.IsDeadlock() &&
      my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
    tx->release_snapshot();
    tx->acquire_snapshot(false);

    s = tx->get_for_update(column_family, key, value, exclusive);
  }

  return s;
}
6117
/*
  Given a rowid (i.e. packed PK) as a parameter, get the record.

  @param buf        OUT record buffer in MySQL row format
  @param rowid      packed primary key bytes
  @param rowid_size length of 'rowid' in bytes

  @return
    0 - Ok
    other - HA_ERR error code.
*/

int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(rowid != nullptr);
  DBUG_ASSERT(table != nullptr);

  int rc;

  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] = "now signal Reached "
                       "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);

  bool found;
  rocksdb::Status s;

  // Non-locking reads go through a snapshot; locking reads go through
  // get_for_update() so the row lock is actually acquired.
  if (m_lock_rows == RDB_LOCK_NONE) {
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else {
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    // Remember the key we read; later calls (e.g. unlock_row) use it.
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);
    if (!rc)
      table->status = 0;
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
6182
index_next(uchar * const buf)6183 int ha_rocksdb::index_next(uchar *const buf) {
6184 DBUG_ENTER_FUNC();
6185
6186 bool moves_forward = true;
6187 ha_statistic_increment(&SSV::ha_read_next_count);
6188 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
6189 moves_forward = false;
6190 }
6191
6192 int rc = index_next_with_direction(buf, moves_forward);
6193 if (rc == HA_ERR_KEY_NOT_FOUND)
6194 rc = HA_ERR_END_OF_FILE;
6195
6196 DBUG_RETURN(rc);
6197 }
6198
index_prev(uchar * const buf)6199 int ha_rocksdb::index_prev(uchar *const buf) {
6200 DBUG_ENTER_FUNC();
6201
6202 bool moves_forward = false;
6203 ha_statistic_increment(&SSV::ha_read_prev_count);
6204 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
6205 moves_forward = true;
6206 }
6207
6208 int rc = index_next_with_direction(buf, moves_forward);
6209 if (rc == HA_ERR_KEY_NOT_FOUND)
6210 rc = HA_ERR_END_OF_FILE;
6211
6212 DBUG_RETURN(rc);
6213 }
6214
/*
  Step the scan iterator one record in the given physical direction and
  read the record it lands on.

  @param buf          OUT record buffer
  @param move_forward physical direction; callers have already adjusted it
                      for reverse-ordered column families
  @return 0, HA_ERR_KEY_NOT_FOUND, or another HA_ERR code
*/
int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  if (active_index == pk_index(table, m_tbl_def)) {
    rc = rnd_next_with_direction(buf, move_forward);
  } else {
    // Right after a Seek the iterator already points at the record we
    // want, so the first Next()/Prev() must be skipped.
    if (m_skip_scan_it_next_call) {
      m_skip_scan_it_next_call = false;
    } else {
      if (move_forward)
        m_scan_it->Next(); /* this call cannot fail */
      else
        m_scan_it->Prev();
    }
    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc)
      rc = secondary_index_read(active_index, buf);
  }

  DBUG_RETURN(rc);
}
6238
index_first(uchar * const buf)6239 int ha_rocksdb::index_first(uchar *const buf) {
6240 DBUG_ENTER_FUNC();
6241
6242 m_sk_match_prefix = nullptr;
6243 ha_statistic_increment(&SSV::ha_read_first_count);
6244 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
6245 ? index_last_intern(buf)
6246 : index_first_intern(buf);
6247 if (rc == HA_ERR_KEY_NOT_FOUND)
6248 rc = HA_ERR_END_OF_FILE;
6249
6250 DBUG_RETURN(rc);
6251 }
6252
index_last(uchar * const buf)6253 int ha_rocksdb::index_last(uchar *const buf) {
6254 DBUG_ENTER_FUNC();
6255
6256 m_sk_match_prefix = nullptr;
6257 ha_statistic_increment(&SSV::ha_read_last_count);
6258 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
6259 ? index_first_intern(buf)
6260 : index_last_intern(buf);
6261 if (rc == HA_ERR_KEY_NOT_FOUND)
6262 rc = HA_ERR_END_OF_FILE;
6263
6264 DBUG_RETURN(rc);
6265 }
6266
/*
  Start scanning from the "first" value.

  The 'first' here means "the first from start of the key space".
  For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.

  A picture of a forward-ordered keyspace (remember, the keys have form
  'indexnr-keyval'. Suppose the index we are at has number n)

      (n-1) - ...
      ( n )        <--- 1. (n) doesn't exist in the db but it would be here.
      ( n ) - aaa  <--- 2. Seek("n") will put us here on the first index
      ( n ) - bbb         record.
      ( n ) - cc

  So, need to do: Seek(n);

  A backward-ordered keyspace:

      (n+1) - bbb
      (n+1) - aaa
      (n+1)        <--- (n+1) doesn't exist in the db but would be here.
      ( n ) - ccc  <--- 1. We need to be here.
      ( n ) - bbb
      ( n ) - aaa
      ( n )

  So, need to: Seek(n+1);

*/

int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // See the diagram above: reverse CFs seek to the supremum of the index's
  // key range, forward CFs to the infimum.
  if (kd.m_is_reverse_cf) {
    kd.get_supremum_key(key, &key_size);
  } else {
    kd.get_infimum_key(key, &key_size);
  }

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, !kd.m_is_reverse_cf,
                        Rdb_key_def::INDEX_NUMBER_SIZE);
    // The Seek above already positions on the first record; skip the
    // iterator advance in index_next_with_direction().
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; // exit the loop

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  DBUG_RETURN(rc);
}
6346
/**
  @details
  Start scanning from the "last" value

  The 'last' here means "the last from start of the key space".
  For reverse-ordered key spaces, we will actually read the smallest value.

  A picture of a forward-ordered keyspace (remember, the keys have form
  'indexnr-keyval'. Suppose we are at a key that has number n)

     (n-1)-something
     ( n )-aaa
     ( n )-bbb
     ( n )-ccc   <----------- Need to seek to here.
     (n+1)       <---- Doesn't exist, but would be here.
     (n+1)-smth, or no value at all

  RocksDB's Iterator::Seek($val) seeks to "at $val or first value that's
  greater". We can't seek to "(n)-ccc" directly, because we don't know what
  is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
  to "(n+1)", which is the least possible value that's greater than any value
  in index #n. Then we step one record back.

  So, need to: it->Seek(n+1) || it->SeekToLast(); it->Prev();

  A backward-ordered keyspace:

     (n+1)-something
     ( n ) - ccc
     ( n ) - bbb
     ( n ) - aaa  <---------------- (*) Need to seek here.
     ( n )        <--- Doesn't exist, but would be here.
     (n-1)-smth, or no value at all

  So, need to: it->Seek(n) || it->SeekToLast(); it->Prev();
*/

int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // See the diagram above: to step back onto the last record of index n we
  // seek to the first key after it (infimum for reverse CFs, supremum for
  // forward CFs).
  if (kd.m_is_reverse_cf) {
    kd.get_infimum_key(key, &key_size);
  } else {
    kd.get_supremum_key(key, &key_size);
  }

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, kd.m_is_reverse_cf,
                        Rdb_key_def::INDEX_NUMBER_SIZE);

    if (!m_scan_it->Valid()) {
      /*
        RocksDB keyspace has no records that follow our index. It could be, our
        index is the last in the keyspace. Navigate to last record in the
        keyspace.
      */
      m_scan_it->SeekToLast();
    } else {
      /*
        Iterator is pointing at some record right after the end of the index.
        Move one record back.
      */
      m_scan_it->Prev();
    }
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      // The iterator is already positioned; skip the advance in
      // rnd_next_with_direction().
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc)
        rc = secondary_index_read(active_index, buf);
    }

    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  DBUG_RETURN(rc);
}
6455
unlock_row()6456 void ha_rocksdb::unlock_row() {
6457 DBUG_ENTER_FUNC();
6458
6459 if (m_lock_rows != RDB_LOCK_NONE) {
6460 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
6461 tx->release_lock(m_pk_descr->get_cf(),
6462 std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
6463 }
6464
6465 DBUG_VOID_RETURN;
6466 }
6467
6468 /*
6469 Returning true if SingleDelete can be used.
6470 - Secondary Indexes can always use SingleDelete.
6471 - If the index is PRIMARY KEY, and if all of the columns of the table
6472 are covered by the PRIMARY KEY, SingleDelete can be used.
6473 */
can_use_single_delete(const uint & index) const6474 bool ha_rocksdb::can_use_single_delete(const uint &index) const {
6475 return (index != pk_index(table, m_tbl_def) ||
6476 (!has_hidden_pk(table) &&
6477 table->key_info[index].actual_key_parts == table->s->fields));
6478 }
6479
skip_unique_check() const6480 bool ha_rocksdb::skip_unique_check() const {
6481 /*
6482 We want to skip unique checks if:
6483 1) bulk_load is on
6484 2) the user set unique_checks option to 0
6485 */
6486 return THDVAR(table->in_use, bulk_load) ||
6487 my_core::thd_test_options(table->in_use, OPTION_RELAXED_UNIQUE_CHECKS);
6488 }
6489
commit_in_the_middle()6490 bool ha_rocksdb::commit_in_the_middle() {
6491 return THDVAR(table->in_use, bulk_load) ||
6492 THDVAR(table->in_use, commit_in_the_middle);
6493 }
6494
6495 /*
6496 Executing bulk commit if it should.
6497 @retval true if bulk commit failed
6498 @retval false if bulk commit was skipped or succeeded
6499 */
do_bulk_commit(Rdb_transaction * const tx)6500 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
6501 DBUG_ASSERT(tx != nullptr);
6502 return commit_in_the_middle() &&
6503 tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
6504 tx->flush_batch();
6505 }
6506
6507 /*
6508 If table was created without primary key, SQL layer represents the primary
6509 key number as MAX_INDEXES. Hence, this function returns true if the table
6510 does not contain a primary key. (In which case we generate a hidden
6511 'auto-incremented' pk.)
6512 */
has_hidden_pk(const TABLE * const table) const6513 bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
6514 DBUG_ASSERT(table != nullptr);
6515 return Rdb_key_def::table_has_hidden_pk(table);
6516 }
6517
6518 /*
6519 Returns true if given index number is a hidden_pk.
6520 - This is used when a table is created with no primary key.
6521 */
is_hidden_pk(const uint index,const TABLE * const table_arg,const Rdb_tbl_def * const tbl_def_arg)6522 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
6523 const Rdb_tbl_def *const tbl_def_arg) {
6524 DBUG_ASSERT(table_arg != nullptr);
6525 DBUG_ASSERT(table_arg->s != nullptr);
6526 DBUG_ASSERT(tbl_def_arg != nullptr);
6527
6528 return (table_arg->s->primary_key == MAX_INDEXES &&
6529 index == tbl_def_arg->m_key_count - 1);
6530 }
6531
6532 /* Returns index of primary key */
pk_index(const TABLE * const table_arg,const Rdb_tbl_def * const tbl_def_arg)6533 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
6534 const Rdb_tbl_def *const tbl_def_arg) {
6535 DBUG_ASSERT(table_arg != nullptr);
6536 DBUG_ASSERT(table_arg->s != nullptr);
6537 DBUG_ASSERT(tbl_def_arg != nullptr);
6538
6539 return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
6540 : table_arg->s->primary_key;
6541 }
6542
6543 /* Returns true if given index number is a primary key */
is_pk(const uint index,const TABLE * const table_arg,const Rdb_tbl_def * const tbl_def_arg)6544 bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
6545 const Rdb_tbl_def *const tbl_def_arg) {
6546 DBUG_ASSERT(table_arg != nullptr);
6547 DBUG_ASSERT(table_arg->s != nullptr);
6548 DBUG_ASSERT(tbl_def_arg != nullptr);
6549
6550 return index == table_arg->s->primary_key ||
6551 is_hidden_pk(index, table_arg, tbl_def_arg);
6552 }
6553
get_key_name(const uint index,const TABLE * const table_arg,const Rdb_tbl_def * const tbl_def_arg)6554 const char *ha_rocksdb::get_key_name(const uint index,
6555 const TABLE *const table_arg,
6556 const Rdb_tbl_def *const tbl_def_arg) {
6557 DBUG_ASSERT(table_arg != nullptr);
6558
6559 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
6560 return HIDDEN_PK_NAME;
6561 }
6562
6563 return table_arg->key_info[index].name;
6564 }
6565
get_key_comment(const uint index,const TABLE * const table_arg,const Rdb_tbl_def * const tbl_def_arg)6566 const char *ha_rocksdb::get_key_comment(const uint index,
6567 const TABLE *const table_arg,
6568 const Rdb_tbl_def *const tbl_def_arg) {
6569 DBUG_ASSERT(table_arg != nullptr);
6570
6571 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
6572 return nullptr;
6573 }
6574
6575 return table_arg->key_info[index].comment.str;
6576 }
6577
/**
  Insert the row in 'buf' (which must be table->record[0]) into the table,
  allocating an auto-increment value first when the table has one.

  @param buf  row image in MySQL row format
  @return 0 on success, HA_ERR error code otherwise
*/
int ha_rocksdb::write_row(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(buf == table->record[0]);
  DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);

  ha_statistic_increment(&SSV::ha_write_count);
  /*
    Note: "buf == table->record[0]" is copied from innodb. I am not aware of
    any use cases where this condition is not true.
  */
  if (table->next_number_field && buf == table->record[0]) {
    int err;
    if ((err = update_auto_increment())) {
      DBUG_RETURN(err);
    }
  }

  // nullptr old_data == this is an INSERT, not an UPDATE.
  const int rv = update_write_row(nullptr, buf, skip_unique_check());

  if (rv == 0) {
    update_row_stats(ROWS_INSERTED);
  }

  DBUG_RETURN(rv);
}
6605
/**
  Constructing m_last_rowkey (MyRocks key expression) from
  before_update|delete image (MySQL row expression).
  m_last_rowkey is normally set during lookup phase, such as
  rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
  these read functions are skipped and update_rows(), delete_rows() are
  called without setting m_last_rowkey. This function sets m_last_rowkey
  for Read Free Replication.

  @param old_data  before-image of the row; may be nullptr (no-op then)
*/
void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
  if (old_data && use_read_free_rpl()) {
    const int old_pk_size = m_pk_descr->pack_record(
        table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
                       &my_charset_bin);
  }
}
6623
/*
  Compute the new primary-key slice for an INSERT/UPDATE and store it in
  row_info->new_pk_slice (backed by m_pk_packed_tuple). For tables with a
  hidden pk, also fills in row_info->hidden_pk_id.

  @return HA_EXIT_SUCCESS, or HA_ERR_INTERNAL_ERROR if the hidden pk id
          cannot be decoded from the old rowkey.
*/
int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
  int size;

  /*
    Get new row key for any insert, and any update where the pk is not hidden.
    Row key for updates with hidden pk is handled below.
  */
  if (!has_hidden_pk(table)) {
    row_info->hidden_pk_id = 0;

    row_info->new_pk_unpack_info = &m_pk_unpack_info;

    size = m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data,
                                   m_pk_packed_tuple,
                                   row_info->new_pk_unpack_info, false);
  } else if (row_info->old_data == nullptr) {
    // INSERT with a hidden pk: allocate the next auto-generated id.
    row_info->hidden_pk_id = update_hidden_pk_val();
    size =
        m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
  } else {
    /*
      If hidden primary key, rowkey for new record will always be the same as
      before
    */
    size = row_info->old_pk_slice.size();
    memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
    if (read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id)) {
      return HA_ERR_INTERNAL_ERROR;
    }
  }

  row_info->new_pk_slice =
      rocksdb::Slice((const char *)m_pk_packed_tuple, size);

  return HA_EXIT_SUCCESS;
}
6660
/*
  Check for a duplicate primary key and acquire the row lock needed before
  writing the new pk entry.

  @param key_id     index number of the primary key
  @param row_info   old/new row images and packed pk slices
  @param found      OUT true if a row with the new pk already exists
  @param pk_changed OUT true if an UPDATE moved the row to a new pk value

  @return HA_EXIT_SUCCESS, or an error from set_status_error()
*/
int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id,
                                         const struct update_row_info &row_info,
                                         bool *const found,
                                         bool *const pk_changed) {
  DBUG_ASSERT(found != nullptr);
  DBUG_ASSERT(pk_changed != nullptr);

  *pk_changed = false;

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  if (row_info.old_pk_slice.size() > 0) {
    /*
      If the keys are the same, then no lock is needed
    */
    if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice,
                                             row_info.old_pk_slice)) {
      *found = false;
      return HA_EXIT_SUCCESS;
    }

    *pk_changed = true;
  }

  /*
    Perform a read to determine if a duplicate entry exists. For primary
    keys, a point lookup will be sufficient.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  /*
    To prevent race conditions like below, it is necessary to
    take a lock for a target row. get_for_update() holds a gap lock if
    target key does not exist, so below conditions should never
    happen.

    1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
       -> T2 Put(overwrite) -> T2 commit
    2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
       -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
  */
  const rocksdb::Status s =
      get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
                     &m_retrieved_record);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(table->in_use, s,
                                         *m_key_descr_arr[key_id], m_tbl_def);
  }

  *found = !s.IsNotFound();
  return HA_EXIT_SUCCESS;
}
6717
/*
  For a unique secondary index, check whether the new key value already
  exists and take the row lock needed to keep it unique.

  @param key_id   index number of the secondary key
  @param row_info old/new row images and transaction
  @param found    OUT true if a conflicting entry exists

  @return HA_EXIT_SUCCESS, or an error from set_status_error()
*/
int ha_rocksdb::check_and_lock_sk(const uint &key_id,
                                  const struct update_row_info &row_info,
                                  bool *const found) const {
  DBUG_ASSERT(found != nullptr);
  *found = false;

  /*
    Can skip checking this key if none of the key fields have changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
    return HA_EXIT_SUCCESS;
  }

  KEY *key_info = nullptr;
  uint n_null_fields = 0;
  uint user_defined_key_parts = 1;

  key_info = &table->key_info[key_id];
  user_defined_key_parts = key_info->user_defined_key_parts;
  /*
    If there are no uniqueness requirements, there's no need to obtain a
    lock for this key.
  */
  if (!(key_info->flags & HA_NOSAME)) {
    return HA_EXIT_SUCCESS;
  }

  const Rdb_key_def &kd = *m_key_descr_arr[key_id];

  /*
    Calculate the new key for obtaining the lock

    For unique secondary indexes, the key used for locking does not
    include the extended fields.
  */
  int size =
      kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
                     nullptr, false, 0, user_defined_key_parts, &n_null_fields);
  if (n_null_fields > 0) {
    /*
      If any fields are marked as NULL this will never match another row
      since NULL never matches anything else, including another NULL.
    */
    return HA_EXIT_SUCCESS;
  }

  const rocksdb::Slice new_slice =
      rocksdb::Slice((const char *)m_sk_packed_tuple, size);

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  if (row_info.old_data != nullptr) {
    size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
                          m_sk_packed_tuple_old, nullptr, false,
                          row_info.hidden_pk_id, user_defined_key_parts);
    const rocksdb::Slice old_slice =
        rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);

    /*
      For updates, if the keys are the same, then no lock is needed

      Also check to see if the key has any fields set to NULL. If it does, then
      this key is unique since NULL is not equal to each other, so no lock is
      needed.
    */
    if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) {
      return HA_EXIT_SUCCESS;
    }
  }

  /*
    Perform a read to determine if a duplicate entry exists - since this is
    a secondary indexes a range scan is needed.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());

  /*
    This iterator seems expensive since we need to allocate and free
    memory for each unique index.

    If this needs to be optimized, for keys without NULL fields, the
    extended primary key fields can be migrated to the value portion of the
    key. This enables using Get() instead of Seek() as in the primary key
    case.

    The bloom filter may need to be disabled for this lookup.
  */
  const bool total_order_seek = !can_use_bloom_filter(
      ha_thd(), kd, new_slice, all_parts_used,
      is_ascending(*m_key_descr_arr[key_id], HA_READ_KEY_EXACT));
  const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);

  /*
    psergey-todo: we just need to take lock, lookups not needed:
  */
  std::string dummy_value;
  const rocksdb::Status s =
      get_for_update(row_info.tx, kd.get_cf(), new_slice, &dummy_value);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
  }

  rocksdb::Iterator *const iter = row_info.tx->get_iterator(
      kd.get_cf(), total_order_seek, fill_cache, true /* read current data */,
      false /* acquire snapshot */);
  /*
    Need to scan the transaction to see if there is a duplicate key.
    Also need to scan RocksDB and verify the key has not been deleted
    in the transaction.
  */
  iter->Seek(new_slice);
  *found = !read_key_exact(kd, iter, all_parts_used, new_slice);
  delete iter;

  return HA_EXIT_SUCCESS;
}
6840
/*
  Check every unique index for a conflicting entry and lock the new key
  values before the row is written.

  @param row_info   old/new row images and transaction
  @param pk_changed OUT set when an UPDATE changes the pk value
  @return HA_EXIT_SUCCESS, HA_ERR_FOUND_DUPP_KEY, or a locking error
*/
int ha_rocksdb::check_uniqueness_and_lock(
    const struct update_row_info &row_info, bool *const pk_changed) {
  /*
    Go through each index and determine if the index has uniqueness
    requirements. If it does, then try to obtain a row lock on the new values.
    Once all locks have been obtained, then perform the changes needed to
    update/insert the row.
  */
  for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
    bool found;
    int rc;

    if (is_pk(key_id, table, m_tbl_def)) {
      rc = check_and_lock_unique_pk(key_id, row_info, &found, pk_changed);
    } else {
      rc = check_and_lock_sk(key_id, row_info, &found);
    }

    if (rc != 0) {
      return rc;
    }

    if (found) {
      /* There is a row with this key already, so error out. */
      errkey = key_id;
      m_dupp_errkey = errkey;
      return HA_ERR_FOUND_DUPP_KEY;
    }
  }

  return HA_EXIT_SUCCESS;
}
6873
/*
  Compare the secondary-key portion of 'key' (without the extended pk tail)
  against the previously seen key recorded in 'sk_info'.

  @return 1 if it duplicates the previous key and contains no NULL fields,
          0 otherwise (the current key is then remembered for the next call).
*/
int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
                                   const Rdb_key_def &index,
                                   const rocksdb::Slice *key,
                                   struct unique_sk_buf_info *sk_info) {
  uint n_null_fields = 0;
  const rocksdb::Comparator *index_comp = index.get_cf()->GetComparator();

  /* Get proper SK buffer. */
  uchar *sk_buf = sk_info->swap_and_get_sk_buf();

  /* Get memcmp form of sk without extended pk tail */
  uint sk_memcmp_size =
      index.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);

  sk_info->sk_memcmp_key =
      rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);

  // Keys containing NULL fields are never treated as duplicates, because
  // NULL does not compare equal to anything, including another NULL.
  if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
      index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
          0) {
    return 1;
  }

  sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
  return 0;
}
6900
/*
  Add a key/value pair to the SST file being built for bulk load, creating
  the Rdb_sst_info writer on first use and registering this handler with
  the transaction so the load can be finalized later.

  @return 0 on success, HA_ERR_INTERNAL_ERROR if the sst writer fails
*/
int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value) {
  rocksdb::ColumnFamilyHandle *const cf = kd.get_cf();
  DBUG_ASSERT(cf != nullptr);

  if (m_sst_info == nullptr) {
    // Lazily create the SST writer for this table/index/column family.
    m_sst_info = std::make_shared<Rdb_sst_info>(
        rdb, m_table_handler->m_table_name, kd.get_name(), cf,
        rocksdb_db_options, THDVAR(ha_thd(), trace_sst_api));
    tx->start_bulk_load(this);
    m_bulk_load_tx = tx;
  }

  DBUG_ASSERT(m_sst_info != nullptr);

  int rc = m_sst_info->put(key, value);
  if (rc != 0) {
    my_printf_error(ER_UNKNOWN_ERROR,
                    "Failed to add a key to sst file writer(%s)", MYF(0),
                    m_sst_info->error_message().c_str());
    rc = HA_ERR_INTERNAL_ERROR;
  }

  return rc;
}
6927
/*
  Commit any SST file this handler has been building via bulk_load_key()
  and detach from the bulk-load transaction.

  @return 0 on success, HA_ERR_INTERNAL_ERROR if the sst commit fails
*/
int ha_rocksdb::finalize_bulk_load() {
  int rc = 0;

  /* Skip if there are no possible ongoing bulk loads */
  // NOTE(review): m_sst_info/m_bulk_load_tx are read here before taking
  // m_bulk_load_mutex; presumably a benign fast-path since the state is
  // re-checked under the mutex below -- confirm the unsynchronized read
  // is acceptable.
  if (m_sst_info == nullptr && m_bulk_load_tx == nullptr) {
    return rc;
  }

  mysql_mutex_lock(&m_bulk_load_mutex);

  /*
    We need this check because it's possible that m_sst_info has been
    flushed and cleared by another thread by the time the mutex has been
    acquired.
  */
  if (m_sst_info != nullptr) {
    rc = m_sst_info->commit();
    if (rc != 0) {
      my_printf_error(ER_UNKNOWN_ERROR,
                      "Failed to commit bulk loaded sst file to the "
                      "data store (%s)",
                      MYF(0), m_sst_info->error_message().c_str());
      rc = HA_ERR_INTERNAL_ERROR;
    }

    m_sst_info = nullptr;
    m_bulk_load_tx->end_bulk_load(this);
    m_bulk_load_tx = nullptr;
  }

  mysql_mutex_unlock(&m_bulk_load_mutex);
  return rc;
}
6961
/*
  Write the primary-key entry for an INSERT or UPDATE, deleting the old
  pk entry first when an UPDATE changed the key. Chooses between the bulk
  load path, blind/indexed write batches, and a locking Put depending on
  session options and transaction state.

  @param kd         primary key definition
  @param row_info   old/new row images, packed slices and transaction
  @param pk_changed whether the pk value differs from the old row's
  @return 0 on success, HA_ERR_FOUND_DUPP_KEY or another HA_ERR code
*/
int ha_rocksdb::update_pk(const Rdb_key_def &kd,
                          const struct update_row_info &row_info,
                          const bool &pk_changed) {
  const uint key_id = kd.get_keyno();
  const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
  if (!hidden_pk && pk_changed) {
    /*
      The old key needs to be deleted.
    */
    const rocksdb::Status s = delete_or_singledelete(
        key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
    }
  }

  if (table->next_number_field) {
    update_auto_incr_val();
  }

  rocksdb::Slice value_slice;
  convert_record_to_storage_format(row_info.new_pk_slice,
                                   row_info.new_pk_unpack_info, &value_slice);

  int rc = 0;
  const auto cf = m_pk_descr->get_cf();
  if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
      !hidden_pk) {
    /*
      Write the primary key directly to an SST file using an SstFileWriter
    */
    rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice);
  } else if (row_info.skip_unique_check) {
    /*
      It is responsibility of the user to make sure that the data being
      inserted doesn't violate any unique keys.
    */
    row_info.tx->get_blind_write_batch()->Put(cf, row_info.new_pk_slice,
                                              value_slice);
  } else if (row_info.tx->m_ddl_transaction) {
    /*
      DDL statement must check for unique key conflicts. For example:
      ALTER TABLE tbl DROP PRIMARY KEY, ADD PRIMARY KEY(non_unique_column)
    */
    row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
                                                value_slice);
  } else {
    const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice);
    if (!s.ok()) {
      // Busy is reported to the SQL layer as a duplicate-key error on
      // the primary key.
      if (s.IsBusy()) {
        errkey = table->s->primary_key;
        m_dupp_errkey = errkey;
        rc = HA_ERR_FOUND_DUPP_KEY;
      } else {
        rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
                                           m_tbl_def);
      }
    }
  }

  return rc;
}
7024
/*
  Write the secondary-key entry for an INSERT or UPDATE: delete the old
  entry when it changed, then write the new one. Unchanged keys are left
  untouched.

  @param table_arg table definition for packing
  @param kd        secondary key definition
  @param row_info  old/new row images and transaction
  @return HA_EXIT_SUCCESS (write-batch operations do not fail here)
*/
int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
                          const struct update_row_info &row_info) {
  int new_packed_size;
  int old_packed_size;

  rocksdb::Slice new_key_slice;
  rocksdb::Slice new_value_slice;
  rocksdb::Slice old_key_slice;

  const uint key_id = kd.get_keyno();
  /*
    Can skip updating this key if none of the key fields have changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
    return HA_EXIT_SUCCESS;
  }

  const bool store_row_debug_checksums = should_store_row_debug_checksums();

  new_packed_size = kd.pack_record(
      table_arg, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
      &m_sk_tails, store_row_debug_checksums, row_info.hidden_pk_id);

  if (row_info.old_data != nullptr) {
    // The old value
    old_packed_size = kd.pack_record(
        table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
        &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id);

    /*
      Check if we are going to write the same value. This can happen when
      one does
        UPDATE tbl SET col='foo'
      and we are looking at the row that already has col='foo'.

      We also need to compare the unpack info. Suppose, the collation is
      case-insensitive, and unpack info contains information about whether
      the letters were uppercase and lowercase. Then, both 'foo' and 'FOO'
      will have the same key value, but different data in unpack_info.

      (note: anyone changing bytewise_compare should take this code into
      account)
    */
    if (old_packed_size == new_packed_size &&
        m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
        memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
            0 &&
        memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
               m_sk_tails.get_current_pos()) == 0) {
      return HA_EXIT_SUCCESS;
    }

    /*
      Deleting entries from secondary index should skip locking, but
      be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
      statement)
    */
    old_key_slice = rocksdb::Slice(
        reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);

    row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                         old_key_slice);
  }

  /*
    We're writing a new entry for secondary key. We can skip locking; we
    should write to
    - WriteBatchWithIndex normally (so that transaction sees the new row)
    - non-indexed WriteBatch, when we don't need to see the new row:
       = when doing a DDL operation and writing to a non-unique index, or
       = when doing a bulk load
  */
  rocksdb::WriteBatchBase *write_batch;
  if ((row_info.tx->m_ddl_transaction &&
       !(table_arg->key_info[key_id].flags & HA_NOSAME)) ||
      row_info.skip_unique_check) {
    write_batch = row_info.tx->get_blind_write_batch();
  } else {
    write_batch = row_info.tx->get_indexed_write_batch();
  }

  new_key_slice = rocksdb::Slice(
      reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
  new_value_slice =
      rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
                     m_sk_tails.get_current_pos());

  write_batch->Put(kd.get_cf(), new_key_slice, new_value_slice);

  return HA_EXIT_SUCCESS;
}
7117
update_indexes(const struct update_row_info & row_info,const bool & pk_changed)7118 int ha_rocksdb::update_indexes(const struct update_row_info &row_info,
7119 const bool &pk_changed) {
7120 int rc;
7121
7122 for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
7123 const Rdb_key_def &kd = *m_key_descr_arr[key_id];
7124 if (is_pk(key_id, table, m_tbl_def)) {
7125 rc = update_pk(kd, row_info, pk_changed);
7126 } else {
7127 rc = update_sk(table, kd, row_info);
7128 }
7129
7130 if (rc != 0) {
7131 return rc;
7132 }
7133 }
7134
7135 return HA_EXIT_SUCCESS;
7136 }
7137
/*
  Write a row: INSERT when old_data == nullptr, otherwise UPDATE of the row
  whose previous image is old_data.

  @param old_data           previous row image, or nullptr for an insert
  @param new_data           new row image
  @param skip_unique_check  true to bypass unique-key checking and locking

  @return 0 on success, or a handler error code
*/
int ha_rocksdb::update_write_row(const uchar *const old_data,
                                 const uchar *const new_data,
                                 const bool skip_unique_check) {
  DBUG_ENTER_FUNC();

  bool pk_changed = false;
  struct update_row_info row_info;

  row_info.old_data = old_data;
  row_info.new_data = new_data;
  row_info.skip_unique_check = skip_unique_check;
  row_info.new_pk_unpack_info = nullptr;

  set_last_rowkey(old_data);

  row_info.tx = get_or_create_tx(table->in_use);

  if (old_data != nullptr) {
    // m_last_rowkey was set from old_data by set_last_rowkey() above.
    row_info.old_pk_slice =
        rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());

    /* Determine which indexes need updating. */
    calc_updated_indexes();
  }

  /*
    Get the new row key into row_info.new_pk_slice
  */
  int rc = get_pk_for_update(&row_info);
  if (rc != 0) {
    DBUG_RETURN(rc);
  }

  if (!skip_unique_check) {
    /*
      Check to see if we are going to have failures because of unique
      keys.  Also lock the appropriate key values.
    */
    rc = check_uniqueness_and_lock(row_info, &pk_changed);
    if (rc != 0) {
      DBUG_RETURN(rc);
    }
  }

  DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");

  /*
    At this point, all locks have been obtained, and all checks for duplicate
    keys have been performed. No further errors can be allowed to occur from
    here because updates to the transaction will be made and those updates
    cannot be easily removed without rolling back the entire transaction.
  */
  rc = update_indexes(row_info, pk_changed);
  if (rc != 0) {
    DBUG_RETURN(rc);
  }

  // May commit the batch mid-statement (bulk load / commit-in-the-middle).
  if (do_bulk_commit(row_info.tx)) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7201
7202 /*
7203 Open a cursor and position it at the passed record
7204 */
7205
setup_scan_iterator(const Rdb_key_def & kd,rocksdb::Slice * const slice,const bool use_all_keys,const bool is_ascending,const uint eq_cond_len)7206 void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
7207 rocksdb::Slice *const slice,
7208 const bool use_all_keys,
7209 const bool is_ascending,
7210 const uint eq_cond_len) {
7211 DBUG_ASSERT(slice != nullptr);
7212 DBUG_ASSERT(slice->size() >= eq_cond_len);
7213
7214 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7215
7216 bool skip_bloom = true;
7217
7218 const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
7219 if (can_use_bloom_filter(ha_thd(), kd, eq_cond, use_all_keys, is_ascending)) {
7220 skip_bloom = false;
7221 }
7222
7223 /*
7224 In some cases, setup_scan_iterator() is called multiple times from
7225 the same query but bloom filter can not always be used.
7226 Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
7227 (id1, id2).
7228 select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
7229 '100');
7230 In this case, setup_scan_iterator() is called twice, the first time is for
7231 (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
7232 '100').
7233 If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
7234 for the
7235 first condition but not for the second condition.
7236 If bloom filter condition is changed, currently it is necessary to destroy
7237 and
7238 re-create Iterator.
7239 */
7240 if (m_scan_it_skips_bloom != skip_bloom) {
7241 release_scan_iterator();
7242 }
7243
7244 /*
7245 SQL layer can call rnd_init() multiple times in a row.
7246 In that case, re-use the iterator, but re-position it at the table start.
7247 */
7248 if (!m_scan_it) {
7249 const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
7250 if (commit_in_the_middle()) {
7251 DBUG_ASSERT(m_scan_it_snapshot == nullptr);
7252 m_scan_it_snapshot = rdb->GetSnapshot();
7253
7254 auto read_opts = rocksdb::ReadOptions();
7255 read_opts.total_order_seek = true; // TODO: set based on WHERE conditions
7256 read_opts.snapshot = m_scan_it_snapshot;
7257 m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
7258 } else {
7259 m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache);
7260 }
7261 m_scan_it_skips_bloom = skip_bloom;
7262 }
7263 /*
7264 Seek() will "Position at the first key in the source that at or past
7265 target".
7266 The operation cannot fail.
7267 */
7268 m_scan_it->Seek(*slice);
7269 }
7270
release_scan_iterator()7271 void ha_rocksdb::release_scan_iterator() {
7272 delete m_scan_it;
7273 m_scan_it = nullptr;
7274
7275 if (m_scan_it_snapshot) {
7276 rdb->ReleaseSnapshot(m_scan_it_snapshot);
7277 m_scan_it_snapshot = nullptr;
7278 }
7279 }
7280
setup_iterator_for_rnd_scan()7281 void ha_rocksdb::setup_iterator_for_rnd_scan() {
7282 uint key_size;
7283
7284 if (m_pk_descr->m_is_reverse_cf)
7285 m_pk_descr->get_supremum_key(m_pk_packed_tuple, &key_size);
7286 else
7287 m_pk_descr->get_infimum_key(m_pk_packed_tuple, &key_size);
7288
7289 rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
7290
7291 setup_scan_iterator(*m_pk_descr, &table_key);
7292 m_skip_scan_it_next_call = true;
7293 }
7294
rnd_init(bool scan)7295 int ha_rocksdb::rnd_init(bool scan) {
7296 DBUG_ENTER_FUNC();
7297
7298 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7299 setup_read_decoders();
7300
7301 if (scan) {
7302 m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
7303 setup_iterator_for_rnd_scan();
7304 } else {
7305 /* We don't need any preparations for rnd_pos() calls. */
7306 }
7307
7308 // If m_lock_rows is on then we will be doing a get_for_update when accessing
7309 // the index, so don't acquire the snapshot right away. Otherwise acquire
7310 // the snapshot immediately.
7311 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
7312
7313 DBUG_RETURN(HA_EXIT_SUCCESS);
7314 }
7315
rnd_next(uchar * const buf)7316 int ha_rocksdb::rnd_next(uchar *const buf) {
7317 DBUG_ENTER_FUNC();
7318
7319 int rc;
7320 ha_statistic_increment(&SSV::ha_read_rnd_next_count);
7321 for (;;) {
7322 rc = rnd_next_with_direction(buf, true);
7323 if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot)
7324 break; /* exit the loop */
7325
7326 // release the snapshot and iterator and then regenerate them
7327 Rdb_transaction *tx = get_or_create_tx(table->in_use);
7328 tx->release_snapshot();
7329 release_scan_iterator();
7330 setup_iterator_for_rnd_scan();
7331 }
7332
7333 m_rnd_scan_is_new_snapshot = false;
7334
7335 if (rc == HA_ERR_KEY_NOT_FOUND)
7336 rc = HA_ERR_END_OF_FILE;
7337
7338 DBUG_RETURN(rc);
7339 }
7340
7341 /*
7342 See also secondary_index_read().
7343 */
rnd_next_with_direction(uchar * const buf,bool move_forward)7344 int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
7345 DBUG_ENTER_FUNC();
7346
7347 int rc;
7348
7349 table->status = STATUS_NOT_FOUND;
7350
7351 if (!m_scan_it || !m_scan_it->Valid()) {
7352 /*
7353 We can get here when SQL layer has called
7354
7355 h->index_init(PRIMARY);
7356 h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
7357
7358 In this case, we should return EOF.
7359 */
7360 DBUG_RETURN(HA_ERR_END_OF_FILE);
7361 }
7362
7363 for (;;) {
7364 if (m_skip_scan_it_next_call) {
7365 m_skip_scan_it_next_call = false;
7366 } else {
7367 if (move_forward)
7368 m_scan_it->Next(); /* this call cannot fail */
7369 else
7370 m_scan_it->Prev(); /* this call cannot fail */
7371 }
7372
7373 if (!m_scan_it->Valid()) {
7374 rc = HA_ERR_END_OF_FILE;
7375 break;
7376 }
7377
7378 /* check if we're out of this table */
7379 const rocksdb::Slice key = m_scan_it->key();
7380 if (!m_pk_descr->covers_key(key)) {
7381 rc = HA_ERR_END_OF_FILE;
7382 break;
7383 }
7384
7385 if (m_lock_rows != RDB_LOCK_NONE) {
7386 /*
7387 Lock the row we've just read.
7388
7389 Now we call get_for_update which will 1) Take a lock and 2) Will fail
7390 if the row was deleted since the snapshot was taken.
7391 */
7392 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7393 DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
7394 const rocksdb::Status s =
7395 get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
7396 if (s.IsNotFound() &&
7397 my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
7398 // This occurs if we accessed a row, tried to lock it, failed,
7399 // released and reacquired the snapshot (because of READ COMMITTED
7400 // mode) and the row was deleted by someone else in the meantime.
7401 // If so, we just want to move on to the next row.
7402 continue;
7403 }
7404
7405 if (!s.ok()) {
7406 DBUG_RETURN(
7407 tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
7408 }
7409
7410 // If we called get_for_update() use the value from that call not from
7411 // the iterator as it may be stale since we don't have a snapshot
7412 // when m_lock_rows is not RDB_LOCK_NONE.
7413 m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
7414 rc = convert_record_from_storage_format(&key, buf);
7415 } else {
7416 // Use the value from the iterator
7417 rocksdb::Slice value = m_scan_it->value();
7418 m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
7419 rc = convert_record_from_storage_format(&key, &value, buf);
7420 }
7421
7422 table->status = 0;
7423 break;
7424 }
7425
7426 if (!rc) {
7427 update_row_stats(ROWS_READ);
7428 }
7429
7430 DBUG_RETURN(rc);
7431 }
7432
rnd_end()7433 int ha_rocksdb::rnd_end() {
7434 DBUG_ENTER_FUNC();
7435
7436 release_scan_iterator();
7437
7438 DBUG_RETURN(HA_EXIT_SUCCESS);
7439 }
7440
index_init(uint idx,bool sorted)7441 int ha_rocksdb::index_init(uint idx, bool sorted) {
7442 DBUG_ENTER_FUNC();
7443
7444 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7445 DBUG_ASSERT(tx != nullptr);
7446
7447 setup_read_decoders();
7448
7449 // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
7450 // when accessing the index, so don't acquire the snapshot right away.
7451 // Otherwise acquire the snapshot immediately.
7452 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
7453
7454 active_index = idx;
7455
7456 DBUG_RETURN(HA_EXIT_SUCCESS);
7457 }
7458
index_end()7459 int ha_rocksdb::index_end() {
7460 DBUG_ENTER_FUNC();
7461
7462 release_scan_iterator();
7463
7464 active_index = MAX_KEY;
7465 in_range_check_pushed_down = FALSE;
7466
7467 DBUG_RETURN(HA_EXIT_SUCCESS);
7468 }
7469
truncate()7470 int ha_rocksdb::truncate() {
7471 DBUG_ENTER_FUNC();
7472
7473 DBUG_ASSERT(m_tbl_def != nullptr);
7474
7475 m_tbl_def->m_auto_incr_val = 1;
7476 m_tbl_def->m_hidden_pk_val = 1;
7477
7478 remove_rows(m_tbl_def);
7479
7480 DBUG_RETURN(HA_EXIT_SUCCESS);
7481 }
7482
7483 /*
7484 Delete the row we've last read. The row is also passed as parameter.
7485
7486 @detail
7487 The caller guarantees table buf points to the row that was just read.
7488 The row is either table->record[0] or table->record[1].
7489 (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)"
7490 */
delete_row(const uchar * const buf)7491 int ha_rocksdb::delete_row(const uchar *const buf) {
7492 DBUG_ENTER_FUNC();
7493
7494 DBUG_ASSERT(buf != nullptr);
7495
7496 ha_statistic_increment(&SSV::ha_delete_count);
7497 set_last_rowkey(buf);
7498
7499 rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
7500 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
7501
7502 const uint index = pk_index(table, m_tbl_def);
7503 rocksdb::Status s =
7504 delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
7505 if (!s.ok()) {
7506 DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
7507 }
7508
7509 longlong hidden_pk_id = 0;
7510 if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table) &&
7511 read_hidden_pk_id_from_rowkey(&hidden_pk_id))
7512 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7513
7514 // Delete the record for every secondary index
7515 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
7516 if (!is_pk(i, table, m_tbl_def)) {
7517 int packed_size;
7518 const Rdb_key_def &kd = *m_key_descr_arr[i];
7519 packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
7520 nullptr, false, hidden_pk_id);
7521 rocksdb::Slice secondary_key_slice(
7522 reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
7523 /* Deleting on secondary key doesn't need any locks: */
7524 tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
7525 secondary_key_slice);
7526 }
7527 }
7528
7529 if (do_bulk_commit(tx)) {
7530 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
7531 }
7532 update_row_stats(ROWS_DELETED);
7533
7534 DBUG_RETURN(HA_EXIT_SUCCESS);
7535 }
7536
delete_or_singledelete(uint index,Rdb_transaction * const tx,rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key)7537 rocksdb::Status ha_rocksdb::delete_or_singledelete(
7538 uint index, Rdb_transaction *const tx,
7539 rocksdb::ColumnFamilyHandle *const column_family,
7540 const rocksdb::Slice &key) {
7541 if (can_use_single_delete(index))
7542 return tx->single_delete(column_family, key);
7543 return tx->delete_key(column_family, key);
7544 }
7545
update_stats(void)7546 void ha_rocksdb::update_stats(void) {
7547 DBUG_ENTER_FUNC();
7548
7549 stats.records = 0;
7550 stats.index_file_length = 0ul;
7551 stats.data_file_length = 0ul;
7552 stats.mean_rec_length = 0;
7553
7554 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
7555 if (is_pk(i, table, m_tbl_def)) {
7556 stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
7557 stats.records = m_pk_descr->m_stats.m_rows;
7558 } else {
7559 stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
7560 }
7561 }
7562
7563 DBUG_VOID_RETURN;
7564 }
7565
info(uint flag)7566 int ha_rocksdb::info(uint flag) {
7567 DBUG_ENTER_FUNC();
7568
7569 if (!table)
7570 return HA_EXIT_FAILURE;
7571
7572 if (flag & HA_STATUS_VARIABLE) {
7573 /*
7574 Test only to simulate corrupted stats
7575 */
7576 DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
7577 m_pk_descr->m_stats.m_actual_disk_size =
7578 -m_pk_descr->m_stats.m_actual_disk_size;);
7579
7580 update_stats();
7581
7582 /*
7583 If any stats are negative due to bad cached stats, re-run analyze table
7584 and re-retrieve the stats.
7585 */
7586 if (static_cast<longlong>(stats.data_file_length) < 0 ||
7587 static_cast<longlong>(stats.index_file_length) < 0 ||
7588 static_cast<longlong>(stats.records) < 0) {
7589 if (analyze(nullptr, nullptr)) {
7590 DBUG_RETURN(HA_EXIT_FAILURE);
7591 }
7592
7593 update_stats();
7594 }
7595
7596 if (stats.records == 0) {
7597 // most likely, the table is in memtable
7598 // try to deduce from GetApproximateSizes
7599 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
7600 auto r = get_range(pk_index(table, m_tbl_def), buf);
7601 uint64_t sz = 0;
7602
7603 #pragma GCC diagnostic push
7604 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
7605 rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz, true);
7606 #pragma GCC diagnostic pop
7607
7608 stats.records = sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
7609 stats.data_file_length = sz;
7610
7611 if (rocksdb_debug_optimizer_n_rows > 0)
7612 stats.records = rocksdb_debug_optimizer_n_rows;
7613 }
7614
7615 if (stats.records != 0)
7616 stats.mean_rec_length = stats.data_file_length / stats.records;
7617 }
7618 if (flag & HA_STATUS_CONST) {
7619 ref_length = m_pk_descr->max_storage_fmt_length();
7620
7621 // TODO: Needs to reimplement after having real index statistics
7622 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
7623 if (is_hidden_pk(i, table, m_tbl_def)) {
7624 continue;
7625 }
7626 KEY *const k = &table->key_info[i];
7627 for (uint j = 0; j < k->actual_key_parts; j++) {
7628 const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
7629 uint x = k_stats.m_distinct_keys_per_prefix.size() > j &&
7630 k_stats.m_distinct_keys_per_prefix[j] > 0
7631 ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j]
7632 : 0;
7633 if (x > stats.records)
7634 x = stats.records;
7635 if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
7636 rocksdb_debug_optimizer_n_rows > 0) {
7637 // Fake cardinality implementation. For example, (idx1, idx2, idx3)
7638 // index
7639 // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1.
7640 // rec_per_key for the whole index is 1, and multiplied by 2^n if
7641 // n suffix columns of the index are not used.
7642 x = 1 << (k->actual_key_parts - j - 1);
7643 }
7644 k->rec_per_key[j] = x;
7645 }
7646 }
7647 }
7648
7649 if (flag & HA_STATUS_ERRKEY) {
7650 /*
7651 Currently we support only primary keys so we know which key had a
7652 uniqueness violation.
7653 */
7654 errkey = m_dupp_errkey;
7655 dup_ref = m_pk_tuple; // TODO(?): this should store packed PK.
7656 }
7657
7658 if (flag & HA_STATUS_AUTO) {
7659 stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
7660 }
7661
7662 DBUG_RETURN(HA_EXIT_SUCCESS);
7663 }
7664
/*
  Store the row's "position" — its packed primary key — into 'ref', so the
  SQL layer can later re-read the row with rnd_pos().
*/
void ha_rocksdb::position(const uchar *const record) {
  DBUG_ENTER_FUNC();

  longlong hidden_pk_id = 0;
  if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id))
    DBUG_ASSERT(false); // should never reach here

  /*
    Get packed primary key value from the record.

    (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
    handle most cases, but there is an exception to this: when slave applies
    RBR events, it fills the record and then calls position(); rnd_pos())

    Also note that we *can* rely on PK column values being available. This is
    because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
    When that is true, table->prepare_for_position() adds PK columns into the
    read set (this may potentially disable index-only access when PK column
    cannot be restored from its mem-comparable form in the secondary indexes).
  */
  const uint packed_size = m_pk_descr->pack_record(
      table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);

  /*
    It could be that mem-comparable form of PK occupies less than ref_length
    bytes. Fill the remainder with zeros.
  */
  if (ref_length > packed_size)
    memset(ref + packed_size, 0, ref_length - packed_size);

  DBUG_VOID_RETURN;
}
7697
rnd_pos(uchar * const buf,uchar * const pos)7698 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
7699 DBUG_ENTER_FUNC();
7700
7701 int rc;
7702 size_t len;
7703
7704 ha_statistic_increment(&SSV::ha_read_rnd_count);
7705 len = m_pk_descr->key_length(table,
7706 rocksdb::Slice((const char *)pos, ref_length));
7707 if (len == size_t(-1)) {
7708 DBUG_RETURN(HA_ERR_INTERNAL_ERROR); /* Data corruption? */
7709 }
7710
7711 rc = get_row_by_rowid(buf, pos, len);
7712
7713 if (!rc) {
7714 update_row_stats(ROWS_READ);
7715 }
7716
7717 DBUG_RETURN(rc);
7718 }
7719
7720 /*
7721 @brief
7722 Calculate (if needed) the bitmap of indexes that are modified by the
7723 current query
7724
7725 @detail
7726 The calculation done by checking index definitions against the
7727 table->write_set
7728 column bitmap.
7729 */
7730
calc_updated_indexes()7731 void ha_rocksdb::calc_updated_indexes() {
7732 if (!m_update_scope_is_valid) {
7733 m_update_scope_is_valid = true;
7734 m_update_scope.clear_all();
7735
7736 for (uint keynr = 0; keynr < table->s->keys; keynr++) {
7737 const Rdb_key_def &kd = *m_key_descr_arr[keynr];
7738 /* Walk over all key parts, including the "extended key" suffix */
7739 const uint key_parts = kd.get_key_parts();
7740 for (uint kp = 0; kp < key_parts; kp++) {
7741 if (has_hidden_pk(table) && kp + 1 == key_parts)
7742 break;
7743
7744 Field *const field = kd.get_table_field_for_part_no(table, kp);
7745 if (bitmap_is_set(table->write_set, field->field_index)) {
7746 m_update_scope.set_bit(keynr);
7747 break;
7748 }
7749 }
7750 }
7751 }
7752 }
7753
update_row(const uchar * const old_data,uchar * const new_data)7754 int ha_rocksdb::update_row(const uchar *const old_data, uchar *const new_data) {
7755 DBUG_ENTER_FUNC();
7756
7757 DBUG_ASSERT(old_data != nullptr);
7758 DBUG_ASSERT(new_data != nullptr);
7759 DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
7760 /*
7761 old_data points to record we're updating. It is the same as the record
7762 we've just read (for multi-table UPDATE, too, because SQL layer will make
7763 an rnd_pos() call to re-read the record before calling update_row())
7764 */
7765 DBUG_ASSERT(new_data == table->record[0]);
7766
7767 ha_statistic_increment(&SSV::ha_update_count);
7768 const int rv = update_write_row(old_data, new_data, false);
7769
7770 if (rv == 0) {
7771 update_row_stats(ROWS_UPDATED);
7772 }
7773
7774 DBUG_RETURN(rv);
7775 }
7776
7777 /* The following function was copied from ha_blackhole::store_lock: */
/*
  Decide MyRocks's internal row-locking mode (m_lock_rows) for the coming
  statement, then tell the SQL layer which table-level lock to use.
  (The table-lock half was copied from ha_blackhole::store_lock.)

  @param thd        current session
  @param to         output array to append our THR_LOCK_DATA pointer to
  @param lock_type  lock requested by the SQL layer

  @return pointer one past the lock we stored in 'to'
*/
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    m_lock_rows = RDB_LOCK_READ;
  } else {
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          lock_type == TL_READ_WITH_SHARED_LOCKS ||
          lock_type == TL_READ_NO_INSERT ||
          (lock_type != TL_IGNORE && sql_command != SQLCOM_SELECT)) {

        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
7857
read_thd_vars(THD * const thd)7858 void ha_rocksdb::read_thd_vars(THD *const thd) {
7859 m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
7860 m_verify_row_debug_checksums = THDVAR(thd, verify_row_debug_checksums);
7861 m_checksums_pct = THDVAR(thd, checksums_pct);
7862 }
7863
thd_rocksdb_tmpdir()7864 const char *ha_rocksdb::thd_rocksdb_tmpdir() {
7865 const char *tmp_dir = THDVAR(ha_thd(), tmpdir);
7866
7867 /*
7868 We want to treat an empty string as nullptr, in these cases DDL operations
7869 will use the default --tmpdir passed to mysql instead.
7870 */
7871 if (tmp_dir != nullptr && *tmp_dir == '\0') {
7872 tmp_dir = nullptr;
7873 }
7874
7875 return (tmp_dir);
7876 }
7877
/*
  Handle lock/unlock notifications from the SQL layer.

  On F_UNLCK: record I/O perf counters and, in autocommit mode, commit the
  single-statement transaction.

  On F_RDLCK/F_WRLCK (statement start): validate binlog format and
  isolation level, refresh cached session variables, reject unsupported
  REPLACE/ON-DUPLICATE with unique checks disabled, and register the
  transaction with the server.

  @return 0 on success, HA_ERR_INTERNAL_ERROR when a setting is rejected
*/
int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);

  int res = 0;
  int binlog_format = my_core::thd_binlog_format(thd);
  bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);
  // Refuse master-side writes with a non-ROW binlog format unless the user
  // opted out via the unsafe_for_binlog session variable.
  if (lock_type == F_WRLCK && !thd->rli_slave && !unsafe_for_binlog &&
      binlog_format != BINLOG_FORMAT_ROW &&
      binlog_format != BINLOG_FORMAT_UNSPEC &&
      my_core::thd_binlog_filter_ok(thd)) {
    my_printf_error(
        ER_UNKNOWN_ERROR,
        "Can't execute updates on master with binlog_format != ROW.", MYF(0));
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  if (lock_type == F_UNLCK) {
    Rdb_transaction *const tx = get_or_create_tx(thd);

    tx->io_perf_end_and_record(&m_io_perf);
    if (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
      /*
        Do like InnoDB: when we get here, it's time to commit a
        single-statement transaction.

        If the statement involved multiple tables, this code will be executed
        for each of them, but that's ok because non-first tx->commit() calls
        will be no-ops.
      */
      res = tx->commit_or_rollback();
    }
  } else {
    if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
        my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
      my_printf_error(
          ER_UNKNOWN_ERROR, "MyRocks supports only READ COMMITTED and "
                            "REPEATABLE READ isolation levels. "
                            "Please change from current isolation "
                            "level %s",
          MYF(0), tx_isolation_names[my_core::thd_tx_isolation(thd)]);
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
    }
    /*
      It's nice to do the following on start of every statement. The problem
      is, handler->start_stmt() is not called for INSERTs.
      So, we put this code here.
    */
    Rdb_transaction *const tx = get_or_create_tx(thd);
    read_thd_vars(thd);

    // Force calc_updated_indexes() to recompute for the new statement.
    m_update_scope_is_valid = false;

    if (skip_unique_check()) {
      if ((thd->lex->sql_command == SQLCOM_INSERT ||
           thd->lex->sql_command == SQLCOM_LOAD ||
           thd->lex->sql_command == SQLCOM_REPLACE) &&
          (thd->lex->duplicates == DUP_REPLACE ||
           thd->lex->duplicates == DUP_UPDATE)) {
        my_printf_error(ER_UNKNOWN_ERROR,
                        "When unique checking is disabled in MyRocks, INSERT,"
                        "UPDATE,LOAD statements with clauses that update or "
                        "replace the key (i.e. "
                        "INSERT ON DUPLICATE KEY UPDATE, REPLACE) are not "
                        "allowed. Query: %s",
                        MYF(0), thd->query());
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
    }

    if (lock_type == F_WRLCK) {
      if (tx->is_tx_read_only()) {
        my_printf_error(ER_UNKNOWN_ERROR,
                        "Can't execute updates when you started a transaction "
                        "with START TRANSACTION WITH CONSISTENT SNAPSHOT.",
                        MYF(0));
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }

      /*
        SQL layer signals us to take a write lock. It does so when starting DML
        statement. We should put locks on the rows we're reading.

        Note: sometimes, external_lock() can be called without a prior
        ::store_lock call. That's why we need to set lock_* members here, too.
      */
      m_lock_rows = RDB_LOCK_WRITE;

      if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
          thd->lex->sql_command == SQLCOM_DROP_INDEX ||
          thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
        tx->m_ddl_transaction = true;
      }
    }
    rocksdb_register_tx(rocksdb_hton, thd, tx);
    tx->io_perf_start(&m_io_perf);
  }

  DBUG_RETURN(res);
}
7979
7980 /**
7981 @note
7982 A quote from ha_innobase::start_stmt():
7983 <quote>
7984 MySQL calls this function at the start of each SQL statement inside LOCK
7985 TABLES. Inside LOCK TABLES the ::external_lock method does not work to
7986 mark SQL statement borders.
7987 </quote>
7988 */
7989
start_stmt(THD * const thd,thr_lock_type lock_type)7990 int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
7991 DBUG_ENTER_FUNC();
7992
7993 DBUG_ASSERT(thd != nullptr);
7994
7995 Rdb_transaction *const tx = get_or_create_tx(thd);
7996 read_thd_vars(thd);
7997 rocksdb_register_tx(ht, thd, tx);
7998 tx->io_perf_start(&m_io_perf);
7999
8000 DBUG_RETURN(HA_EXIT_SUCCESS);
8001 }
8002
get_range(uint32_t i,uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],int offset1,int offset2)8003 rocksdb::Range get_range(uint32_t i,
8004 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
8005 int offset1, int offset2) {
8006 uchar *buf_begin = buf;
8007 uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
8008 rdb_netbuf_store_index(buf_begin, i + offset1);
8009 rdb_netbuf_store_index(buf_end, i + offset2);
8010
8011 return rocksdb::Range(
8012 rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
8013 rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
8014 }
8015
/* Range of index numbers around kd's index, shifted by the given offsets. */
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
8021
get_range(const Rdb_key_def & kd,uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2])8022 rocksdb::Range get_range(const Rdb_key_def &kd,
8023 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
8024 if (kd.m_is_reverse_cf) {
8025 return myrocks::get_range(kd, buf, 1, 0);
8026 } else {
8027 return myrocks::get_range(kd, buf, 0, 1);
8028 }
8029 }
8030
/*
  Member wrapper: build the full key range for this table's i-th index
  (index order as stored in m_key_descr_arr).
*/
rocksdb::Range
ha_rocksdb::get_range(const int &i,
                      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
8036
8037 /*
8038 Drop index thread's main logic
8039 */
8040
/*
  Background thread loop that physically purges indexes that were logically
  dropped (DROP INDEX / DROP TABLE register them in the data dictionary).
  Wakes up periodically, or when signal() is called, and for each pending
  index: deletes whole SST files in its range, compacts the remainder away,
  and verifies no key with the index prefix is left before marking the
  index as finished.
*/
void Rdb_drop_index_thread::run() {
  mysql_mutex_lock(&m_signal_mutex);

  for (;;) {
    // The stop flag might be set by shutdown command
    // after drop_index_thread releases signal_mutex
    // (i.e. while executing expensive Seek()). To prevent drop_index_thread
    // from entering long cond_timedwait, checking if stop flag
    // is true or not is needed, with drop_index_interrupt_mutex held.
    if (m_stop) {
      break;
    }

    // Sleep long when there is nothing pending; poll every minute while
    // drops are in flight.
    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    ts.tv_sec += dict_manager.is_drop_index_empty()
                     ? 24 * 60 * 60 // no filtering
                     : 60;          // filtering

    const auto ret __attribute__((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Release the signal mutex while doing the expensive purge work below;
    // it is re-acquired at the bottom of the loop.
    mysql_mutex_unlock(&m_signal_mutex);

    std::unordered_set<GL_INDEX_ID> indices;
    dict_manager.get_ongoing_drop_indexes(&indices);
    if (!indices.empty()) {
      std::unordered_set<GL_INDEX_ID> finished;
      rocksdb::ReadOptions read_opts;
      read_opts.total_order_seek = true; // disable bloom filter

      for (const auto d : indices) {
        uint32 cf_flags = 0;
        // Missing CF flags means the dictionary is inconsistent; this is
        // unrecoverable, so crash with diagnostics.
        if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
          sql_print_error("RocksDB: Failed to get column family flags "
                          "from cf id %u. MyRocks data dictionary may "
                          "get corrupted.",
                          d.cf_id);
          abort_with_stack_traces();
        }
        rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id);
        DBUG_ASSERT(cfh);
        const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;

        bool index_removed = false;
        uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
        rdb_netbuf_store_uint32(key_buf, d.index_id);
        const rocksdb::Slice key =
            rocksdb::Slice((char *)key_buf, sizeof(key_buf));
        uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
        rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
                                         is_reverse_cf ? 0 : 1);
        rocksdb::CompactRangeOptions compact_range_options;
        compact_range_options.bottommost_level_compaction =
            rocksdb::BottommostLevelCompaction::kForce;
        compact_range_options.exclusive_manual_compaction = false;
        // First drop whole SST files fully contained in the range (cheap),
        // then compact to reclaim the rest.
        rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
                                                    &range.start, &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        status = rdb->CompactRange(compact_range_options, cfh, &range.start,
                                   &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        // Verify the purge: seek to the index prefix and check whether any
        // key with that prefix survives (seek direction depends on CF order).
        std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
        it->Seek(key);
        if (is_reverse_cf) {
          if (!it->Valid()) {
            it->SeekToLast();
          } else {
            it->Prev();
          }
        }
        if (!it->Valid()) {
          index_removed = true;
        } else {
          if (memcmp(it->key().data(), key_buf,
                     Rdb_key_def::INDEX_NUMBER_SIZE)) {
            // Key does not have same prefix
            index_removed = true;
          }
        }
        if (index_removed) {
          finished.insert(d);
        }
      }

      if (!finished.empty()) {
        dict_manager.finish_drop_indexes(finished);
      }
    }
    mysql_mutex_lock(&m_signal_mutex);
  }

  mysql_mutex_unlock(&m_signal_mutex);
}
8149
get_table_if_exists(const char * const tablename)8150 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
8151 DBUG_ASSERT(tablename != nullptr);
8152
8153 std::string str;
8154 if (rdb_normalize_tablename(tablename, &str)) {
8155 // We were not passed table name?
8156 DBUG_ASSERT(0);
8157 return nullptr;
8158 }
8159
8160 return ddl_manager.find(str);
8161 }
8162
8163 /*
8164 Note: the following function is called when the table is not open. That is,
8165 this->table==nullptr, pk_key_descr==nullptr, etc.
8166
8167 tablename points to line in form "./dbname/tablename".
8168 */
8169
delete_table(const char * const tablename)8170 int ha_rocksdb::delete_table(const char *const tablename) {
8171 DBUG_ENTER_FUNC();
8172
8173 DBUG_ASSERT(tablename != nullptr);
8174
8175 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
8176 rocksdb::WriteBatch *const batch = wb.get();
8177
8178 /* Find the table in the hash */
8179 Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
8180 if (!tbl)
8181 DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
8182 dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);
8183
8184 /*
8185 Remove the table entry in data dictionary (this will also remove it from
8186 the persistent data dictionary).
8187 */
8188 ddl_manager.remove(tbl, batch, true);
8189 if (dict_manager.commit(batch))
8190 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8191
8192 rdb_drop_idx_thread.signal();
8193
8194 DBUG_RETURN(HA_EXIT_SUCCESS);
8195 }
8196
remove_rows(Rdb_tbl_def * const tbl)8197 void ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
8198 const rocksdb::WriteOptions wo =
8199 rdb_get_rocksdb_write_options(handler::ha_thd());
8200
8201 rocksdb::ReadOptions opts;
8202 opts.total_order_seek = true;
8203 rocksdb::Iterator *const it = rdb->NewIterator(opts);
8204
8205 char key_buf[MAX_KEY_LENGTH];
8206 uint key_len;
8207 /*
8208 Remove all records in each index.
8209 (This is is not crash-safe, but it doesn't matter, because bulk row
8210 deletion will be handled on rocksdb side)
8211 */
8212 for (uint i = 0; i < tbl->m_key_count; i++) {
8213 const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
8214 kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
8215
8216 const rocksdb::Slice table_key(key_buf, key_len);
8217 it->Seek(table_key);
8218 while (it->Valid()) {
8219 const rocksdb::Slice key = it->key();
8220 if (!kd.covers_key(key)) {
8221 break;
8222 }
8223
8224 if (can_use_single_delete(i))
8225 rdb->SingleDelete(wo, key);
8226 else
8227 rdb->Delete(wo, key);
8228 it->Next();
8229 }
8230 }
8231 delete it;
8232 }
8233
/*
  Rename a table in the MyRocks data dictionary.

  @param from  source path in "./dbname/tablename" form
  @param to    destination path in the same form
  @return 0 on success, HA_ERR_INTERNAL_ERROR on any failure
*/
int ha_rocksdb::rename_table(const char *const from, const char *const to) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(from != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool write_err = false;
  std::string from_str;
  std::string to_str;
  std::string from_db;
  std::string to_db;

  // Normalize both names and extract their database components.
  if (rdb_normalize_tablename(from, &from_str) ||
      rdb_split_normalized_tablename(from_str, &from_db)) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  if (rdb_normalize_tablename(to, &to_str) ||
      rdb_split_normalized_tablename(to_str, &to_db)) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  // If the user changed the database part of the name then validate that the
  // 'to' database exists.
  if (from_db != to_db && !rdb_database_exists(to_db)) {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  /* Check if any index has a per-index column family */
  {
    /*
      We can't check this directly, because
      1. the object that referred to the old table has been close()d.
      2. this ha_rocksdb object has no connection to any table at all, it has
         been just created with (TABLE*)nullptr.

      So, we have to find the old table in the ddl_manager (it's there because
      it has been opened by this statement), and check is_auto_cf there.
    */
    Rdb_tbl_def *tdef;
    if (!(tdef = ddl_manager.find(from_str)))
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);

    for (uint i = 0; i < tdef->m_key_count; i++) {
      DBUG_ASSERT(tdef->m_key_descr_arr != nullptr);

      // Renaming would invalidate the auto-generated per-index CF name,
      // so this case is rejected outright.
      if (tdef->m_key_descr_arr[i]->m_is_auto_cf) {
        my_error(ER_NOT_SUPPORTED_YET, MYF(0),
                 "ALTER TABLE on table with per-index CF");
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
    }
  }

  // Perform the rename and commit it atomically in one write batch, with
  // the dictionary locked for the whole rename+commit sequence.
  const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
  rocksdb::WriteBatch *const batch = wb.get();
  dict_manager.lock();
  write_err =
      ddl_manager.rename(from_str, to_str, batch) || dict_manager.commit(batch);
  dict_manager.unlock();

  DBUG_RETURN(write_err ? HA_ERR_INTERNAL_ERROR : 0);
}
8297
8298 /**
8299 check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
8300 if new and old definition are compatible
8301
8302 @details If there are no other explicit signs like changed number of
8303 fields this function will be called by compare_tables()
8304 (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm
8305 file.
8306
8307 */
8308
check_if_incompatible_data(HA_CREATE_INFO * const info,uint table_changes)8309 bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
8310 uint table_changes) {
8311 DBUG_ENTER_FUNC();
8312
8313 DBUG_ASSERT(info != nullptr);
8314
8315 // this function is needed only for online alter-table
8316 DBUG_RETURN(COMPATIBLE_DATA_NO);
8317 }
8318
extra(enum ha_extra_function operation)8319 int ha_rocksdb::extra(enum ha_extra_function operation) {
8320 DBUG_ENTER_FUNC();
8321
8322 switch (operation) {
8323 case HA_EXTRA_KEYREAD:
8324 m_keyread_only = true;
8325 break;
8326 case HA_EXTRA_NO_KEYREAD:
8327 m_keyread_only = false;
8328 break;
8329 case HA_EXTRA_FLUSH:
8330 /*
8331 If the table has blobs, then they are part of m_retrieved_record.
8332 This call invalidates them.
8333 */
8334 m_retrieved_record.clear();
8335 break;
8336 default:
8337 break;
8338 }
8339
8340 DBUG_RETURN(HA_EXIT_SUCCESS);
8341 }
8342
8343 /*
8344 Given a starting key and an ending key, estimate the number of rows that
8345 will exist between the two keys.
8346 */
ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
                                     key_range *const max_key) {
  DBUG_ENTER_FUNC();

  // Session variables can override the estimate entirely (for testing and
  // plan stability).
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  if (table->force_index) {
    const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  // Pack the lower bound into m_sk_packed_tuple (or use the index infimum
  // when no lower bound was given).
  uint size1 = 0;
  if (min_key) {
    size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                min_key->key, min_key->keypart_map);
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY) {
      // Exclusive lower bound: advance to the successor key.
      kd.successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd.get_infimum_key(m_sk_packed_tuple, &size1);
  }

  // Pack the upper bound into m_sk_packed_tuple_old (or use the supremum).
  uint size2 = 0;
  if (max_key) {
    size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple_old, size2);
    }
    // pad the upper key with FFFFs to make sure it is more than the lower
    // (assumes m_sk_packed_tuple_old has room up to size1 — it is sized for
    // the longest packed key of this index)
    if (size1 > size2) {
      memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
      size2 = size1;
    }
  } else {
    kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
  const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);

  // slice1 >= slice2 means no row will match; report 0 rows
  if (slice1.compare(slice2) >= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  // In a reverse column family the stored order is inverted, so swap the
  // bounds when building the physical range.
  rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
                   kd.m_is_reverse_cf ? slice1 : slice2);

  uint64_t sz = 0;
  auto disk_size = kd.m_stats.m_actual_disk_size;
  if (disk_size == 0)
    disk_size = kd.m_stats.m_data_size;
  auto rows = kd.m_stats.m_rows;
  if (rows == 0 || disk_size == 0) {
    // No statistics yet: fall back to an assumed average row size.
    rows = 1;
    disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
  // Getting statistics, including from Memtables
  rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, true);
#pragma GCC diagnostic pop

  // Scale the stored row count by the fraction of the index the range spans.
  ret = rows * sz / disk_size;

  /*
    GetApproximateSizes() gives estimates so ret might exceed stats.records.
    MySQL then decides to use full index scan rather than range scan, which
    is not efficient for most cases.
    To prevent this, changing estimated records slightly smaller than
    stats.records.
  */
  if (ret >= stats.records) {
    ret = stats.records * 0.99;
  }

  if (rocksdb_debug_optimizer_n_rows > 0) {
    ret = rocksdb_debug_optimizer_n_rows;
  } else if (ret == 0) {
    // Never report 0 for a non-empty range: the optimizer treats the
    // estimate as exact.
    ret = 1;
  }

  DBUG_RETURN(ret);
}
8443
update_create_info(HA_CREATE_INFO * const create_info)8444 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
8445 DBUG_ENTER_FUNC();
8446
8447 DBUG_ASSERT(create_info != nullptr);
8448
8449 if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
8450 create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
8451 }
8452
8453 DBUG_VOID_RETURN;
8454 }
8455
8456 /**
8457 @brief
8458 Doing manual compaction on OPTIMIZE TABLE in RocksDB.
8459 Compaction itself is executed by background thread in RocksDB, but
8460 CompactRange() waits until compaction completes so this function
8461 may take a long time.
8462 Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE
8463 triggers manual compaction for all indexes of the table.
8464 @details
8465 Compaction range is from the beginning of the index id to
8466 the first row of the next index id. When using reverse order
8467 column family, the first row of the next index id should be
8468 the last row of the previous index id.
8469 */
optimize(THD * const thd,HA_CHECK_OPT * const check_opt)8470 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
8471 DBUG_ENTER_FUNC();
8472
8473 DBUG_ASSERT(thd != nullptr);
8474 DBUG_ASSERT(check_opt != nullptr);
8475
8476 int rc = 0;
8477 for (uint i = 0; i < table->s->keys; i++) {
8478 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
8479 auto range = get_range(i, buf);
8480 if (!rdb->CompactRange(rocksdb::CompactRangeOptions(),
8481 m_key_descr_arr[i]->get_cf(), &range.start,
8482 &range.limit)
8483 .ok()) {
8484 rc = 1;
8485 break;
8486 }
8487 }
8488
8489 DBUG_RETURN(rc);
8490 }
8491
calculate_stats(const TABLE * const table_arg,THD * const thd,HA_CHECK_OPT * const check_opt)8492 int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd,
8493 HA_CHECK_OPT *const check_opt) {
8494 DBUG_ENTER_FUNC();
8495
8496 // find per column family key ranges which need to be queried
8497 std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
8498 ranges;
8499 std::unordered_set<GL_INDEX_ID> ids_to_check;
8500 std::vector<uchar> buf(table_arg->s->keys * 2 *
8501 Rdb_key_def::INDEX_NUMBER_SIZE);
8502 for (uint i = 0; i < table_arg->s->keys; i++) {
8503 const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE];
8504 const Rdb_key_def &kd = *m_key_descr_arr[i];
8505 ranges[kd.get_cf()].push_back(get_range(i, bufp));
8506 ids_to_check.insert(kd.get_gl_index_id());
8507 }
8508
8509 // for analyze statements, force flush on memtable to get accurate cardinality
8510 Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
8511 if (thd != nullptr && THDVAR(thd, flush_memtable_on_analyze) &&
8512 !rocksdb_pause_background_work) {
8513 for (auto it : ids_to_check) {
8514 rdb->Flush(rocksdb::FlushOptions(), cf_manager.get_cf(it.cf_id));
8515 }
8516 }
8517
8518 // get RocksDB table properties for these ranges
8519 rocksdb::TablePropertiesCollection props;
8520 for (auto it : ranges) {
8521 const auto old_size __attribute__((__unused__)) = props.size();
8522 const auto status = rdb->GetPropertiesOfTablesInRange(
8523 it.first, &it.second[0], it.second.size(), &props);
8524 DBUG_ASSERT(props.size() >= old_size);
8525 if (!status.ok())
8526 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
8527 }
8528
8529 int num_sst = 0;
8530 // group stats per index id
8531 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
8532 for (const auto &it : ids_to_check) {
8533 // Initialize the stats to 0. If there are no files that contain
8534 // this gl_index_id, then 0 should be stored for the cached stats.
8535 stats[it] = Rdb_index_stats(it);
8536 }
8537 for (const auto &it : props) {
8538 std::vector<Rdb_index_stats> sst_stats;
8539 Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
8540 /*
8541 sst_stats is a list of index statistics for indexes that have entries
8542 in the current SST file.
8543 */
8544 for (const auto &it1 : sst_stats) {
8545 /*
8546 Only update statistics for indexes that belong to this SQL table.
8547
8548 The reason is: We are walking through all SST files that have
8549 entries from this table (and so can compute good statistics). For
8550 other SQL tables, it can be that we're only seeing a small fraction
8551 of table's entries (and so we can't update statistics based on that).
8552 */
8553 if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end())
8554 continue;
8555
8556 auto kd = ddl_manager.safe_find(it1.m_gl_index_id);
8557 DBUG_ASSERT(kd != nullptr);
8558 stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length());
8559 }
8560 num_sst++;
8561 }
8562
8563 // set and persist new stats
8564 ddl_manager.set_stats(stats);
8565 ddl_manager.persist_stats(true);
8566
8567 DBUG_RETURN(HA_EXIT_SUCCESS);
8568 }
8569
analyze(THD * const thd,HA_CHECK_OPT * const check_opt)8570 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
8571 DBUG_ENTER_FUNC();
8572
8573 if (!table) {
8574 DBUG_RETURN(HA_EXIT_FAILURE);
8575 }
8576
8577 DBUG_RETURN(calculate_stats(table, thd, check_opt));
8578 }
8579
/*
  Reserve the next auto-increment value for an INSERT.

  @param off                 auto_increment_offset
  @param inc                 auto_increment_increment
  @param nb_desired_values   how many values MySQL estimates it needs
  @param first_value   [out] the value reserved for this row
  @param nb_reserved_values [out] always 1 (no batching, see below)
*/
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.
   *
    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
   */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");

  // An offset larger than the increment makes no sense; normalize it.
  if (off > inc) {
    off = 1;
  }

  // Local variable reference to simplify code below
  std::atomic<longlong> &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1 && off == 1) {
    // Optimization for the standard case where we are always simply
    // incrementing from the last position
    *first_value = auto_incr++;
  } else {
    // The next value can be more complicated if either `inc` or 'off' is not 1
    longlong new_val;
    longlong last_val = auto_incr;

    // Loop until we can correctly update the atomic value
    do {
      // Calculate the next value in the auto increment series:
      //   offset + N * increment
      // where N is 0, 1, 2, ...
      //
      // For further information please visit:
      // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
      new_val = ((last_val + (inc - off) - 1) / inc) * inc + off;

      // Attempt to store the new value (plus 1 since m_auto_incr_val contains
      // the next available value) into the atomic value.  If the current
      // value no longer matches what we have in 'last_val' this will fail and
      // we will repeat the loop (`last_val` will automatically get updated
      // with the current value).
    } while (!auto_incr.compare_exchange_weak(last_val, new_val + 1));

    *first_value = new_val;
  }

  *nb_reserved_values = 1;
}
8639
8640 #ifndef DBUG_OFF
8641
8642 /* Debugger help function */
8643 static char dbug_item_print_buf[512];
8644
dbug_print_item(Item * const item)8645 const char *dbug_print_item(Item *const item) {
8646 char *const buf = dbug_item_print_buf;
8647 String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
8648 str.length(0);
8649 if (!item)
8650 return "(Item*)nullptr";
8651 item->print(&str, QT_ORDINARY);
8652 if (str.c_ptr() == buf)
8653 return buf;
8654 else
8655 return "Couldn't fit into buffer";
8656 }
8657
8658 #endif /*DBUG_OFF*/
8659
8660 /**
8661 SQL layer calls this function to push an index condition.
8662
8663 @details
8664 The condition is for index keyno (only one condition can be pushed at a
8665 time).
8666 The caller guarantees that condition refers only to index fields; besides
8667 that, fields must have
8668
8669 $field->part_of_key.set_bit(keyno)
8670
8671 which means that
8672
8673 (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
8674
8675 which means that field value can be restored from the index tuple.
8676
8677 @return
8678 Part of condition we couldn't check (always nullptr).
8679 */
8680
idx_cond_push(uint keyno,class Item * const idx_cond)8681 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
8682 DBUG_ENTER_FUNC();
8683
8684 DBUG_ASSERT(keyno != MAX_KEY);
8685 DBUG_ASSERT(idx_cond != nullptr);
8686
8687 pushed_idx_cond = idx_cond;
8688 pushed_idx_cond_keyno = keyno;
8689 in_range_check_pushed_down = TRUE;
8690
8691 /* We will check the whole condition */
8692 DBUG_RETURN(nullptr);
8693 }
8694
8695 /*
8696 @brief
8697 Check the index condition.
8698
8699 @detail
8700 Check the index condition. (The caller has unpacked all needed index
8701 columns into table->record[0])
8702
8703 @return
8704 ICP_NO_MATCH - Condition not satisfied (caller should continue
8705 scanning)
8706 OUT_OF_RANGE - We've left the range we're scanning (caller should
8707 stop scanning and return HA_ERR_END_OF_FILE)
8708
8709 ICP_MATCH - Condition is satisfied (caller should fetch the record
8710 and return it)
8711 */
8712
check_index_cond() const8713 enum icp_result ha_rocksdb::check_index_cond() const {
8714 DBUG_ASSERT(pushed_idx_cond);
8715 DBUG_ASSERT(pushed_idx_cond_keyno != MAX_KEY);
8716
8717 if (end_range && compare_key_icp(end_range) > 0) {
8718 /* caller should return HA_ERR_END_OF_FILE already */
8719 return ICP_OUT_OF_RANGE;
8720 }
8721
8722 return pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
8723 }
8724
8725 /**
8726 Checking if an index is used for ascending scan or not
8727
8728 @detail
8729 Currently RocksDB does not support bloom filter for
8730 prefix lookup + descending scan, but supports bloom filter for
8731 prefix lookup + ascending scan. This function returns true if
8732 the scan pattern is absolutely ascending.
8733 @param kd
8734 @param find_flag
8735 */
is_ascending(const Rdb_key_def & kd,enum ha_rkey_function find_flag) const8736 bool ha_rocksdb::is_ascending(const Rdb_key_def &kd,
8737 enum ha_rkey_function find_flag) const {
8738 bool is_ascending;
8739 switch (find_flag) {
8740 case HA_READ_KEY_EXACT: {
8741 is_ascending = !kd.m_is_reverse_cf;
8742 break;
8743 }
8744 case HA_READ_PREFIX: {
8745 is_ascending = true;
8746 break;
8747 }
8748 case HA_READ_KEY_OR_NEXT:
8749 case HA_READ_AFTER_KEY: {
8750 is_ascending = !kd.m_is_reverse_cf;
8751 break;
8752 }
8753 case HA_READ_KEY_OR_PREV:
8754 case HA_READ_BEFORE_KEY:
8755 case HA_READ_PREFIX_LAST:
8756 case HA_READ_PREFIX_LAST_OR_PREV: {
8757 is_ascending = kd.m_is_reverse_cf;
8758 break;
8759 }
8760 default:
8761 is_ascending = false;
8762 }
8763 return is_ascending;
8764 }
8765
8766 /*
8767 Checks if inplace alter is supported for a given operation.
8768 */
8769
check_if_supported_inplace_alter(TABLE * altered_table,my_core::Alter_inplace_info * const ha_alter_info)8770 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
8771 TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
8772 DBUG_ENTER_FUNC();
8773
8774 DBUG_ASSERT(ha_alter_info != nullptr);
8775
8776 if (ha_alter_info->handler_flags &
8777 ~(my_core::Alter_inplace_info::DROP_INDEX |
8778 my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
8779 my_core::Alter_inplace_info::ADD_INDEX |
8780 my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
8781 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
8782 }
8783
8784 /* We don't support unique keys on table w/ no primary keys */
8785 if ((ha_alter_info->handler_flags &
8786 my_core::Alter_inplace_info::ADD_UNIQUE_INDEX) &&
8787 has_hidden_pk(altered_table)) {
8788 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
8789 }
8790
8791 DBUG_RETURN(my_core::HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
8792 }
8793
8794 /**
8795 Allows the storage engine to update internal structures with concurrent
8796 writes blocked. If check_if_supported_inplace_alter() returns
8797 HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE or
8798 HA_ALTER_INPLACE_SHARED_AFTER_PREPARE, this function is called with
8799 exclusive lock otherwise the same level of locking as for
8800 inplace_alter_table() will be used.
8801
8802 @note Storage engines are responsible for reporting any errors by
8803 calling my_error()/print_error()
8804
8805 @note If this function reports error, commit_inplace_alter_table()
8806 will be called with commit= false.
8807
8808 @note For partitioning, failing to prepare one partition, means that
8809 commit_inplace_alter_table() will be called to roll back changes for
8810 all partitions. This means that commit_inplace_alter_table() might be
8811 called without prepare_inplace_alter_table() having been called first
8812 for a given partition.
8813
8814 @param altered_table TABLE object for new version of table.
8815 @param ha_alter_info Structure describing changes to be done
8816 by ALTER TABLE and holding data used
8817 during in-place alter.
8818
8819 @retval true Error
8820 @retval false Success
8821 */
bool ha_rocksdb::prepare_inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  const uint n_dropped_keys = ha_alter_info->index_drop_count;
  const uint n_added_keys = ha_alter_info->index_add_count;
  const uint old_n_keys = m_tbl_def->m_key_count;
  uint new_n_keys = altered_table->s->keys;

  // A table without an explicit PK gets an extra hidden-PK key definition.
  if (has_hidden_pk(altered_table)) {
    new_n_keys += 1;
  }

  // Build a new table definition carrying over the counters from the old
  // one; ownership of new_key_descr/new_tdef is handed to the alter
  // context at the bottom of this function.
  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      m_tbl_def->m_key_descr_arr;
  std::shared_ptr<Rdb_key_def> *const new_key_descr =
      new std::shared_ptr<Rdb_key_def>[new_n_keys];

  Rdb_tbl_def *const new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
  new_tdef->m_key_descr_arr = new_key_descr;
  new_tdef->m_key_count = new_n_keys;
  new_tdef->m_auto_incr_val =
      m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
  new_tdef->m_hidden_pk_val =
      m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);

  // Create the key definitions for the altered table; on failure undo the
  // allocations above before reporting the error.
  if (ha_alter_info->handler_flags &
          (my_core::Alter_inplace_info::DROP_INDEX |
           my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
           my_core::Alter_inplace_info::ADD_INDEX |
           my_core::Alter_inplace_info::ADD_UNIQUE_INDEX) &&
      create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
    /* Delete the new key descriptors */
    delete[] new_key_descr;

    /*
      Explicitly mark as nullptr so we don't accidentally remove entries
      from data dictionary on cleanup (or cause double delete[]).
    */
    new_tdef->m_key_descr_arr = nullptr;
    delete new_tdef;

    my_printf_error(ER_UNKNOWN_ERROR,
                    "MyRocks failed creating new key definitions during alter.",
                    MYF(0));
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
  std::unordered_set<GL_INDEX_ID> dropped_index_ids;

  uint i;
  uint j;
  const KEY *key;

  /* Determine which(if any) key definition(s) need to be dropped */
  for (i = 0; i < ha_alter_info->index_drop_count; i++) {
    const KEY *const key = ha_alter_info->index_drop_buffer[i];
    // Match dropped keys to the old definitions by name.
    for (j = 0; j < old_n_keys; j++) {
      if (!old_key_descr[j]->m_name.compare(key->name)) {
        dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
        break;
      }
    }
  }

  DBUG_ASSERT(dropped_index_ids.size() == ha_alter_info->index_drop_count);

  /* Determine which(if any) key definitions(s) need to be added */
  for (i = 0; i < ha_alter_info->index_add_count; i++) {
    key = &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
    // Match added keys to the new definitions by name.
    for (j = 0; j < new_n_keys; j++) {
      if (!new_key_descr[j]->m_name.compare(key->name)) {
        added_indexes.insert(new_key_descr[j]);
        break;
      }
    }
  }

  DBUG_ASSERT(added_indexes.size() == ha_alter_info->index_add_count);

  // The context owns all allocated state for the rest of the alter and is
  // later consumed by inplace_alter_table()/commit_inplace_alter_table().
  ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
      new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
      added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
8913
8914 /**
8915 Alter the table structure in-place with operations specified using
8916 HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
8917 during this operation depends on the return value from
8918 check_if_supported_inplace_alter().
8919
8920 @note Storage engines are responsible for reporting any errors by
8921 calling my_error()/print_error()
8922
8923 @note If this function reports error, commit_inplace_alter_table()
8924 will be called with commit= false.
8925
8926 @param altered_table TABLE object for new version of table.
8927 @param ha_alter_info Structure describing changes to be done
8928 by ALTER TABLE and holding data used
8929 during in-place alter.
8930
8931 @retval true Error
8932 @retval false Success
8933 */
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);
  DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);

  // Context created by prepare_inplace_alter_table(); carries the new
  // table definition and the set of indexes being added.
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  if (ha_alter_info->handler_flags &
      (my_core::Alter_inplace_info::ADD_INDEX |
       my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    DBUG_ASSERT(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 my_core::Alter_inplace_info::ADD_UNIQUE_INDEX))) {
      my_printf_error(ER_UNKNOWN_ERROR,
                      "MyRocks failed allocating key buffers during alter.",
                      MYF(0));
      // NOTE(review): this returns the int error code from a bool function
      // (any nonzero err collapses to true); the branch below returns
      // HA_EXIT_FAILURE instead — inconsistent, though both signal failure.
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_printf_error(ER_UNKNOWN_ERROR,
                      "MyRocks failed populating secondary key during alter.",
                      MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  // Test hook: simulate a failure after the work above to exercise the
  // rollback path in commit_inplace_alter_table().
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
8987
8988 /**
8989 Scan the Primary Key index entries and populate the new secondary keys.
8990 */
int ha_rocksdb::inplace_populate_sk(
    TABLE *const new_table_arg,
    const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
  DBUG_ENTER_FUNC();

  const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
  rocksdb::WriteBatch *const batch = wb.get();

  /*
    Update the data dictionary: record the new indexes as create-ongoing so
    that a crash mid-build can be detected on restart and the partial
    indexes dropped (see the rollback handling in commit_inplace_alter_table).
  */
  std::unordered_set<GL_INDEX_ID> create_index_ids;
  for (const auto &index : indexes) {
    create_index_ids.insert(index->get_gl_index_id());
  }
  dict_manager.add_create_index(create_index_ids, batch);
  dict_manager.commit(batch);

  /* Table may use an internally generated (hidden) primary key. */
  const bool hidden_pk_exists = has_hidden_pk(table);

  int res = 0;
  Rdb_transaction *tx = get_or_create_tx(table->in_use);

  /*
    There is one specific scenario where m_sst_info may not be nullptr. This
    happens if the handler we're using happens to be the handler where the PK
    bulk load was done on. The sequence of events that lead to this is as
    follows (T1 is PK bulk load, T2 is SK alter table):

    T1: Execute last INSERT statement
    T1: Return TABLE and handler object back to Table_cache_manager
    T1: Close connection
    T2: Execute ALTER statement
    T2: Take same TABLE/handler from Table_cache_manager
    T2: Call closefrm which will call finalize_bulk_load on every other open
        table/handler *except* the one it's on.
    T2: Acquire stale snapshot of PK
    T1: Call finalize_bulk_load

    This is rare because usually, closefrm will call the destructor (and thus
    finalize_bulk_load) on the handler where PK bulk load is done. However, if
    the thread ids of the bulk load thread and the alter thread differ by a
    multiple of table_cache_instances (8 by default), then they hash to the
    same bucket in Table_cache_manager and the alter thread will not call
    the destructor on the handler it is holding. Thus, its m_sst_info will not
    be nullptr.

    At this point, it is safe to refresh the snapshot because we know all other
    open handlers have been closed at this point, and the one we're on is the
    only one left.
  */
  if (m_sst_info != nullptr) {
    if ((res = finalize_bulk_load())) {
      DBUG_RETURN(res);
    }
    tx->commit();
  }

  /* Session-tunable buffer sizes for the external merge sort. */
  const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
  const ulonglong rdb_merge_combine_read_size =
      THDVAR(ha_thd(), merge_combine_read_size);

  /* Build each new secondary index independently. */
  for (const auto &index : indexes) {
    const rocksdb::Comparator *index_comp = index->get_cf()->GetComparator();
    /* Unique indexes additionally need the duplicate check further below. */
    bool is_unique_index =
        new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;

    Rdb_index_merge rdb_merge(thd_rocksdb_tmpdir(), rdb_merge_buf_size,
                              rdb_merge_combine_read_size, index_comp);

    if ((res = rdb_merge.init())) {
      DBUG_RETURN(res);
    }

    /*
      Note: We pass in the currently existing table + tbl_def object here,
      as the pk index position may have changed in the case of hidden primary
      keys.
    */
    const uint pk = pk_index(table, m_tbl_def);
    ha_index_init(pk, true);

    /* Scan each record in the primary key in order */
    for (res = index_first(table->record[0]); res == 0;
         res = index_next(table->record[0])) {
      longlong hidden_pk_id = 0;
      if (hidden_pk_exists && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
        // NO_LINT_DEBUG
        sql_print_error("Error retrieving hidden pk id.");
        ha_index_end();
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }

      /* Create new secondary index entry */
      const int new_packed_size = index->pack_record(
          new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
          &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id);

      const rocksdb::Slice key = rocksdb::Slice(
          reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
      const rocksdb::Slice val =
          rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
                         m_sk_tails.get_current_pos());

      /*
        Add record to offset tree in preparation for writing out to
        disk in sorted chunks.
      */
      if ((res = rdb_merge.add(key, val))) {
        ha_index_end();
        DBUG_RETURN(res);
      }
    }

    /* HA_ERR_END_OF_FILE is the normal terminator of the scan above. */
    if (res != HA_ERR_END_OF_FILE) {
      // NO_LINT_DEBUG
      sql_print_error("Error retrieving index entry from primary key.");
      ha_index_end();
      DBUG_RETURN(res);
    }

    ha_index_end();

    /*
      Perform an n-way merge of n sorted buffers on disk, then writes all
      results to RocksDB via SSTFileWriter API.
    */
    rocksdb::Slice merge_key;
    rocksdb::Slice merge_val;

    /* Scratch buffers used by check_duplicate_sk() for adjacent-key compare. */
    struct unique_sk_buf_info sk_info;
    sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
    sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;

    while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
      /* Perform uniqueness check if needed */
      if (is_unique_index) {
        if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
          /*
            Duplicate entry found when trying to create unique secondary key.
            We need to unpack the record into new_table_arg->record[0] as it
            is used inside print_keydup_error so that the error message shows
            the duplicate record.
          */
          if (index->unpack_record(new_table_arg, new_table_arg->record[0],
                                   &merge_key, nullptr,
                                   m_verify_row_debug_checksums)) {
            /* Should never reach here */
            DBUG_ASSERT(0);
          }

          print_keydup_error(new_table_arg,
                             &new_table_arg->key_info[index->get_keyno()],
                             MYF(0));
          DBUG_RETURN(ER_DUP_ENTRY);
        }
      }

      /*
        Insert key and slice to SST via SSTFileWriter API.
      */
      if ((res = bulk_load_key(tx, *index, merge_key, merge_val))) {
        break;
      }
    }

    /*
      Here, res == -1 means that we are finished, while > 0 means an error
      occurred.
    */
    if (res > 0) {
      // NO_LINT_DEBUG
      sql_print_error("Error while bulk loading keys in external merge sort.");
      DBUG_RETURN(res);
    }

    /* Flush/ingest any SST files still open for this index. */
    if ((res = tx->finish_bulk_load())) {
      // NO_LINT_DEBUG
      sql_print_error("Error finishing bulk load.");
      DBUG_RETURN(res);
    }
  }

  /*
    Explicitly tell jemalloc to clean up any unused dirty pages at this point.
    See https://reviews.facebook.net/D63723 for more details.
  */
  purge_all_jemalloc_arenas();

  /* Debug hook: crash here to test recovery of half-built indexes. */
  DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
  DBUG_RETURN(res);
}
9181
9182 /**
9183 Commit or rollback the changes made during prepare_inplace_alter_table()
9184 and inplace_alter_table() inside the storage engine.
9185 Note that in case of rollback the allowed level of concurrency during
9186 this operation will be the same as for inplace_alter_table() and thus
9187 might be higher than during prepare_inplace_alter_table(). (For example,
9188 concurrent writes were blocked during prepare, but might not be during
9189 rollback).
9190
9191 @note Storage engines are responsible for reporting any errors by
9192 calling my_error()/print_error()
9193
9194 @note If this function with commit= true reports error, it will be called
9195 again with commit= false.
9196
9197 @note In case of partitioning, this function might be called for rollback
9198 without prepare_inplace_alter_table() having been called first.
Also, partitioned tables set ha_alter_info->group_commit_ctx to a NULL
9200 terminated array of the partitions handlers and if all of them are
9201 committed as one, then group_commit_ctx should be set to NULL to indicate
9202 to the partitioning handler that all partitions handlers are committed.
9203 @see prepare_inplace_alter_table().
9204
9205 @param altered_table TABLE object for new version of table.
9206 @param ha_alter_info Structure describing changes to be done
9207 by ALTER TABLE and holding data used
9208 during in-place alter.
9209 @param commit True => Commit, False => Rollback.
9210
9211 @retval true Error
9212 @retval false Success
9213 */
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  Rdb_inplace_alter_ctx *const ctx0 =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition. A successful commit call only executes once
    for all partitions.
  */
  if (!commit) {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0) {
      DBUG_RETURN(HA_EXIT_SUCCESS);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr) {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
        ctx0->m_new_key_descr[i] = nullptr;
      }

      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      /* Detach the array so m_new_tdef's teardown won't touch freed memory. */
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    /* Rollback any partially created indexes */
    dict_manager.rollback_ongoing_index_creation();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx **ctx_array;
  inplace_alter_handler_ctx *ctx_single[2];

  if (ha_alter_info->group_commit_ctx) {
    DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
    ctx_array = ha_alter_info->group_commit_ctx;
  } else {
    /* Non-partitioned table: build a one-element, null-terminated array. */
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  /* Signal to the partitioning handler that all partitions are committed. */
  ha_alter_info->group_commit_ctx = nullptr;

  if (ha_alter_info->handler_flags &
      (my_core::Alter_inplace_info::DROP_INDEX |
       my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
       my_core::Alter_inplace_info::ADD_INDEX |
       my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    /* Switch this handler over to the new table definition. */
    m_tbl_def = ctx0->m_new_tdef;
    m_key_descr_arr = m_tbl_def->m_key_descr_arr;
    m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    dict_manager.lock();
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (const auto &index : ctx->m_added_indexes) {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }
    }

    if (dict_manager.commit(batch)) {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(
        create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

    /*
      We need to recalculate the index stats here manually. The reason is that
      the secondary index does not exist inside
      m_index_num_to_keydef until it is committed to the data dictionary, which
      prevents us from updating the stats normally as the ddl_manager cannot
      find the proper gl_index_ids yet during adjust_stats calls.
    */
    if (calculate_stats(altered_table, nullptr, nullptr)) {
      /* Failed to update index statistics, should never happen */
      DBUG_ASSERT(0);
    }

    /* Wake the background thread that physically deletes dropped indexes. */
    rdb_drop_idx_thread.signal();
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
9350
/* Name of the generated SHOW-status callback for a given RocksDB ticker. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW-status callback that copies the given RocksDB ticker value
  into rocksdb_status_counters.<name> and exposes it as a LONGLONG
  status variable.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = (char *)&rocksdb_status_counters.name;                    \
    return HA_EXIT_SUCCESS;                                                \
  }

/* SHOW_VAR entry backed by a DEF_SHOW_FUNC callback ("rocksdb_<name>"). */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* SHOW_VAR entry backed directly by a variable's address ("rocksdb_<name>"). */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* SHOW_VAR entry with a caller-supplied name, backed by a variable. */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
9370
/*
  Cached copies of RocksDB ticker (statistics) values. Each field is
  refreshed on demand by the matching DEF_SHOW_FUNC callback below and
  exposed through the rocksdb_status_vars SHOW_VAR table.
*/
struct rocksdb_status_counters_t {
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t bloom_filter_useful;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t l0_slowdown_micros;
  uint64_t memtable_compaction_micros;
  uint64_t l0_num_files_stall_micros;
  uint64_t rate_limit_delay_millis;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};
9425
/* Backing storage for the ticker values exposed via SHOW STATUS. */
static rocksdb_status_counters_t rocksdb_status_counters;

/*
  One SHOW-status callback per RocksDB ticker; each reads the current
  ticker value from rocksdb_stats into the struct above.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(l0_slowdown_micros, STALL_L0_SLOWDOWN_MICROS)
DEF_SHOW_FUNC(memtable_compaction_micros, STALL_MEMTABLE_COMPACTION_MICROS)
DEF_SHOW_FUNC(l0_num_files_stall_micros, STALL_L0_NUM_FILES_MICROS)
DEF_SHOW_FUNC(rate_limit_delay_millis, RATE_LIMIT_DELAY_MILLIS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
9480
9481 static void myrocks_update_status() {
9482 export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
9483 export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
9484 export_stats.rows_read = global_stats.rows[ROWS_READ];
9485 export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
9486
9487 export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
9488 export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
9489 export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
9490 export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
9491 }
9492
/*
  MyRocks row-operation counters; nested under the "rocksdb" entry of
  rocksdb_status_vars and refreshed by show_myrocks_vars().
*/
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted",
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted",
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated",
                        &export_stats.system_rows_updated, SHOW_LONGLONG),

    /* Null terminator required by the SHOW_VAR protocol. */
    {NullS, NullS, SHOW_LONG}};
9511
show_myrocks_vars(THD * thd,SHOW_VAR * var,char * buff)9512 static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
9513 myrocks_update_status();
9514 var->type = SHOW_ARRAY;
9515 var->value = reinterpret_cast<char *>(&myrocks_status_variables);
9516 }
9517
/*
  Top-level SHOW STATUS table for the RocksDB storage engine: one entry per
  RocksDB ticker (via DEF_STATUS_VAR), plus MyRocks-internal counters and
  the nested "rocksdb" array of row-operation stats.
*/
static SHOW_VAR rocksdb_status_vars[] = {
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(l0_slowdown_micros),
    DEF_STATUS_VAR(memtable_compaction_micros),
    DEF_STATUS_VAR(l0_num_files_stall_micros),
    DEF_STATUS_VAR(rate_limit_delay_millis),
    DEF_STATUS_VAR(num_iterators),
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(getupdatessince_calls),
    DEF_STATUS_VAR(block_cachecompressed_miss),
    DEF_STATUS_VAR(block_cachecompressed_hit),
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    /* MyRocks-internal counters (not RocksDB tickers). */
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_stat_computes", &rocksdb_number_stat_computes,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
    /* Nested array of MyRocks row-operation counters. */
    {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
    /* Null terminator required by the SHOW_VAR protocol. */
    {NullS, NullS, SHOW_LONG}};
9589
9590 /*
9591 Background thread's main logic
9592 */
9593
/*
  Background thread main loop: wakes up roughly once per second to persist
  index statistics (when requested) and to sync the RocksDB WAL; exits when
  signaled to stop during engine unload.
*/
void Rdb_background_thread::run() {
  // How many seconds to wait till flushing the WAL next time.
  const int WAKE_UP_INTERVAL = 1;

  // Absolute deadline of the next periodic wake-up.
  timespec ts_next_sync;
  clock_gettime(CLOCK_REALTIME, &ts_next_sync);
  ts_next_sync.tv_sec += WAKE_UP_INTERVAL;

  for (;;) {
    // Wait until the next timeout or until we receive a signal to stop the
    // thread. Request to stop the thread should only be triggered when the
    // storage engine is being unloaded.
    mysql_mutex_lock(&m_signal_mutex);
    const auto ret __attribute__((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);

    // Check that we receive only the expected error codes.
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Snapshot the flags while holding the mutex, then release it promptly.
    const bool local_stop = m_stop;
    const bool local_save_stats = m_save_stats;
    reset();
    mysql_mutex_unlock(&m_signal_mutex);

    if (local_stop) {
      // If we're here then that's because condition variable was signaled by
      // another thread and we're shutting down. Break out the loop to make
      // sure that shutdown thread can proceed.
      break;
    }

    // This path should be taken only when the timer expired.
    // NOTE(review): this assumes a non-stop wake-up can only come from the
    // timeout; if m_save_stats can be set and signaled without m_stop, ret
    // would be 0 here and this assert would fire in debug builds -- confirm
    // the signaling contract of this class.
    DBUG_ASSERT(ret == ETIMEDOUT);

    if (local_save_stats) {
      ddl_manager.persist_stats();
    }

    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);

    // Flush the WAL.
    if (rdb && rocksdb_background_sync) {
      // SyncWAL() is not supported with mmap'ed WAL writes.
      DBUG_ASSERT(!rocksdb_db_options.allow_mmap_writes);
      const rocksdb::Status s = rdb->SyncWAL();
      if (!s.ok()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
    }

    // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
    // pthread_cond_timedwait()) to wait on.
    ts_next_sync.tv_sec = ts.tv_sec + WAKE_UP_INTERVAL;
  }

  // save remaining stats which might've left unsaved
  ddl_manager.persist_stats();
}
9651
9652 /**
9653 Deciding if it is possible to use bloom filter or not.
9654
9655 @detail
9656 Even if bloom filter exists, it is not always possible
9657 to use bloom filter. If using bloom filter when you shouldn't,
9658 false negative may happen -- fewer rows than expected may be returned.
9659 It is users' responsibility to use bloom filter correctly.
9660
9661 If bloom filter does not exist, return value does not matter because
9662 RocksDB does not use bloom filter internally.
9663
9664 @param kd
9665 @param eq_cond Equal condition part of the key. This always includes
9666 system index id (4 bytes).
9667 @param use_all_keys True if all key parts are set with equal conditions.
9668 This is aware of extended keys.
9669 */
can_use_bloom_filter(THD * thd,const Rdb_key_def & kd,const rocksdb::Slice & eq_cond,const bool use_all_keys,bool is_ascending)9670 bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
9671 const rocksdb::Slice &eq_cond,
9672 const bool use_all_keys, bool is_ascending) {
9673 bool can_use = false;
9674
9675 if (THDVAR(thd, skip_bloom_filter_on_read)) {
9676 return can_use;
9677 }
9678
9679 const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
9680 if (prefix_extractor) {
9681 /*
9682 This is an optimized use case for CappedPrefixTransform.
9683 If eq_cond length >= prefix extractor length and if
9684 all keys are used for equal lookup, it is
9685 always possible to use bloom filter.
9686
9687 Prefix bloom filter can't be used on descending scan with
9688 prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
9689 RocksDB's limitation. On ascending (or not sorting) scan,
9690 keys longer than the capped prefix length will be truncated down
9691 to the capped length and the resulting key is added to the bloom filter.
9692
9693 Keys shorter than the capped prefix length will be added to
9694 the bloom filter. When keys are looked up, key conditionals
9695 longer than the capped length can be used; key conditionals
9696 shorter require all parts of the key to be available
9697 for the short key match.
9698 */
9699 if (use_all_keys && prefix_extractor->InRange(eq_cond))
9700 can_use = true;
9701 else if (!is_ascending)
9702 can_use = false;
9703 else if (prefix_extractor->SameResultWhenAppended(eq_cond))
9704 can_use = true;
9705 else
9706 can_use = false;
9707 } else {
9708 /*
9709 if prefix extractor is not defined, all key parts have to be
9710 used by eq_cond.
9711 */
9712 if (use_all_keys)
9713 can_use = true;
9714 else
9715 can_use = false;
9716 }
9717
9718 return can_use;
9719 }
9720
9721 /* For modules that need access to the global data structures */
rdb_get_rocksdb_db()9722 rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }
9723
rdb_get_cf_manager()9724 Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }
9725
rdb_get_table_options()9726 rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
9727 return rocksdb_tbl_options;
9728 }
9729
rdb_get_table_perf_counters(const char * const tablename,Rdb_perf_counters * const counters)9730 int rdb_get_table_perf_counters(const char *const tablename,
9731 Rdb_perf_counters *const counters) {
9732 DBUG_ASSERT(counters != nullptr);
9733 DBUG_ASSERT(tablename != nullptr);
9734
9735 Rdb_table_handler *table_handler;
9736 table_handler = rdb_open_tables.get_table_handler(tablename);
9737 if (table_handler == nullptr) {
9738 return HA_ERR_INTERNAL_ERROR;
9739 }
9740
9741 counters->load(table_handler->m_table_perf_context);
9742
9743 rdb_open_tables.release_table_handler(table_handler);
9744 return HA_EXIT_SUCCESS;
9745 }
9746
/*
  Translate a RocksDB status into server-side action: log the failure and,
  for unrecoverable WAL/general-I/O errors and data corruption, abort the
  server with stack traces. Which failures are fatal depends on where the
  I/O happened (err_type); background-thread errors are only warned about.
*/
void rdb_handle_io_error(rocksdb::Status status, RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    switch (err_type) {
    case RDB_IO_ERROR_TX_COMMIT:
    case RDB_IO_ERROR_DICT_COMMIT: {
      sql_print_error("RocksDB: Failed to write to WAL - status %d, %s",
                      status.code(), status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on WAL write error.");
      /* A lost WAL write would silently lose committed data. */
      abort_with_stack_traces();
      break;
    }
    case RDB_IO_ERROR_BG_THREAD: {
      /* Background sync failures are non-fatal; the next wake-up retries. */
      sql_print_warning("RocksDB: BG Thread failed to write to RocksDB "
                        "- status %d, %s",
                        status.code(), status.ToString().c_str());
      break;
    }
    case RDB_IO_ERROR_GENERAL: {
      sql_print_error("RocksDB: Failed on I/O - status %d, %s", status.code(),
                      status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on I/O error.");
      abort_with_stack_traces();
      break;
    }
    default:
      DBUG_ASSERT(0);
      break;
    }
  } else if (status.IsCorruption()) {
    /* NO_LINT_DEBUG */
    sql_print_error("RocksDB: Data Corruption detected! %d, %s", status.code(),
                    status.ToString().c_str());
    /* NO_LINT_DEBUG */
    sql_print_error("RocksDB: Aborting because of data corruption.");
    abort_with_stack_traces();
  } else if (!status.ok()) {
    /* Non-I/O, non-corruption failure. */
    switch (err_type) {
    case RDB_IO_ERROR_DICT_COMMIT: {
      sql_print_error("RocksDB: Failed to write to WAL (dictionary) - "
                      "status %d, %s",
                      status.code(), status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on WAL write error.");
      abort_with_stack_traces();
      break;
    }
    default:
      sql_print_warning("RocksDB: Failed to read/write in RocksDB "
                        "- status %d, %s",
                        status.code(), status.ToString().c_str());
      break;
    }
  }
}
9800
rdb_get_dict_manager(void)9801 Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }
9802
rdb_get_ddl_manager(void)9803 Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }
9804
rocksdb_set_compaction_options(my_core::THD * const thd,my_core::st_mysql_sys_var * const var,void * const var_ptr,const void * const save)9805 void rocksdb_set_compaction_options(my_core::THD *const thd
9806 __attribute__((__unused__)),
9807 my_core::st_mysql_sys_var *const var
9808 __attribute__((__unused__)),
9809 void *const var_ptr,
9810 const void *const save) {
9811 if (var_ptr && save) {
9812 *(uint64_t *)var_ptr = *(const uint64_t *)save;
9813 }
9814 const Rdb_compact_params params = {
9815 (uint64_t)rocksdb_compaction_sequential_deletes,
9816 (uint64_t)rocksdb_compaction_sequential_deletes_window,
9817 (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
9818 if (properties_collector_factory) {
9819 properties_collector_factory->SetCompactionParams(params);
9820 }
9821 }
9822
rocksdb_set_table_stats_sampling_pct(my_core::THD * const thd,my_core::st_mysql_sys_var * const var,void * const var_ptr,const void * const save)9823 void rocksdb_set_table_stats_sampling_pct(
9824 my_core::THD *const thd __attribute__((__unused__)),
9825 my_core::st_mysql_sys_var *const var __attribute__((__unused__)),
9826 void *const var_ptr __attribute__((__unused__)), const void *const save) {
9827 mysql_mutex_lock(&rdb_sysvars_mutex);
9828
9829 const uint32_t new_val = *static_cast<const uint32_t *>(save);
9830
9831 if (new_val != rocksdb_table_stats_sampling_pct) {
9832 rocksdb_table_stats_sampling_pct = new_val;
9833
9834 if (properties_collector_factory) {
9835 properties_collector_factory->SetTableStatsSamplingPct(
9836 rocksdb_table_stats_sampling_pct);
9837 }
9838 }
9839
9840 mysql_mutex_unlock(&rdb_sysvars_mutex);
9841 }
9842
9843 /*
9844 This function allows setting the rate limiter's bytes per second value
9845 but only if the rate limiter is turned on which has to be done at startup.
9846 If the rate is already 0 (turned off) or we are changing it to 0 (trying
9847 to turn it off) this function will push a warning to the client and do
9848 nothing.
9849 This is similar to the code in innodb_doublewrite_update (found in
9850 storage/innobase/handler/ha_innodb.cc).
9851 */
rocksdb_set_rate_limiter_bytes_per_sec(my_core::THD * const thd,my_core::st_mysql_sys_var * const var,void * const var_ptr,const void * const save)9852 void rocksdb_set_rate_limiter_bytes_per_sec(my_core::THD *const thd,
9853 my_core::st_mysql_sys_var *const var
9854 __attribute__((__unused__)),
9855 void *const var_ptr
9856 __attribute__((__unused__)),
9857 const void *const save) {
9858 const uint64_t new_val = *static_cast<const uint64_t *>(save);
9859 if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
9860 /*
9861 If a rate_limiter was not enabled at startup we can't change it nor
9862 can we disable it if one was created at startup
9863 */
9864 push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
9865 "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
9866 "be dynamically changed to or from 0. Do a clean "
9867 "shutdown if you want to change it from or to 0.");
9868 } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
9869 /* Apply the new value to the rate limiter and store it locally */
9870 DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
9871 rocksdb_rate_limiter_bytes_per_sec = new_val;
9872 rocksdb_rate_limiter->SetBytesPerSecond(new_val);
9873 }
9874 }
9875
rdb_set_collation_exception_list(const char * const exception_list)9876 void rdb_set_collation_exception_list(const char *const exception_list) {
9877 DBUG_ASSERT(rdb_collation_exceptions != nullptr);
9878
9879 int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
9880 if (lower_case_table_names)
9881 flags |= MY_REG_ICASE;
9882 if (!rdb_collation_exceptions->compile(
9883 exception_list, flags, table_alias_charset)) {
9884 warn_about_bad_patterns(*rdb_collation_exceptions,
9885 "strict_collation_exceptions");
9886 }
9887 }
9888
rocksdb_set_collation_exception_list(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)9889 void rocksdb_set_collation_exception_list(THD *const thd,
9890 struct st_mysql_sys_var *const var,
9891 void *const var_ptr,
9892 const void *const save) {
9893 const char *const val = *static_cast<const char *const *>(save);
9894
9895 rdb_set_collation_exception_list(val);
9896
9897 *static_cast<const char **>(var_ptr) = val;
9898 }
9899
rocksdb_set_bulk_load(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)9900 void rocksdb_set_bulk_load(THD *const thd, struct st_mysql_sys_var *const var
9901 __attribute__((__unused__)),
9902 void *const var_ptr, const void *const save) {
9903 Rdb_transaction *&tx = get_tx_from_thd(thd);
9904
9905 if (tx != nullptr) {
9906 const int rc = tx->finish_bulk_load();
9907 if (rc != 0) {
9908 // NO_LINT_DEBUG
9909 sql_print_error("RocksDB: Error %d finalizing last SST file while "
9910 "setting bulk loading variable",
9911 rc);
9912 abort_with_stack_traces();
9913 }
9914 }
9915
9916 *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);
9917 }
9918
rocksdb_set_max_background_compactions(THD * thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)9919 static void rocksdb_set_max_background_compactions(
9920 THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr,
9921 const void *const save) {
9922 DBUG_ASSERT(save != nullptr);
9923
9924 mysql_mutex_lock(&rdb_sysvars_mutex);
9925 rocksdb_db_options.max_background_compactions =
9926 *static_cast<const int *>(save);
9927 rocksdb_db_options.env->SetBackgroundThreads(
9928 rocksdb_db_options.max_background_compactions,
9929 rocksdb::Env::Priority::LOW);
9930 mysql_mutex_unlock(&rdb_sysvars_mutex);
9931 }
9932
rdb_queue_save_stats_request()9933 void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
9934
/* Mark entry into a replication Delete_rows event; the flag is read by
   use_read_free_rpl(). */
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
9942
/* Mark exit from a replication Delete_rows event. */
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
9950
/* Mark entry into a replication Update_rows event; the flag is read by
   use_read_free_rpl(). */
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
9958
/* Mark exit from a replication Update_rows event. */
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
9966
9967 /**
9968 @brief
9969 Read Free Replication can be used or not. Returning False means
9970 Read Free Replication can be used. Read Free Replication can be used
9971 on UPDATE or DELETE row events, and table must have user defined
9972 primary key.
9973 */
use_read_free_rpl()9974 bool ha_rocksdb::use_read_free_rpl() {
9975 DBUG_ENTER_FUNC();
9976
9977 DBUG_RETURN((m_in_rpl_delete_rows || m_in_rpl_update_rows) &&
9978 !has_hidden_pk(table) && m_use_read_free_rpl);
9979 }
9980
read_time(uint index,uint ranges,ha_rows rows)9981 double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
9982 DBUG_ENTER_FUNC();
9983
9984 if (index != table->s->primary_key) {
9985 /* Non covering index range scan */
9986 DBUG_RETURN(handler::read_time(index, ranges, rows));
9987 }
9988
9989 DBUG_RETURN((rows / 20.0) + 1);
9990 }
9991
9992 } // namespace myrocks
9993
9994 /*
9995 Register the storage engine plugin outside of myrocks namespace
9996 so that mysql_declare_plugin does not get confused when it does
9997 its name generation.
9998 */
9999
/* Storage-engine descriptor referenced by the plugin declaration below. */
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};
10002
/*
  Plugin registration.  The first entry is the storage engine itself;
  the remaining entries register the MyRocks INFORMATION_SCHEMA tables
  alongside it.
*/
mysql_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN, /* Plugin Type */
    &rocksdb_storage_engine, /* Plugin Descriptor */
    "ROCKSDB", /* Plugin Name */
    "Monty Program Ab", /* Plugin Author */
    "RocksDB storage engine", /* Plugin Description */
    PLUGIN_LICENSE_GPL, /* Plugin Licence */
    myrocks::rocksdb_init_func, /* Plugin Entry Point */
    myrocks::rocksdb_done_func, /* Plugin Deinitializer */
    0x0001, /* version number (0.1) */
    myrocks::rocksdb_status_vars, /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    nullptr, /* config options */
    0, /* flags */
    },
    /* INFORMATION_SCHEMA plugins shipped with the engine */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info,
    myrocks::rdb_i_s_trx_info mysql_declare_plugin_end;
10024