1 /*
2 Copyright (c) 2012, Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
17 #ifdef USE_PRAGMA_IMPLEMENTATION
18 #pragma implementation // gcc: Class implementation
19 #endif
20
21 #define MYSQL_SERVER 1
22
23 /* The C++ file's header */
24 #include "./ha_rocksdb.h"
25
26 #ifdef TARGET_OS_LINUX
27 #include <errno.h>
28 #include <sys/resource.h>
29 #include <sys/syscall.h>
30 #include <sys/types.h>
31 #endif
32
/* C++ standard header files */
#include <inttypes.h>
#include <algorithm>
#include <limits>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <string>
#include <vector>
42
43 /* MySQL includes */
44 #include "debug_sync.h"
45 #include "my_bit.h"
46 #include "my_stacktrace.h"
47 #include "my_sys.h"
48 #include "mysql/psi/mysql_table.h"
49 #include "mysql/thread_pool_priv.h"
50 #include "mysys_err.h"
51 #include "sql_audit.h"
52 #include "sql_partition.h"
53 #include "sql_table.h"
54
55 /* RocksDB includes */
56 #include "monitoring/histogram.h"
57 #include "rocksdb/compaction_filter.h"
58 #include "rocksdb/env.h"
59 #include "rocksdb/memory_allocator.h"
60 #include "rocksdb/persistent_cache.h"
61 #include "rocksdb/rate_limiter.h"
62 #include "rocksdb/slice_transform.h"
63 #include "rocksdb/thread_status.h"
64 #include "rocksdb/trace_reader_writer.h"
65 #include "rocksdb/utilities/checkpoint.h"
66 #include "rocksdb/utilities/convenience.h"
67 #include "rocksdb/utilities/memory_util.h"
68 #include "rocksdb/utilities/sim_cache.h"
69 #include "rocksdb/utilities/write_batch_with_index.h"
70 #include "util/stop_watch.h"
71
72 /* MyRocks includes */
73 #include "./event_listener.h"
74 #include "./ha_rocksdb_proto.h"
75 #include "./ha_rockspart.h"
76 #include "./logger.h"
77 #include "./rdb_cf_manager.h"
78 #include "./rdb_cf_options.h"
79 #include "./rdb_converter.h"
80 #include "./rdb_datadic.h"
81 #include "./rdb_i_s.h"
82 #include "./rdb_index_merge.h"
83 #include "./rdb_mutex_wrapper.h"
84 #include "./rdb_psi.h"
85 #include "./rdb_threads.h"
86
87 // Internal MySQL APIs not exposed in any header.
88 extern "C" {
89 /**
90 * Get the user thread's binary logging format
91 * @param thd user thread
92 * @return Value to be used as index into the binlog_format_names array
93 */
94 int thd_binlog_format(const MYSQL_THD thd);
95
96 /**
97 * Check if binary logging is filtered for thread's current db.
98 * @param thd Thread handle
99 * @retval 1 the query is not filtered, 0 otherwise.
100 */
101 bool thd_binlog_filter_ok(const MYSQL_THD thd);
102 }
103
104 namespace myrocks {
105
// Process-wide counters; global_stats row counters are bumped from
// ha_rocksdb::update_row_stats() / inc_covered_sk_lookup() below.
static st_global_stats global_stats;
static st_export_stats export_stats;
static st_memory_stats memory_stats;
static st_io_stall_stats io_stall_stats;

// Well-known column family names used throughout MyRocks.
const std::string DEFAULT_CF_NAME("default");
const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
const std::string PER_INDEX_CF_NAME("$per_index_cf");
const std::string DEFAULT_SK_CF_NAME("default_sk");
// Name prefix for temporary tables; presumably used while a TRUNCATE is
// in flight -- confirm at use sites.
const std::string TRUNCATE_TABLE_PREFIX("#truncate_tmp#");

// Tables queued for statistics recalculation.
static std::vector<std::string> rdb_tables_to_recalc;

static Rdb_exec_time st_rdb_exec_time;
120
/**
  Updates global row counters based on the table type and operation type.

  @param type  the row operation being accounted (must be < ROWS_MAX);
               used as an index into the per-operation counter arrays.
*/
void ha_rocksdb::update_row_stats(const operation_type &type) {
  assert(type < ROWS_MAX);
  // Rows of MySQL system tables are counted separately from user rows.
  if (table->s && m_tbl_def->m_is_mysql_system_table) {
    global_stats.system_rows[type].inc();
  } else {
    global_stats.rows[type].inc();
  }
}
133
// Bump the global counter of covered (index-only) secondary key lookups.
void ha_rocksdb::inc_covered_sk_lookup() {
  global_stats.covered_secondary_key_lookups.inc();
}
137
138 void dbug_dump_database(rocksdb::DB *db);
139 static handler *rocksdb_create_handler(my_core::handlerton *hton,
140 my_core::TABLE_SHARE *table_arg,
141 my_core::MEM_ROOT *mem_root);
142
getCompactRangeOptions(int concurrency=0,rocksdb::BottommostLevelCompaction bottommost_level_compaction=rocksdb::BottommostLevelCompaction::kForceOptimized)143 static rocksdb::CompactRangeOptions getCompactRangeOptions(
144 int concurrency = 0,
145 rocksdb::BottommostLevelCompaction bottommost_level_compaction =
146 rocksdb::BottommostLevelCompaction::kForceOptimized) {
147 rocksdb::CompactRangeOptions compact_range_options;
148 compact_range_options.bottommost_level_compaction =
149 bottommost_level_compaction;
150 compact_range_options.exclusive_manual_compaction = false;
151 if (concurrency > 0) {
152 compact_range_options.max_subcompactions = concurrency;
153 }
154 return compact_range_options;
155 }
156
///////////////////////////////////////////////////////////
// Parameters and settings
///////////////////////////////////////////////////////////
// Raw sysvar string values for column-family options; parsed elsewhere.
static char *rocksdb_default_cf_options = nullptr;
static char *rocksdb_override_cf_options = nullptr;
static char *rocksdb_update_cf_options = nullptr;
// Whether the "default_sk" CF is in use (it is then also protected from
// being dropped -- see rocksdb_delete_column_family()).
static my_bool rocksdb_use_default_sk_cf = false;

///////////////////////////////////////////////////////////
// Globals
///////////////////////////////////////////////////////////
// Handlerton registered with the server for this storage engine.
handlerton *rocksdb_hton;

// The single TransactionDB instance backing all MyRocks tables.
rocksdb::TransactionDB *rdb = nullptr;
rocksdb::HistogramImpl *commit_latency_stats = nullptr;

static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;

Rdb_dict_manager dict_manager;
Rdb_cf_manager cf_manager;
Rdb_ddl_manager ddl_manager;
Rdb_hton_init_state hton_init_state;
180
181 /**
182 MyRocks background thread control
183 N.B. This is besides RocksDB's own background threads
184 (@see rocksdb::CancelAllBackgroundWork())
185 */
186
187 static Rdb_background_thread rdb_bg_thread;
188
189 static Rdb_index_stats_thread rdb_is_thread;
190
191 static Rdb_manual_compaction_thread rdb_mc_thread;
192
193 static Rdb_drop_index_thread rdb_drop_idx_thread;
194 // List of table names (using regex) that are exceptions to the strict
195 // collation check requirement.
196 Regex *rdb_collation_exceptions;
197
198 static const char *rdb_get_error_messages(int error);
199
rocksdb_flush_all_memtables()200 static void rocksdb_flush_all_memtables() {
201 const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
202
203 // RocksDB will fail the flush if the CF is deleted,
204 // but here we don't handle return status
205 for (const auto &cf_handle : cf_manager.get_all_cf()) {
206 rdb->Flush(rocksdb::FlushOptions(), cf_handle.get());
207 }
208 }
209
// No-op update hook for the delete-CF sysvar: all of the work happens in
// the validation callback (rocksdb_delete_column_family), so there is
// nothing left to apply here.
static void rocksdb_delete_column_family_stub(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, const void *const /* save */) {}
213
rocksdb_delete_column_family(THD * const,struct st_mysql_sys_var * const,void * const,struct st_mysql_value * const value)214 static int rocksdb_delete_column_family(
215 THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
216 void *const /* var_ptr */, struct st_mysql_value *const value) {
217 assert(value != nullptr);
218
219 char buff[STRING_BUFFER_USUAL_SIZE];
220 int len = sizeof(buff);
221
222 const char *const cf = value->val_str(value, buff, &len);
223 if (cf == nullptr) return HA_EXIT_SUCCESS;
224
225 std::string cf_name = std::string(cf);
226 // Forbid to remove these built-in CFs
227 if (cf_name == DEFAULT_SYSTEM_CF_NAME || cf_name == DEFAULT_CF_NAME ||
228 cf_name.empty() ||
229 (cf_name == DEFAULT_SK_CF_NAME && rocksdb_use_default_sk_cf)) {
230 my_error(ER_CANT_DROP_CF, MYF(0), cf);
231 return HA_EXIT_FAILURE;
232 }
233
234 auto &cf_manager = rdb_get_cf_manager();
235 int ret = 0;
236
237 {
238 std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
239 ret = cf_manager.drop_cf(&ddl_manager, &dict_manager, cf_name);
240 }
241
242 if (ret == HA_EXIT_SUCCESS) {
243 rdb_drop_idx_thread.signal();
244 } else {
245 my_error(ER_CANT_DROP_CF, MYF(0), cf);
246 }
247
248 return ret;
249 }
250
251 ///////////////////////////////////////////////////////////
252 // Hash map: table name => open table handler
253 ///////////////////////////////////////////////////////////
254
namespace  // anonymous namespace = not visible outside this source file
{

/*
  Tracks the Rdb_table_handler of every currently open table, keyed by
  table name. The map itself is guarded by m_mutex; init() must be called
  before first use and free() when shutting down.
*/
class Rdb_open_tables_map {
 private:
  /* Hash table used to track the handlers of open tables */
  std::unordered_map<std::string, Rdb_table_handler *> m_table_map;

  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

 public:
  // Reset the map and create the protecting mutex.
  void init() {
    m_table_map.clear();
    mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &m_mutex, MY_MUTEX_INIT_FAST);
  }

  // Drop all entries and destroy the mutex; counterpart of init().
  void free() {
    m_table_map.clear();
    mysql_mutex_destroy(&m_mutex);
  }

  // Number of tracked handlers (reads the map without taking m_mutex).
  size_t count() { return m_table_map.size(); }

  Rdb_table_handler *get_table_handler(const char *const table_name);
  void release_table_handler(Rdb_table_handler *const table_handler);

  std::vector<std::string> get_table_names(void) const;
};

}  // anonymous namespace
286
287 static Rdb_open_tables_map rdb_open_tables;
288
// Strip all trailing '/' characters from a directory path:
// "/a/b///" -> "/a/b"; a string consisting only of slashes becomes "".
static std::string rdb_normalize_dir(std::string dir) {
  const auto last_keep = dir.find_last_not_of('/');
  // npos + 1 wraps to 0, so an all-slash (or empty) input is fully erased,
  // matching the original trim-loop behavior.
  dir.erase(last_keep + 1);
  return dir;
}
295
/*
  Validation callback for the rocksdb_create_checkpoint sysvar: creates a
  RocksDB checkpoint (an openable snapshot of the DB) in the directory
  supplied as the variable's new value (trailing slashes are trimmed).

  @return HA_EXIT_SUCCESS when the checkpoint was created;
          HA_EXIT_FAILURE otherwise (including when no value was given or
          the DB handle is not initialized yet).
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propagate up to the client.
      int rc MY_ATTRIBUTE((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        delete checkpoint;
        if (status.ok()) {
          // NO_LINT_DEBUG
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
335
/* This no-op update hook is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only; the checkpoint is
   actually created in the validation callback above. */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
342
// No-op update hook: the flush itself happens in the validation callback
// rocksdb_force_flush_memtable_now() below.
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
346
/*
  Validation callback that flushes the memtables of all column families.
  Always reports HA_EXIT_SUCCESS; per-CF flush failures are ignored
  (see rocksdb_flush_all_memtables()).
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
355
// No-op update hook: the flush + L0 compaction happens in the validation
// callback rocksdb_force_flush_memtable_and_lzero_now() below.
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
359
/*
  Validation callback that flushes all memtables and then compacts every
  column family's L0 files down one level via CompactFiles. Each CF is
  retried up to max_attempts times, because CompactFiles can race with
  automatic compaction (surfacing as InvalidArgument).

  @return HA_EXIT_SUCCESS if every CF was fully compacted (or dropped
          concurrently), HA_EXIT_FAILURE otherwise.
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  static constexpr int max_attempts = 3;
  int i, num_errors = 0;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    for (i = 0; i < max_attempts; i++) {
      // Refresh the L0 file list on every attempt; it changes as
      // compactions complete.
      rdb->GetColumnFamilyMetaData(cf_handle.get(), &metadata);
      cf_handle->GetDescriptor(&cf_descr);
      c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

      assert(metadata.levels[0].level == 0);
      std::vector<std::string> file_names;
      for (const auto &file : metadata.levels[0].files) {
        file_names.emplace_back(file.db_path + file.name);
      }

      // Nothing left in L0 for this CF: done.
      if (file_names.empty()) {
        break;
      }

      rocksdb::Status s;
      s = rdb->CompactFiles(c_options, cf_handle.get(), file_names, 1);

      if (!s.ok()) {
        std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
            cf_manager.get_cf(cf_handle->GetID());

        // If the CF handle has been removed from cf_manager, it is not an
        // error. We are done with this CF and proceed to the next CF.
        if (!cfh) {
          // NO_LINT_DEBUG
          sql_print_information("cf %s has been dropped during CompactFiles.",
                                cf_handle->GetName().c_str());
          break;
        }

        // Due to a race, it's possible for CompactFiles to collide
        // with auto compaction, causing an error to return
        // regarding file not found. In that case, retry.
        if (s.IsInvalidArgument()) {
          continue;
        }

        // Any other non-aborted failure is treated as an I/O error and
        // terminates the whole operation. IsAborted() just ends this CF.
        if (!s.ok() && !s.IsAborted()) {
          rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
          return HA_EXIT_FAILURE;
        }
        break;
      }
    }
    // All retries consumed without emptying L0: count an error for this CF.
    if (i == max_attempts) {
      num_errors++;
    }
  }

  return num_errors == 0 ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
}
428
429 static void rocksdb_drop_index_wakeup_thread(
430 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
431 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
432 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save);
433
434 static my_bool rocksdb_pause_background_work = 0;
435 static mysql_mutex_t rdb_sysvars_mutex;
436 static mysql_mutex_t rdb_block_cache_resize_mutex;
437 static mysql_mutex_t rdb_bottom_pri_background_compactions_resize_mutex;
438
rocksdb_set_pause_background_work(my_core::THD * const thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * const var MY_ATTRIBUTE ((__unused__)),void * const var_ptr MY_ATTRIBUTE ((__unused__)),const void * const save)439 static void rocksdb_set_pause_background_work(
440 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
441 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
442 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
443 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
444 const bool pause_requested = *static_cast<const bool *>(save);
445 if (rocksdb_pause_background_work != pause_requested) {
446 if (pause_requested) {
447 rdb->PauseBackgroundWork();
448 } else {
449 rdb->ContinueBackgroundWork();
450 }
451 rocksdb_pause_background_work = pause_requested;
452 }
453 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
454 }
455
456 static void rocksdb_set_compaction_options(THD *thd,
457 struct st_mysql_sys_var *var,
458 void *var_ptr, const void *save);
459
460 static void rocksdb_set_table_stats_sampling_pct(THD *thd,
461 struct st_mysql_sys_var *var,
462 void *var_ptr,
463 const void *save);
464
465 static void rocksdb_update_table_stats_use_table_scan(
466 THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
467 void *const var_ptr, const void *const save);
468
469 static int rocksdb_index_stats_thread_renice(
470 THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
471 void *const save, struct st_mysql_value *const value);
472
473 static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
474 struct st_mysql_sys_var *var,
475 void *var_ptr,
476 const void *save);
477
478 static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd,
479 struct st_mysql_sys_var *var,
480 void *var_ptr,
481 const void *save);
482
483 static void rocksdb_set_delayed_write_rate(THD *thd,
484 struct st_mysql_sys_var *var,
485 void *var_ptr, const void *save);
486
487 static void rocksdb_set_max_latest_deadlocks(THD *thd,
488 struct st_mysql_sys_var *var,
489 void *var_ptr, const void *save);
490
491 static void rdb_set_collation_exception_list(const char *exception_list);
492 static void rocksdb_set_collation_exception_list(THD *thd,
493 struct st_mysql_sys_var *var,
494 void *var_ptr,
495 const void *save);
496
497 static int rocksdb_validate_update_cf_options(THD *thd,
498 struct st_mysql_sys_var *var,
499 void *save,
500 st_mysql_value *value);
501
502 static void rocksdb_set_update_cf_options(THD *thd,
503 struct st_mysql_sys_var *var,
504 void *var_ptr, const void *save);
505
506 static int rocksdb_check_bulk_load(
507 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
508 void *save, struct st_mysql_value *value);
509
510 static int rocksdb_check_bulk_load_allow_unsorted(
511 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
512 void *save, struct st_mysql_value *value);
513
514 static void rocksdb_set_max_background_jobs(THD *thd,
515 struct st_mysql_sys_var *const var,
516 void *const var_ptr,
517 const void *const save);
518 static void rocksdb_set_max_background_compactions(THD *thd,
519 struct st_mysql_sys_var *const var,
520 void *const var_ptr,
521 const void *const save);
522
523 static void rocksdb_set_bytes_per_sync(THD *thd,
524 struct st_mysql_sys_var *const var,
525 void *const var_ptr,
526 const void *const save);
527 static void rocksdb_set_wal_bytes_per_sync(THD *thd,
528 struct st_mysql_sys_var *const var,
529 void *const var_ptr,
530 const void *const save);
531 static int rocksdb_validate_set_block_cache_size(
532 THD *thd, struct st_mysql_sys_var *const var, void *var_ptr,
533 struct st_mysql_value *value);
534 static int rocksdb_tracing(THD *const thd MY_ATTRIBUTE((__unused__)),
535 struct st_mysql_sys_var *const var, void *const save,
536 struct st_mysql_value *const value,
537 bool trace_block_cache_access = true);
538 static int rocksdb_validate_max_bottom_pri_background_compactions(
539 THD *thd MY_ATTRIBUTE((__unused__)),
540 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
541 void *var_ptr, struct st_mysql_value *value);
542 //////////////////////////////////////////////////////////////////////////////
543 // Options definitions
544 //////////////////////////////////////////////////////////////////////////////
545 static const constexpr ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
546 static const constexpr ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;
547 static const constexpr ulong RDB_DEFAULT_ROW_LOCKS = 1024 * 1024;
548 static const constexpr ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
549 static const constexpr ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
550 static const constexpr size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;
551 static const constexpr size_t RDB_MIN_MERGE_BUF_SIZE = 100;
552 static const constexpr size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE =
553 1024 * 1024 * 1024;
554 static const constexpr size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
555 static const constexpr size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
556 static const constexpr size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
557 static const constexpr int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;
558 static const constexpr int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
559 static const constexpr int RDB_MAX_CHECKSUMS_PCT = 100;
560 static const constexpr uint32_t
561 RDB_DEFAULT_FORCE_COMPUTE_MEMTABLE_STATS_CACHETIME = 60 * 1000 * 1000;
562 static const constexpr ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
563 static const constexpr uint ROCKSDB_MAX_BOTTOM_PRI_BACKGROUND_COMPACTIONS = 64;
564
565 static long long rocksdb_block_cache_size = RDB_DEFAULT_BLOCK_CACHE_SIZE;
566 static long long rocksdb_sim_cache_size = 0;
567 static double rocksdb_cache_high_pri_pool_ratio = 0.0;
568 static my_bool rocksdb_cache_dump = FALSE;
569 /* Use unsigned long long instead of uint64_t because of MySQL compatibility */
570 static unsigned long long // NOLINT(runtime/int)
571 rocksdb_rate_limiter_bytes_per_sec = 0;
572 static unsigned long long // NOLINT(runtime/int)
573 rocksdb_sst_mgr_rate_bytes_per_sec = DEFAULT_SST_MGR_RATE_BYTES_PER_SEC;
574 static unsigned long long rocksdb_delayed_write_rate;
575 static uint32_t rocksdb_max_latest_deadlocks = RDB_DEADLOCK_DETECT_DEPTH;
576 static unsigned long // NOLINT(runtime/int)
577 rocksdb_persistent_cache_size_mb = 0;
578 static uint64_t rocksdb_info_log_level = rocksdb::InfoLogLevel::ERROR_LEVEL;
579 static char *rocksdb_wal_dir = nullptr;
580 static char *rocksdb_persistent_cache_path = nullptr;
581 static uint64_t rocksdb_index_type =
582 rocksdb::BlockBasedTableOptions::kBinarySearch;
583 static uint32_t rocksdb_flush_log_at_trx_commit = 1;
584 static uint32_t rocksdb_debug_optimizer_n_rows = 0;
585 static my_bool rocksdb_force_compute_memtable_stats = TRUE;
586 static uint32_t rocksdb_force_compute_memtable_stats_cachetime =
587 RDB_DEFAULT_FORCE_COMPUTE_MEMTABLE_STATS_CACHETIME;
588 static my_bool rocksdb_debug_optimizer_no_zero_cardinality = TRUE;
589 static uint32_t rocksdb_wal_recovery_mode =
590 static_cast<uint32_t>(rocksdb::WALRecoveryMode::kPointInTimeRecovery);
591 static my_bool rocksdb_track_and_verify_wals_in_manifest = TRUE;
592 static uint32_t rocksdb_stats_level = 0;
593 static uint32_t rocksdb_access_hint_on_compaction_start =
594 rocksdb::Options::AccessHint::NORMAL;
595 static char *rocksdb_compact_cf_name = nullptr;
596 static char *rocksdb_delete_cf_name = nullptr;
597 static char *rocksdb_checkpoint_name = nullptr;
598 static char *rocksdb_block_cache_trace_options_str = nullptr;
599 static char *rocksdb_trace_options_str = nullptr;
600 static my_bool rocksdb_signal_drop_index_thread = FALSE;
601 static my_bool rocksdb_strict_collation_check = TRUE;
602 static my_bool rocksdb_ignore_unknown_options = TRUE;
603 static char *rocksdb_strict_collation_exceptions = nullptr;
604 static my_bool rocksdb_collect_sst_properties = TRUE;
605 static my_bool rocksdb_force_flush_memtable_now_var = FALSE;
606 static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = FALSE;
607 static my_bool rocksdb_enable_native_partition = FALSE;
608 static my_bool rocksdb_enable_ttl = TRUE;
609 static my_bool rocksdb_enable_ttl_read_filtering = TRUE;
610 static int rocksdb_debug_ttl_rec_ts = 0;
611 static int rocksdb_debug_ttl_snapshot_ts = 0;
612 static int rocksdb_debug_ttl_read_filter_ts = 0;
613 static my_bool rocksdb_debug_ttl_ignore_pk = FALSE;
614 static my_bool rocksdb_reset_stats = FALSE;
615 static uint32_t rocksdb_seconds_between_stat_computes = 3600;
616 static long long rocksdb_compaction_sequential_deletes = 0l;
617 static long long rocksdb_compaction_sequential_deletes_window = 0l;
618 static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
619 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
620 static uint32_t rocksdb_validate_tables = 1;
621 #endif // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
622 // ROCKSDB_INCLUDE_VALIDATE_TABLES
623 static char *rocksdb_datadir = nullptr;
624 static uint32_t rocksdb_max_bottom_pri_background_compactions = 0;
625 static uint32_t rocksdb_table_stats_sampling_pct =
626 RDB_DEFAULT_TBL_STATS_SAMPLE_PCT;
627 static uint32_t rocksdb_table_stats_recalc_threshold_pct = 10;
628 static unsigned long long rocksdb_table_stats_recalc_threshold_count = 100ul;
629 static my_bool rocksdb_table_stats_use_table_scan = 0;
630 static int32_t rocksdb_table_stats_background_thread_nice_value =
631 THREAD_PRIO_MAX;
632 static unsigned long long rocksdb_table_stats_max_num_rows_scanned = 0ul;
633 static my_bool rocksdb_enable_bulk_load_api = TRUE;
634 static my_bool rocksdb_enable_remove_orphaned_dropped_cfs = TRUE;
635 static my_bool rpl_skip_tx_api_var = FALSE;
636 static my_bool rocksdb_print_snapshot_conflict_queries = FALSE;
637 static my_bool rocksdb_large_prefix = FALSE;
638 static my_bool rocksdb_allow_to_start_after_corruption = FALSE;
639 static uint64_t rocksdb_write_policy =
640 rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;
641 char *rocksdb_read_free_rpl_tables;
642 ulong rocksdb_max_row_locks;
643 std::mutex rocksdb_read_free_rpl_tables_mutex;
644 #if defined(HAVE_PSI_INTERFACE)
645 Regex rdb_read_free_regex_handler(key_rwlock_read_free_rpl_tables);
646 #else
647 Regex rdb_read_free_regex_handler;
648 #endif
649 enum read_free_rpl_type { OFF = 0, PK_ONLY, PK_SK };
650 static uint64_t rocksdb_read_free_rpl = read_free_rpl_type::OFF;
651 static my_bool rocksdb_error_on_suboptimal_collation = FALSE;
652 static uint32_t rocksdb_stats_recalc_rate = 0;
653 static my_bool rocksdb_no_create_column_family = FALSE;
654 static uint32_t rocksdb_debug_manual_compaction_delay = 0;
655 static uint32_t rocksdb_max_manual_compactions = 0;
656 static my_bool rocksdb_rollback_on_timeout = FALSE;
657 static my_bool rocksdb_enable_insert_with_update_caching = TRUE;
658 static my_bool rocksdb_skip_locks_if_skip_unique_check = FALSE;
659 static my_bool rocksdb_alter_column_default_inplace = FALSE;
660 std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
661 std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
662 std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
663 std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
664 std::atomic<uint64_t> rocksdb_manual_compactions_processed(0);
665 std::atomic<uint64_t> rocksdb_manual_compactions_running(0);
666 #ifndef NDEBUG
667 std::atomic<uint64_t> rocksdb_num_get_for_update_calls(0);
668 #endif
669
// Validation callback for the block-cache-access tracing sysvar; forwards
// to rocksdb_tracing() with block cache tracing enabled.
static int rocksdb_trace_block_cache_access(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var, void *const save,
    struct st_mysql_value *const value) {
  return rocksdb_tracing(thd, var, save, value,
                         /* trace_block_cache_access = */ true);
}
677
// Validation callback for the query tracing sysvar; forwards to
// rocksdb_tracing() with block cache tracing disabled (query trace).
static int rocksdb_trace_queries(THD *const thd MY_ATTRIBUTE((__unused__)),
                                 struct st_mysql_sys_var *const var,
                                 void *const save,
                                 struct st_mysql_value *const value) {
  return rocksdb_tracing(thd, var, save, value,
                         /* trace_block_cache_access = */ false);
}
685
rocksdb_tracing(THD * const thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * const var,void * const save,struct st_mysql_value * const value,bool trace_block_cache_access)686 static int rocksdb_tracing(THD *const thd MY_ATTRIBUTE((__unused__)),
687 struct st_mysql_sys_var *const var, void *const save,
688 struct st_mysql_value *const value,
689 bool trace_block_cache_access) {
690 std::string trace_folder =
691 trace_block_cache_access ? "/block_cache_traces" : "/queries_traces";
692 int len = 0;
693 const char *const trace_opt_str_raw = value->val_str(value, nullptr, &len);
694 if (trace_opt_str_raw == nullptr) {
695 return HA_EXIT_FAILURE;
696 }
697
698 rocksdb::Status s;
699 if (rdb == nullptr) {
700 return HA_EXIT_FAILURE;
701 }
702 int rc __attribute__((__unused__));
703 std::string trace_opt_str(trace_opt_str_raw);
704 if (trace_opt_str.empty()) {
705 // End tracing block cache accesses or queries.
706 // NO_LINT_DEBUG
707 sql_print_information(
708 "RocksDB: Stop tracing block cache accesses or queries.\n");
709 s = trace_block_cache_access ? rdb->EndBlockCacheTrace() : rdb->EndTrace();
710
711 if (!s.ok()) {
712 rc = ha_rocksdb::rdb_error_to_mysql(s);
713 return HA_EXIT_FAILURE;
714 }
715 *static_cast<const char **>(save) = trace_opt_str_raw;
716 return HA_EXIT_SUCCESS;
717 }
718
719 // Start tracing block cache accesses or queries.
720 std::stringstream ss(trace_opt_str);
721 std::vector<std::string> trace_opts_strs;
722 while (ss.good()) {
723 std::string substr;
724 getline(ss, substr, ':');
725 trace_opts_strs.push_back(substr);
726 }
727 rocksdb::TraceOptions trace_opt;
728 try {
729 if (trace_opts_strs.size() != 3) {
730 throw std::invalid_argument("Incorrect number of arguments.");
731 }
732 trace_opt.sampling_frequency = std::stoull(trace_opts_strs[0]);
733 trace_opt.max_trace_file_size = std::stoull(trace_opts_strs[1]);
734 } catch (const std::exception &e) {
735 // NO_LINT_DEBUG
736 sql_print_information(
737 "RocksDB: Failed to parse trace option string: %s. The correct "
738 "format is sampling_frequency:max_trace_file_size:trace_file_name. "
739 "sampling_frequency and max_trace_file_size are positive integers. "
740 "The block accesses or queries are saved to the "
741 "rocksdb_datadir%s/trace_file_name.\n",
742 trace_opt_str.c_str(), trace_folder.c_str());
743 return HA_EXIT_FAILURE;
744 }
745 const std::string &trace_file_name = trace_opts_strs[2];
746 if (trace_file_name.find("/") != std::string::npos) {
747 // NO_LINT_DEBUG
748 sql_print_information(
749 "RocksDB: Start tracing failed (trace option string: %s). The file "
750 "name contains directory separator.\n",
751 trace_opt_str.c_str());
752 return HA_EXIT_FAILURE;
753 }
754 const std::string trace_dir = std::string(rocksdb_datadir) + trace_folder;
755 s = rdb->GetEnv()->CreateDirIfMissing(trace_dir);
756 if (!s.ok()) {
757 // NO_LINT_DEBUG
758 sql_print_information(
759 "RocksDB: Start tracing failed (trace option string: %s). Failed to "
760 "create the trace directory %s: %s\n",
761 trace_opt_str.c_str(), trace_dir.c_str(), s.ToString().c_str());
762 return HA_EXIT_FAILURE;
763 }
764 const std::string trace_file_path = trace_dir + "/" + trace_file_name;
765 s = rdb->GetEnv()->FileExists(trace_file_path);
766 if (s.ok() || !s.IsNotFound()) {
767 // NO_LINT_DEBUG
768 sql_print_information(
769 "RocksDB: Start tracing failed (trace option string: %s). The trace "
770 "file either already exists or we encountered an error "
771 "when calling rdb->GetEnv()->FileExists. The returned status string "
772 "is: %s\n",
773 trace_opt_str.c_str(), s.ToString().c_str());
774 return HA_EXIT_FAILURE;
775 }
776 std::unique_ptr<rocksdb::TraceWriter> trace_writer;
777 const rocksdb::EnvOptions env_option(rdb->GetDBOptions());
778 s = rocksdb::NewFileTraceWriter(rdb->GetEnv(), env_option, trace_file_path,
779 &trace_writer);
780 if (!s.ok()) {
781 rc = ha_rocksdb::rdb_error_to_mysql(s);
782 return HA_EXIT_FAILURE;
783 }
784 if (trace_block_cache_access) {
785 s = rdb->StartBlockCacheTrace(trace_opt, std::move(trace_writer));
786 } else {
787 s = rdb->StartTrace(trace_opt, std::move(trace_writer));
788 }
789 if (!s.ok()) {
790 rc = ha_rocksdb::rdb_error_to_mysql(s);
791 return HA_EXIT_FAILURE;
792 }
793 // NO_LINT_DEBUG
794 sql_print_information(
795 "RocksDB: Start tracing block cache accesses or queries. Sampling "
796 "frequency: %lu, "
797 "Maximum trace file size: %lu, Trace file path %s.\n",
798 trace_opt.sampling_frequency, trace_opt.max_trace_file_size,
799 trace_file_path.c_str());
800 // Save the trace option.
801 *static_cast<const char **>(save) = trace_opt_str_raw;
802 return HA_EXIT_SUCCESS;
803 }
804
rdb_init_rocksdb_db_options(void)805 static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
806 auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());
807
808 o->create_if_missing = true;
809 o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
810 o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
811 o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
812 o->max_open_files = -2; // auto-tune to 50% open_files_limit
813
814 o->two_write_queues = true;
815 o->manual_wal_flush = true;
816 return o;
817 }
818
/* DBOptions contains Statistics and needs to be destructed last */
static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options =
    std::unique_ptr<rocksdb::BlockBasedTableOptions>(
        new rocksdb::BlockBasedTableOptions());
static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options =
    rdb_init_rocksdb_db_options();

// Plugin-wide rate limiter. NOTE(review): presumably installed into the
// DBOptions at startup when rate_limiter_bytes_per_sec > 0 -- the setup
// site is outside this chunk, confirm there.
static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;

/* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */
static const char *write_policy_names[] = {"write_committed", "write_prepared",
                                           "write_unprepared", NullS};

// TYPELIB element count excludes the trailing NullS terminator.
static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1,
                                       "write_policy_typelib",
                                       write_policy_names, nullptr};

/* This array needs to be kept up to date with myrocks::read_free_rpl_type */
static const char *read_free_rpl_names[] = {"OFF", "PK_ONLY", "PK_SK", NullS};

static TYPELIB read_free_rpl_typelib = {array_elements(read_free_rpl_names) - 1,
                                        "read_free_rpl_typelib",
                                        read_free_rpl_names, nullptr};

/* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
static const char *info_log_level_names[] = {"debug_level", "info_level",
                                             "warn_level", "error_level",
                                             "fatal_level", NullS};

static TYPELIB info_log_level_typelib = {
    array_elements(info_log_level_names) - 1, "info_log_level_typelib",
    info_log_level_names, nullptr};

/* This enum needs to be kept up to date with rocksdb::BottommostLevelCompaction
 */
static const char *bottommost_level_compaction_names[] = {
    "kSkip", "kIfHaveCompactionFilter", "kForce", "kForceOptimized", NullS};

static TYPELIB bottommost_level_compaction_typelib = {
    array_elements(bottommost_level_compaction_names) - 1,
    "bottommost_level_compaction_typelib", bottommost_level_compaction_names,
    nullptr};
861
rocksdb_set_rocksdb_info_log_level(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)862 static void rocksdb_set_rocksdb_info_log_level(
863 THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
864 const void *const save) {
865 assert(save != nullptr);
866
867 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
868 rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
869 rocksdb_db_options->info_log->SetInfoLogLevel(
870 static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
871 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
872 }
873
rocksdb_set_rocksdb_stats_level(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)874 static void rocksdb_set_rocksdb_stats_level(THD *const thd,
875 struct st_mysql_sys_var *const var,
876 void *const var_ptr,
877 const void *const save) {
878 assert(save != nullptr);
879
880 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
881 rocksdb_db_options->statistics->set_stats_level(
882 static_cast<rocksdb::StatsLevel>(*static_cast<const uint64_t *>(save)));
883 // Actual stats level is defined at rocksdb dbopt::statistics::stats_level_
884 // so adjusting rocksdb_stats_level here to make sure it points to
885 // the correct stats level.
886 rocksdb_stats_level = rocksdb_db_options->statistics->get_stats_level();
887 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
888 }
889
/*
  Update hook for rocksdb_reset_stats. Setting the variable to TRUE resets
  both the DB's internal stats (rdb->ResetStats) and the statistics registry
  (rocksdb_stats->Reset).

  NOTE(review): the if() below reads the global rocksdb_reset_stats right
  after *var_ptr is assigned -- presumably var_ptr points at that global;
  confirm at the sysvar definition.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const /* unused */, void *const var_ptr,
    const void *const save) {
  assert(save != nullptr);
  assert(rdb != nullptr);
  assert(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // Publish the new value of the sysvar.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    assert(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    assert(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
915
// Legal values for the rocksdb_flush_log_at_trx_commit sysvar.
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,   // never flush the WAL on commit
  FLUSH_LOG_SYNC,        // sync the WAL on every commit
  FLUSH_LOG_BACKGROUND,  // flush the WAL in the background
  FLUSH_LOG_MAX /* must be last */
};
922
rocksdb_validate_flush_log_at_trx_commit(THD * const thd,struct st_mysql_sys_var * const var,void * var_ptr,struct st_mysql_value * const value)923 static int rocksdb_validate_flush_log_at_trx_commit(
924 THD *const thd,
925 struct st_mysql_sys_var *const var, /* in: pointer to system variable */
926 void *var_ptr, /* out: immediate result for update function */
927 struct st_mysql_value *const value /* in: incoming value */) {
928 long long new_value;
929
930 /* value is NULL */
931 if (value->val_int(value, &new_value)) {
932 return HA_EXIT_FAILURE;
933 }
934
935 if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) {
936 return HA_EXIT_FAILURE;
937 }
938
939 *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value);
940 return HA_EXIT_SUCCESS;
941 }
// Intentionally empty update hook for the compact_cf sysvar: presumably the
// compaction itself is triggered from rocksdb_compact_column_family (declared
// below, used as the check hook) -- confirm at the sysvar definition.
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}

static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value);

// Allowed values for BlockBasedTableOptions::index_type.
static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};

static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
                                     "index_type_typelib", index_type_names,
                                     nullptr};
956
// ---------------------------------------------------------------------------
// Session (MYSQL_THDVAR_*) and global (MYSQL_SYSVAR_*) variable definitions.
// Per the MySQL plugin API, the arguments after the help string are:
// check function, update function, default [, min, max, blocksize].
// ---------------------------------------------------------------------------

// TODO: 0 means don't wait at all, and we don't support it yet?
static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
                          "Number of seconds to wait for lock", nullptr,
                          nullptr, /*default*/ 1, /*min*/ 1,
                          /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);

static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
                         "Enables deadlock detection", nullptr, nullptr, false);

static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG,
                          "Number of transactions deadlock detection will "
                          "traverse through before assuming deadlock",
                          nullptr, nullptr,
                          /*default*/ RDB_DEADLOCK_DETECT_DEPTH,
                          /*min*/ 2,
                          /*max*/ ULONG_MAX, 0);

static MYSQL_THDVAR_BOOL(
    commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG,
    "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr,
    nullptr, false);

static MYSQL_THDVAR_BOOL(
    trace_sst_api, PLUGIN_VAR_RQCMDARG,
    "Generate trace output in the log for each call to the SstFileWriter",
    nullptr, nullptr, false);

static MYSQL_THDVAR_BOOL(
    bulk_load, PLUGIN_VAR_RQCMDARG,
    "Use bulk-load mode for inserts. This disables "
    "unique_checks and enables rocksdb_commit_in_the_middle.",
    rocksdb_check_bulk_load, nullptr, false);

static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG,
                         "Allow bulk loading of sk keys during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         /* Intentionally reuse unsorted's check function */
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         false);

static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG,
                         "Allow unsorted input during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         false);

static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables using SstFileWriter for bulk loading",
                         nullptr, nullptr, rocksdb_enable_bulk_load_api);

// The second macro argument binds the sysvar storage directly to the
// DBOptions member (my_bool and bool are assumed layout-compatible here).
static MYSQL_SYSVAR_BOOL(
    enable_pipelined_write,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_pipelined_write),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_pipelined_write for RocksDB", nullptr, nullptr,
    rocksdb_db_options->enable_pipelined_write);

static MYSQL_SYSVAR_BOOL(enable_remove_orphaned_dropped_cfs,
                         rocksdb_enable_remove_orphaned_dropped_cfs,
                         PLUGIN_VAR_RQCMDARG,
                         "Enables removing dropped cfs from metadata if it "
                         "doesn't exist in cf manager",
                         nullptr, nullptr,
                         rocksdb_enable_remove_orphaned_dropped_cfs);

static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Directory for temporary files during DDL operations.",
                        nullptr, nullptr, "");

static MYSQL_THDVAR_BOOL(
    commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
    "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
    "update and delete",
    nullptr, nullptr, false);
1032
// Read-free replication (RFR) support -- compiled in only when requested.
#if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR

static MYSQL_THDVAR_BOOL(
    blind_delete_primary_key, PLUGIN_VAR_RQCMDARG,
    "Deleting rows by primary key lookup, without reading rows (Blind Deletes)."
    " Blind delete is disabled if the table has secondary key",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    enable_iterate_bounds, PLUGIN_VAR_OPCMDARG,
    "Enable rocksdb iterator upper/lower bounds in read options.", nullptr,
    nullptr, TRUE);

// Default pattern for rocksdb_read_free_rpl_tables: match every table.
static const char *DEFAULT_READ_FREE_RPL_TABLES = ".*";
1047
get_regex_flags()1048 static int get_regex_flags() {
1049 int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
1050 if (lower_case_table_names) flags |= MY_REG_ICASE;
1051 return flags;
1052 }
1053
rocksdb_validate_read_free_rpl_tables(THD * thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * var MY_ATTRIBUTE ((__unused__)),void * save,struct st_mysql_value * value)1054 static int rocksdb_validate_read_free_rpl_tables(
1055 THD *thd MY_ATTRIBUTE((__unused__)),
1056 struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
1057 struct st_mysql_value *value) {
1058 char buff[STRING_BUFFER_USUAL_SIZE];
1059 int length = sizeof(buff);
1060 const char *wlist_buf = value->val_str(value, buff, &length);
1061 if (wlist_buf)
1062 wlist_buf = thd->strmake(wlist_buf, length); // make a temp copy
1063 const auto wlist = wlist_buf ? wlist_buf : DEFAULT_READ_FREE_RPL_TABLES;
1064
1065 #if defined(HAVE_PSI_INTERFACE)
1066 Regex regex_handler(key_rwlock_read_free_rpl_tables);
1067 #else
1068 Regex regex_handler;
1069 #endif
1070
1071 if (!regex_handler.compile(wlist, get_regex_flags(), table_alias_charset)) {
1072 warn_about_bad_patterns(regex_handler, "rocksdb_read_free_rpl_tables");
1073 return HA_EXIT_FAILURE;
1074 }
1075
1076 *static_cast<const char **>(save) = wlist;
1077 return HA_EXIT_SUCCESS;
1078 }
1079
rocksdb_update_read_free_rpl_tables(THD * thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * var MY_ATTRIBUTE ((__unused__)),void * var_ptr,const void * save)1080 static void rocksdb_update_read_free_rpl_tables(
1081 THD *thd MY_ATTRIBUTE((__unused__)),
1082 struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr,
1083 const void *save) {
1084 const auto wlist = *static_cast<const char *const *>(save);
1085 assert(wlist != nullptr);
1086
1087 // This is bound to succeed since we've already checked for bad patterns in
1088 // rocksdb_validate_read_free_rpl_tables
1089 rdb_read_free_regex_handler.compile(wlist, get_regex_flags(),
1090 table_alias_charset);
1091
1092 // update all table defs
1093 struct Rdb_read_free_rpl_updater : public Rdb_tables_scanner {
1094 int add_table(Rdb_tbl_def *tdef) override {
1095 tdef->check_and_set_read_free_rpl_table();
1096 return HA_EXIT_SUCCESS;
1097 }
1098 } updater;
1099 ddl_manager.scan_for_tables(&updater);
1100
1101 *static_cast<const char **>(var_ptr) = *static_cast<char *const *>(save);
1102 }
1103
rocksdb_set_max_bottom_pri_background_compactions_internal(uint val)1104 static void rocksdb_set_max_bottom_pri_background_compactions_internal(
1105 uint val) {
1106 // Set lower priority for compactions
1107 if (val > 0) {
1108 // This creates background threads in rocksdb with BOTTOM priority pool.
1109 // Compactions for bottommost level use threads in the BOTTOM pool, and
1110 // the threads in the BOTTOM pool run with lower OS priority (19 in Linux).
1111 rdb->GetEnv()->SetBackgroundThreads(val, rocksdb::Env::Priority::BOTTOM);
1112 rdb->GetEnv()->LowerThreadPoolCPUPriority(rocksdb::Env::Priority::BOTTOM);
1113 sql_print_information(
1114 "Set %d compaction thread(s) with "
1115 "lower scheduling priority.",
1116 val);
1117 }
1118 }
1119
// Global RFR sysvars; check/update hooks are defined above.
static MYSQL_SYSVAR_STR(
    read_free_rpl_tables, rocksdb_read_free_rpl_tables,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Regex that describes set of tables that will use read-free replication "
    "on the slave (i.e. not lookup a row during replication)",
    rocksdb_validate_read_free_rpl_tables, rocksdb_update_read_free_rpl_tables,
    DEFAULT_READ_FREE_RPL_TABLES);

static MYSQL_SYSVAR_ENUM(
    read_free_rpl, rocksdb_read_free_rpl,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Use read-free replication on the slave (i.e. no row lookup during "
    "replication). Default is OFF, PK_SK will enable it on all tables with "
    "primary key. PK_ONLY will enable it on tables where the only key is the "
    "primary key (i.e. no secondary keys).",
    nullptr, nullptr, read_free_rpl_type::OFF, &read_free_rpl_typelib);
#endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
1137
static MYSQL_SYSVAR_BOOL(
    rpl_skip_tx_api, rpl_skip_tx_api_var, PLUGIN_VAR_RQCMDARG,
    "Use write batches for replication thread instead of tx api", nullptr,
    nullptr, false);

static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
                         "Skip using bloom filter for reads", nullptr, nullptr,
                         false);

static MYSQL_SYSVAR_ULONG(max_row_locks, rocksdb_max_row_locks,
                          PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have",
                          nullptr, nullptr,
                          /*default*/ RDB_DEFAULT_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_ULONGLONG(
    write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
    "Maximum size of write batch in bytes. 0 means no limit.", nullptr, nullptr,
    /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    write_batch_flush_threshold, PLUGIN_VAR_RQCMDARG,
    "Maximum size of write batch in bytes before flushing. Only valid if "
    "rocksdb_write_policy is WRITE_UNPREPARED. 0 means no limit.",
    nullptr, nullptr, /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated", nullptr,
    nullptr, false);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode", nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation.",
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of "
    "external sort during fast index creation.",
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
    "Fast index creation creates a large tmp file on disk during index "
    "creation. Removing this large file all at once when index creation is "
    "complete can cause trim stalls on Flash. This variable specifies a "
    "duration to sleep (in milliseconds) between calling chsize() to truncate "
    "the file in chunks. The chunk size is the same as merge_buf_size.",
    nullptr, nullptr,
    /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_INT(
    manual_compaction_threads, PLUGIN_VAR_RQCMDARG,
    "How many rocksdb threads to run for manual compactions", nullptr, nullptr,
    /* default rocksdb.dboption max_subcompactions */ 0,
    /* min */ 0, /* max */ 128, 0);

static MYSQL_THDVAR_ENUM(
    manual_compaction_bottommost_level, PLUGIN_VAR_RQCMDARG,
    "Option for bottommost level compaction during manual "
    "compaction",
    nullptr, nullptr,
    /* default */
    (ulong)rocksdb::BottommostLevelCompaction::kForceOptimized,
    &bottommost_level_compaction_typelib);

// Sysvar storage aliases the DBOptions member via reinterpret_cast; the
// my_bool/bool layout-compatibility assumption is pervasive in this file.
static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_if_missing);
1230
concurrent_prepare_update(THD * thd,st_mysql_sys_var * var,void * var_ptr,const void * save)1231 static void concurrent_prepare_update(THD *thd, st_mysql_sys_var *var,
1232 void *var_ptr, const void *save) {
1233 push_warning(thd, Sql_condition::SL_WARNING, HA_ERR_WRONG_COMMAND,
1234 "Using rocksdb_concurrent_prepare is deprecated and the "
1235 "parameter may be removed in future releases.");
1236 }
1237
// Deprecated alias: shares storage with two_write_queues below; assigning it
// only emits a warning via concurrent_prepare_update.
static MYSQL_SYSVAR_BOOL(
    concurrent_prepare,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DEPRECATED, use rocksdb_two_write_queries instead.", nullptr,
    concurrent_prepare_update, rocksdb_db_options->two_write_queues);

static MYSQL_SYSVAR_BOOL(
    two_write_queues,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::two_write_queues for RocksDB", nullptr, nullptr,
    rocksdb_db_options->two_write_queues);

static MYSQL_SYSVAR_BOOL(
    manual_wal_flush,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr,
    rocksdb_db_options->manual_wal_flush);

static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::write_policy for RocksDB", nullptr,
                         nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED,
                         &write_policy_typelib);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
    rocksdb_db_options->error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
    rocksdb_db_options->paranoid_checks);

static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ULONGLONG(
    sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB", nullptr,
    rocksdb_set_sst_mgr_rate_bytes_per_sec,
    /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
    /* min */ 0L, /* max */ UINT64_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
                              PLUGIN_VAR_RQCMDARG,
                              "DBOptions::delayed_write_rate", nullptr,
                              rocksdb_set_delayed_write_rate,
                              rocksdb_db_options->delayed_write_rate, 0,
                              UINT64_MAX, 0);

static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
                         PLUGIN_VAR_RQCMDARG,
                         "Maximum number of recent "
                         "deadlocks to store",
                         nullptr, rocksdb_set_max_latest_deadlocks,
                         rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
1314
1315 static MYSQL_SYSVAR_ENUM(
1316 info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
1317 "Filter level for info logs to be written mysqld error log. "
1318 "Valid values include 'debug_level', 'info_level', 'warn_level'"
1319 "'error_level' and 'fatal_level'.",
1320 nullptr, rocksdb_set_rocksdb_info_log_level,
1321 rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);
1322
static MYSQL_THDVAR_INT(
    perf_context_level, PLUGIN_VAR_RQCMDARG,
    "Perf Context Level for rocksdb internal timer stat collection", nullptr,
    nullptr,
    /* default */ rocksdb::PerfLevel::kUninitialized,
    /* min */ rocksdb::PerfLevel::kUninitialized,
    /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

static MYSQL_SYSVAR_UINT(
    wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
    "DBOptions::wal_recovery_mode for RocksDB. Default is kPointInTimeRecovery",
    nullptr, nullptr,
    /* default */ (uint)rocksdb::WALRecoveryMode::kPointInTimeRecovery,
    /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
    /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);

static MYSQL_SYSVAR_BOOL(
    track_and_verify_wals_in_manifest,
    *reinterpret_cast<my_bool *>(&rocksdb_track_and_verify_wals_in_manifest),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::track_and_verify_wals_in_manifest for RocksDB", nullptr,
    nullptr, true);

// The update hook reads the applied level back from the statistics object,
// so the stored value always reflects what RocksDB accepted.
static MYSQL_SYSVAR_UINT(
    stats_level, rocksdb_stats_level, PLUGIN_VAR_RQCMDARG,
    "Statistics Level for RocksDB. Default is 1 (kExceptHistogramOrTimers)",
    nullptr, rocksdb_set_rocksdb_stats_level,
    /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
    /* min */ (uint)rocksdb::StatsLevel::kExceptTickers,
    /* max */ (uint)rocksdb::StatsLevel::kAll, 0);
1353
static MYSQL_SYSVAR_ULONG(compaction_readahead_size,
                          rocksdb_db_options->compaction_readahead_size,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::compaction_readahead_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->compaction_readahead_size,
                          /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    new_table_reader_for_compaction_inputs,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->new_table_reader_for_compaction_inputs),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
    nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);

static MYSQL_SYSVAR_UINT(
    access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
    /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
    /* min */ (uint)rocksdb::Options::AccessHint::NONE,
    /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);

static MYSQL_SYSVAR_BOOL(
    allow_concurrent_memtable_write,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->allow_concurrent_memtable_write),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(
    enable_write_thread_adaptive_yield,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->enable_write_thread_adaptive_yield),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
    nullptr, false);

// -2 is a sentinel: see rdb_init_rocksdb_db_options -- auto-tune to 50% of
// the open_files_limit.
static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_open_files for RocksDB", nullptr,
                        nullptr, rocksdb_db_options->max_open_files,
                        /* min */ -2, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_total_wal_size,
                          rocksdb_db_options->max_total_wal_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_total_wal_size for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->max_total_wal_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_fsync);

static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
                        rocksdb_db_options->wal_dir.c_str());

static MYSQL_SYSVAR_STR(
    persistent_cache_path, rocksdb_persistent_cache_path,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
    nullptr, "");

static MYSQL_SYSVAR_ULONG(
    persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
    "for RocksDB",
    nullptr, nullptr, rocksdb_persistent_cache_size_mb,
    /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(
    delete_obsolete_files_period_micros,
    rocksdb_db_options->delete_obsolete_files_period_micros,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
    nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
    /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        PLUGIN_VAR_RQCMDARG,
                        "DBOptions::max_background_jobs for RocksDB", nullptr,
                        rocksdb_set_max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);
1447
1448 static MYSQL_SYSVAR_INT(max_background_flushes,
1449 rocksdb_db_options->max_background_flushes,
1450 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1451 "DBOptions::max_background_flushes for RocksDB", nullptr,
1452 nullptr,
1453 rocksdb_db_options->max_background_flushes,
1454 /* min */ -1, /* max */ 64, 0);
1455
1456 static MYSQL_SYSVAR_INT(max_background_compactions,
1457 rocksdb_db_options->max_background_compactions,
1458 PLUGIN_VAR_RQCMDARG,
1459 "DBOptions::max_background_compactions for RocksDB", nullptr,
1460 rocksdb_set_max_background_compactions,
1461 rocksdb_db_options->max_background_compactions,
1462 /* min */ -1, /* max */ 64, 0);
1463
1464 static MYSQL_SYSVAR_UINT(
1465 max_bottom_pri_background_compactions,
1466 rocksdb_max_bottom_pri_background_compactions, PLUGIN_VAR_RQCMDARG,
1467 "Creating specified number of threads, setting lower "
1468 "CPU priority, and letting Lmax compactions use them. "
1469 "Maximum total compaction concurrency continues to be capped to "
1470 "rocksdb_max_background_compactions or "
1471 "rocksdb_max_background_jobs. In addition to that, Lmax "
1472 "compaction concurrency is capped to "
1473 "rocksdb_max_bottom_pri_background_compactions. Default value is 0, "
1474 "which means all compactions are under concurrency of "
1475 "rocksdb_max_background_compactions|jobs. If you set very low "
1476 "rocksdb_max_bottom_pri_background_compactions (e.g. 1 or 2), compactions "
1477 "may not be able to keep up. Since Lmax normally has "
1478 "90 percent of data, it is recommended to set closer number to "
1479 "rocksdb_max_background_compactions|jobs. This option is helpful to "
1480 "give more CPU resources to other threads (e.g. query processing).",
1481 rocksdb_validate_max_bottom_pri_background_compactions, nullptr, 0,
1482 /* min */ 0, /* max */ ROCKSDB_MAX_BOTTOM_PRI_BACKGROUND_COMPACTIONS, 0);
1483
/*
  Read-only sysvars mapped directly onto rocksdb_db_options fields.
  The default passed to each macro is the value RocksDB itself initialized
  the option struct with, so the server default tracks the linked RocksDB
  version rather than a hard-coded number.
  NOTE(review): most ULONG vars cap max at LONG_MAX while
  max_manifest_file_size uses ULONG_MAX — presumably intentional, but
  worth confirming against the sysvar framework's range handling.
*/
static MYSQL_SYSVAR_UINT(max_subcompactions,
                         rocksdb_db_options->max_subcompactions,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::max_subcompactions for RocksDB", nullptr,
                         nullptr, rocksdb_db_options->max_subcompactions,
                         /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

static MYSQL_SYSVAR_ULONG(max_log_file_size,
                          rocksdb_db_options->max_log_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_log_file_size for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->max_log_file_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(log_file_time_to_roll,
                          rocksdb_db_options->log_file_time_to_roll,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::log_file_time_to_roll for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->log_file_time_to_roll,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(keep_log_file_num,
                          rocksdb_db_options->keep_log_file_num,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::keep_log_file_num for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->keep_log_file_num,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_manifest_file_size,
                          rocksdb_db_options->max_manifest_file_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::max_manifest_file_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->max_manifest_file_size,
                          /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options->table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options->table_cache_numshardbits,
                        /* min */ 0, /* max */ 19, 0);

/* Note the RocksDB member names use the WAL_ capitalization. */
static MYSQL_SYSVAR_ULONG(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->WAL_ttl_seconds,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_size_limit_mb,
                          rocksdb_db_options->WAL_size_limit_MB,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
                          nullptr, rocksdb_db_options->WAL_size_limit_MB,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(manifest_preallocation_size,
                          rocksdb_db_options->manifest_preallocation_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::manifest_preallocation_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->manifest_preallocation_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);
1549
/*
  Boolean DBOptions sysvars.  The sysvar framework stores my_bool, while the
  RocksDB option struct stores bool, so each var aliases the RocksDB field
  through reinterpret_cast<my_bool *>.
  NOTE(review): this assumes my_bool and bool have the same size and
  representation — TODO confirm for all supported platforms/toolchains.
*/
static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_io_for_flush_and_compaction,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->use_direct_io_for_flush_and_compaction),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB", nullptr,
    nullptr, rocksdb_db_options->use_direct_io_for_flush_and_compaction);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
    rocksdb_db_options->is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options->stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB",
                         nullptr, nullptr,
                         rocksdb_db_options->stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
    rocksdb_db_options->advise_random_on_open);

static MYSQL_SYSVAR_ULONG(db_write_buffer_size,
                          rocksdb_db_options->db_write_buffer_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "DBOptions::db_write_buffer_size for RocksDB",
                          nullptr, nullptr,
                          rocksdb_db_options->db_write_buffer_size,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_adaptive_mutex);

/* Dynamic (no READONLY): changes are pushed to RocksDB via the update
   callbacks rocksdb_set_bytes_per_sync / rocksdb_set_wal_bytes_per_sync. */
static MYSQL_SYSVAR_ULONG(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::bytes_per_sync for RocksDB", nullptr,
                          rocksdb_set_bytes_per_sync,
                          rocksdb_db_options->bytes_per_sync,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_bytes_per_sync,
                          rocksdb_db_options->wal_bytes_per_sync,
                          PLUGIN_VAR_RQCMDARG,
                          "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
                          rocksdb_set_wal_bytes_per_sync,
                          rocksdb_db_options->wal_bytes_per_sync,
                          /* min */ 0L, /* max */ LONG_MAX, 0);

/* Default is forced to true here, not RocksDB's own struct default. */
static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true);
1636
/* Block cache sizing.  block_cache_size is dynamic and goes through a
   validate callback; the last macro argument is the block (granularity)
   size the framework rounds values to. */
static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG,
                             "block_cache size for RocksDB",
                             rocksdb_validate_set_block_cache_size, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);

/* SimCache wrapper size; 0 (the default) disables the simulated cache. */
static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "Simulated cache size for RocksDB", nullptr,
                             nullptr,
                             /* default */ 0,
                             /* min */ 0,
                             /* max */ LLONG_MAX,
                             /* Block size */ 0);

static MYSQL_SYSVAR_BOOL(cache_dump, rocksdb_cache_dump,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Include RocksDB block cache content in core dump.",
                         nullptr, nullptr, true);

/* Fraction of the block cache reserved for high-priority entries. */
static MYSQL_SYSVAR_DOUBLE(cache_high_pri_pool_ratio,
                           rocksdb_cache_high_pri_pool_ratio,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "Specify the size of block cache high-pri pool",
                           nullptr, nullptr, /* default */ 0.0, /* min */ 0.0,
                           /* max */ 1.0, 0);
1666
/* Table-option bools aliased via reinterpret_cast<my_bool *>, same caveat as
   the DBOptions bools above.  Defaults are forced to true here. */
static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
    nullptr, nullptr, true);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_with_high_priority,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks_with_high_priority),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "cache_index_and_filter_blocks_with_high_priority for RocksDB", nullptr,
    nullptr, true);
1682
// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index block's handle checked
// out (=won't call ShardedLRUCache::Release), plus the parsed-out objects
// the LRU cache will never flush out, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
/* See the rationale in the comment block immediately above. */
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);
1698
/* Remaining BlockBasedTableOptions sysvars; all read-only, defaults taken
   from the option struct RocksDB initialized. */
static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB",
                         nullptr, nullptr,
                         (uint64_t)rocksdb_tbl_options->index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->no_block_cache);

/* Minimum block size is 1KB. */
static MYSQL_SYSVAR_ULONG(block_size, rocksdb_tbl_options->block_size,
                          PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                          "BlockBasedTableOptions::block_size for RocksDB",
                          nullptr, nullptr, rocksdb_tbl_options->block_size,
                          /* min */ 1024L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options->block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options->block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->whole_key_filtering);
1747
/* Column-family option strings, parsed by RocksDB's option-string machinery.
   default_cf_options applies to every CF; override_cf_options is a per-CF
   override list; update_cf_options applies changes at runtime through its
   validate/update callbacks. */
static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr,
                        "compression=kLZ4Compression;"
                        "bottommost_compression=kLZ4Compression");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");

static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC |
                            PLUGIN_VAR_NOCMDOPT,
                        "Option updates per column family for RocksDB",
                        rocksdb_validate_update_cf_options,
                        rocksdb_set_update_cf_options, nullptr);

/* When on, secondary keys are placed in the default_sk column family. */
static MYSQL_SYSVAR_BOOL(use_default_sk_cf, rocksdb_use_default_sk_cf,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Use default_sk for secondary keys", nullptr, nullptr,
                         false);
1770
/* WAL sync policy at commit, analogous to innodb_flush_log_at_trx_commit. */
static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
                         rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
                         "Sync on transaction commit. Similar to "
                         "innodb_flush_log_at_trx_commit. 1: sync on commit, "
                         "0,2: not sync on commit",
                         rocksdb_validate_flush_log_at_trx_commit, nullptr,
                         /* default */ FLUSH_LOG_SYNC,
                         /* min */ FLUSH_LOG_NEVER,
                         /* max */ FLUSH_LOG_BACKGROUND, 0);

/* Per-session (THDVAR) write options mapped onto rocksdb::WriteOptions. */
static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB", nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests", nullptr,
                         nullptr, false);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency",
    nullptr, nullptr, false);

/* Test hooks: 0 means "do not override records_in_range()". */
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used.",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);
1810
/* Test-only hook: NOSYSVAR keeps it out of SHOW VARIABLES. */
static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable",
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
                         rocksdb_force_compute_memtable_stats,
                         PLUGIN_VAR_RQCMDARG,
                         "Force to always compute memtable stats", nullptr,
                         nullptr, true);

static MYSQL_SYSVAR_UINT(
    force_compute_memtable_stats_cachetime,
    rocksdb_force_compute_memtable_stats_cachetime, PLUGIN_VAR_RQCMDARG,
    "Time in usecs to cache memtable estimates", nullptr, nullptr,
    /* default */ RDB_DEFAULT_FORCE_COMPUTE_MEMTABLE_STATS_CACHETIME,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value", nullptr,
    nullptr, true);
1835
/* "Action" sysvars: SET-ting them triggers work in the validate/update
   callback (compaction, CF drop, checkpoint, thread wakeup) rather than
   storing a persistent setting. */
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family",
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(delete_cf, rocksdb_delete_cf_name, PLUGIN_VAR_RQCMDARG,
                        "Delete column family", rocksdb_delete_column_family,
                        rocksdb_delete_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread", nullptr,
                         rocksdb_drop_index_wakeup_thread, false);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations", nullptr,
                         rocksdb_set_pause_background_work, false);

static MYSQL_SYSVAR_BOOL(enable_native_partition,
                         rocksdb_enable_native_partition, PLUGIN_VAR_READONLY,
                         "Enable native partitioning", nullptr, nullptr, false);
1863
/* TTL behavior and its debug knobs.  The debug_ttl_* vars shift timestamps
   to simulate past/future records; all are no-ops in non-debug builds. */
static MYSQL_SYSVAR_BOOL(
    enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
    "Enable expired TTL records to be dropped during compaction.", nullptr,
    nullptr, true);

static MYSQL_SYSVAR_BOOL(
    enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
    PLUGIN_VAR_RQCMDARG,
    "For tables with TTL, expired records are skipped/filtered out during "
    "processing and in query results. Disabling this will allow these records "
    "to be seen, but as a result rows may disappear in the middle of "
    "transactions as they are dropped during compaction. Use with caution.",
    nullptr, nullptr, true);

static MYSQL_SYSVAR_INT(
    debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL of records to "
    "now() + debug_ttl_rec_ts. The value can be +/- to simulate "
    "a record inserted in the past vs a record inserted in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sets the snapshot during compaction to "
    "now() + debug_set_ttl_snapshot_ts. The value can be +/- to simulate "
    "a snapshot in the past vs a snapshot created in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL read filtering time to "
    "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
    "is not set. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_BOOL(
    debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. If true, compaction filtering will not occur "
    "on PK TTL data. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, false);

/* Caps concurrent+queued manual compactions (e.g. via compact_cf). */
static MYSQL_SYSVAR_UINT(
    max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG,
    "Maximum number of pending + ongoing number of manual compactions.",
    nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0);
1914
1915 static MYSQL_SYSVAR_BOOL(
1916 rollback_on_timeout, rocksdb_rollback_on_timeout, PLUGIN_VAR_OPCMDARG,
1917 "Whether to roll back the complete transaction or a single statement on "
1918 "lock wait timeout (a single statement by default)",
1919 NULL, NULL, FALSE);
1920
/* Debug-only sleep inserted into manual compactions to simulate long runs. */
static MYSQL_SYSVAR_UINT(
    debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sleeping specified seconds "
    "for simulating long running compactions.",
    nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0);

/* Action var: setting it triggers rocksdb_set_reset_stats. */
static MYSQL_SYSVAR_BOOL(
    reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
    "Reset the RocksDB internal statistics without restarting the DB.", nullptr,
    rocksdb_set_reset_stats, false);

static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Enable ignoring unknown options passed to RocksDB",
                         nullptr, nullptr, true);
1937
/* Collation enforcement: MyRocks indexes require memcmp-safe collations;
   these vars control how strictly that is enforced. */
static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes",
                         nullptr, nullptr, true);

/* Regex-based escape hatch for tables exempt from the check above. */
static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Regex that describes set of tables that are excluded "
                        "from the case sensitive collation enforcement",
                        nullptr, rocksdb_set_collation_exception_list, "");

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush",
                         nullptr, nullptr, rocksdb_collect_sst_properties);
1954
/* Action vars: flush memtables (optionally plus an L0 compaction) through
   their validate/update callbacks. */
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful",
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    false);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
    "Acts similar to force_flush_memtable_now, but also compacts all L0 files.",
    rocksdb_force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_stub, false);

static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed.",
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

/* Sequential-delete-triggered compaction: files containing many consecutive
   deletes (within a sliding window) are proactively compacted.  Changes are
   applied via rocksdb_set_compaction_options. */
static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window",
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes",
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

/* -1 appears to be a sentinel here despite the LONGLONG type — confirm
   against the consumer before changing the range. */
static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes", nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log", nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

/* Per-session debug checksum controls for row data. */
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed",
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG,
                         "Skipping holding any lock on row access. "
                         "Not effective on slave.",
                         nullptr, nullptr, false);
2032
#if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
/* Startup verification that the .frm data dictionary matches the RocksDB
   table set.  Fix: the help string had an unbalanced parenthesis — the
   closing ")" after "continue" was missing. */
static MYSQL_SYSVAR_UINT(
    validate_tables, rocksdb_validate_tables,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Verify all .frm files match all RocksDB tables (0 means no verification, "
    "1 means verify and fail on error, and 2 means verify but continue)",
    nullptr, nullptr, 1 /* default value */, 0 /* min value */,
    2 /* max value */, 0);
#endif  // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
        // ROCKSDB_INCLUDE_VALIDATE_TABLES
2043
static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./.rocksdb");

/* The fragmented string literals below are clang-format artifacts from the
   STRINGIFY_ARG expansion; they concatenate into one help message and must
   not be "cleaned up" independently of the macro arguments. */
static MYSQL_SYSVAR_UINT(
    table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
    PLUGIN_VAR_RQCMDARG,
    "Percentage of entries to sample when collecting statistics about table "
    "properties. Specify either 0 to sample everything or percentage "
    "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG(
        RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
                                      "By default " STRINGIFY_ARG(
                                          RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% "
                                                                            "of"
                                                                            " e"
                                                                            "nt"
                                                                            "ri"
                                                                            "es"
                                                                            " a"
                                                                            "re"
                                                                            " "
                                                                            "sa"
                                                                            "mp"
                                                                            "le"
                                                                            "d"
                                                                            ".",
    nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
    RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
    /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);
2074
/* Controls for the background index/table statistics recalculation. */
static MYSQL_SYSVAR_UINT(table_stats_recalc_threshold_pct,
                         rocksdb_table_stats_recalc_threshold_pct,
                         PLUGIN_VAR_RQCMDARG,
                         "Percentage of number of modified rows over total "
                         "number of rows to trigger stats recalculation",
                         nullptr, nullptr, /* default */
                         rocksdb_table_stats_recalc_threshold_pct,
                         /* everything */ 0,
                         /* max */ RDB_TBL_STATS_RECALC_THRESHOLD_PCT_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(
    table_stats_recalc_threshold_count,
    rocksdb_table_stats_recalc_threshold_count, PLUGIN_VAR_RQCMDARG,
    "Number of modified rows to trigger stats recalculation", nullptr,
    nullptr, /* default */
    rocksdb_table_stats_recalc_threshold_count,
    /* everything */ 0,
    /* max */ UINT64_MAX, 0);

/* Renices the stats thread through rocksdb_index_stats_thread_renice. */
static MYSQL_SYSVAR_INT(
    table_stats_background_thread_nice_value,
    rocksdb_table_stats_background_thread_nice_value, PLUGIN_VAR_RQCMDARG,
    "nice value for index stats", rocksdb_index_stats_thread_renice, nullptr,
    /* default */ rocksdb_table_stats_background_thread_nice_value,
    /* min */ THREAD_PRIO_MIN, /* max */ THREAD_PRIO_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(
    table_stats_max_num_rows_scanned, rocksdb_table_stats_max_num_rows_scanned,
    PLUGIN_VAR_RQCMDARG,
    "The maximum number of rows to scan in table scan based "
    "cardinality calculation",
    nullptr, nullptr, /* default */
    0, /* everything */ 0,
    /* max */ UINT64_MAX, 0);

static MYSQL_SYSVAR_UINT(
    stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG,
    "The number of indexes per second to recalculate statistics for. 0 to "
    "disable background recalculation.",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    UINT_MAX /* max value */, 0);

static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan,
                         rocksdb_table_stats_use_table_scan,
                         PLUGIN_VAR_RQCMDARG,
                         "Enable table scan based index calculation.", nullptr,
                         rocksdb_update_table_stats_use_table_scan,
                         rocksdb_table_stats_use_table_scan);
2123
/* Mirrors innodb_large_prefix semantics for index prefix length limits. */
static MYSQL_SYSVAR_BOOL(
    large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
    "Support large index prefix length of 3072 bytes. If off, the maximum "
    "index prefix length is 767.",
    nullptr, nullptr, false);

static MYSQL_SYSVAR_BOOL(
    allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption,
    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
    "Allow server to start successfully when RocksDB corruption is detected.",
    nullptr, nullptr, false);

static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation,
                         rocksdb_error_on_suboptimal_collation,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Raise an error instead of warning if a sub-optimal "
                         "collation is used",
                         nullptr, nullptr, false);

static MYSQL_SYSVAR_BOOL(
    no_create_column_family, rocksdb_no_create_column_family,
    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
    "Do not allow creation of new Column Families through index comments.",
    nullptr, nullptr, false);

static MYSQL_SYSVAR_BOOL(
    enable_insert_with_update_caching,
    rocksdb_enable_insert_with_update_caching, PLUGIN_VAR_OPCMDARG,
    "Whether to enable optimization where we cache the read from a failed "
    "insertion attempt in INSERT ON DUPLICATE KEY UPDATE",
    nullptr, nullptr, true);
2155
/* Tracing action vars: assigning an option string starts a trace via the
   validate callback. */
static MYSQL_SYSVAR_STR(
    trace_block_cache_access, rocksdb_block_cache_trace_options_str,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Block cache trace option string. The format is "
    "sampling_frequency:max_trace_file_size:trace_file_name. "
    "sampling_frequency and max_trace_file_size are positive integers. The "
    "block accesses are saved to the "
    "rocksdb_datadir/block_cache_traces/trace_file_name.",
    rocksdb_trace_block_cache_access, nullptr, "");

static MYSQL_SYSVAR_STR(
    trace_queries, rocksdb_trace_options_str,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Trace option string. The format is "
    "sampling_frequency:max_trace_file_size:trace_file_name. "
    "sampling_frequency and max_trace_file_size are positive integers. The "
    "queries are saved to the "
    "rocksdb_datadir/queries_traces/trace_file_name.",
    rocksdb_trace_queries, nullptr, "");
2175
2176 static MYSQL_SYSVAR_BOOL(skip_locks_if_skip_unique_check,
2177 rocksdb_skip_locks_if_skip_unique_check,
2178 PLUGIN_VAR_RQCMDARG,
2179 "Skip row locking when unique checks are disabled.",
2180 nullptr, nullptr, FALSE);
2181
2182 static MYSQL_SYSVAR_BOOL(
2183 alter_column_default_inplace, rocksdb_alter_column_default_inplace,
2184 PLUGIN_VAR_RQCMDARG,
2185 "Allow inplace alter for alter column default operation", nullptr, nullptr,
2186 TRUE);
2187
/* Assumed average on-disk size, in bytes, of a single key/value pair.
   NOTE(review): heuristic constant — presumably used for rough size/row
   estimates; confirm against the stats code that consumes it. */
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
2189
2190 static struct st_mysql_sys_var *rocksdb_system_variables[] = {
2191 MYSQL_SYSVAR(lock_wait_timeout),
2192 MYSQL_SYSVAR(deadlock_detect),
2193 MYSQL_SYSVAR(deadlock_detect_depth),
2194 MYSQL_SYSVAR(commit_time_batch_for_recovery),
2195 MYSQL_SYSVAR(max_row_locks),
2196 MYSQL_SYSVAR(write_batch_max_bytes),
2197 MYSQL_SYSVAR(write_batch_flush_threshold),
2198 MYSQL_SYSVAR(lock_scanned_rows),
2199 MYSQL_SYSVAR(bulk_load),
2200 MYSQL_SYSVAR(bulk_load_allow_sk),
2201 MYSQL_SYSVAR(bulk_load_allow_unsorted),
2202 MYSQL_SYSVAR(trace_sst_api),
2203 MYSQL_SYSVAR(commit_in_the_middle),
2204 MYSQL_SYSVAR(blind_delete_primary_key),
2205 MYSQL_SYSVAR(enable_iterate_bounds),
2206 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
2207 MYSQL_SYSVAR(read_free_rpl_tables),
2208 MYSQL_SYSVAR(read_free_rpl),
2209 #endif // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
2210 MYSQL_SYSVAR(rpl_skip_tx_api),
2211 MYSQL_SYSVAR(bulk_load_size),
2212 MYSQL_SYSVAR(merge_buf_size),
2213 MYSQL_SYSVAR(enable_bulk_load_api),
2214 MYSQL_SYSVAR(enable_pipelined_write),
2215 MYSQL_SYSVAR(enable_remove_orphaned_dropped_cfs),
2216 MYSQL_SYSVAR(tmpdir),
2217 MYSQL_SYSVAR(merge_combine_read_size),
2218 MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
2219 MYSQL_SYSVAR(skip_bloom_filter_on_read),
2220
2221 MYSQL_SYSVAR(create_if_missing),
2222 MYSQL_SYSVAR(concurrent_prepare),
2223 MYSQL_SYSVAR(two_write_queues),
2224 MYSQL_SYSVAR(manual_wal_flush),
2225 MYSQL_SYSVAR(write_policy),
2226 MYSQL_SYSVAR(create_missing_column_families),
2227 MYSQL_SYSVAR(error_if_exists),
2228 MYSQL_SYSVAR(paranoid_checks),
2229 MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
2230 MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
2231 MYSQL_SYSVAR(delayed_write_rate),
2232 MYSQL_SYSVAR(max_latest_deadlocks),
2233 MYSQL_SYSVAR(info_log_level),
2234 MYSQL_SYSVAR(max_open_files),
2235 MYSQL_SYSVAR(max_total_wal_size),
2236 MYSQL_SYSVAR(use_fsync),
2237 MYSQL_SYSVAR(wal_dir),
2238 MYSQL_SYSVAR(persistent_cache_path),
2239 MYSQL_SYSVAR(persistent_cache_size_mb),
2240 MYSQL_SYSVAR(delete_obsolete_files_period_micros),
2241 MYSQL_SYSVAR(max_background_jobs),
2242 MYSQL_SYSVAR(max_background_flushes),
2243 MYSQL_SYSVAR(max_background_compactions),
2244 MYSQL_SYSVAR(max_bottom_pri_background_compactions),
2245 MYSQL_SYSVAR(max_log_file_size),
2246 MYSQL_SYSVAR(max_subcompactions),
2247 MYSQL_SYSVAR(log_file_time_to_roll),
2248 MYSQL_SYSVAR(keep_log_file_num),
2249 MYSQL_SYSVAR(max_manifest_file_size),
2250 MYSQL_SYSVAR(table_cache_numshardbits),
2251 MYSQL_SYSVAR(wal_ttl_seconds),
2252 MYSQL_SYSVAR(wal_size_limit_mb),
2253 MYSQL_SYSVAR(manifest_preallocation_size),
2254 MYSQL_SYSVAR(use_direct_reads),
2255 MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
2256 MYSQL_SYSVAR(allow_mmap_reads),
2257 MYSQL_SYSVAR(allow_mmap_writes),
2258 MYSQL_SYSVAR(is_fd_close_on_exec),
2259 MYSQL_SYSVAR(stats_dump_period_sec),
2260 MYSQL_SYSVAR(advise_random_on_open),
2261 MYSQL_SYSVAR(db_write_buffer_size),
2262 MYSQL_SYSVAR(use_adaptive_mutex),
2263 MYSQL_SYSVAR(bytes_per_sync),
2264 MYSQL_SYSVAR(wal_bytes_per_sync),
2265 MYSQL_SYSVAR(enable_thread_tracking),
2266 MYSQL_SYSVAR(perf_context_level),
2267 MYSQL_SYSVAR(wal_recovery_mode),
2268 MYSQL_SYSVAR(track_and_verify_wals_in_manifest),
2269 MYSQL_SYSVAR(stats_level),
2270 MYSQL_SYSVAR(access_hint_on_compaction_start),
2271 MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
2272 MYSQL_SYSVAR(compaction_readahead_size),
2273 MYSQL_SYSVAR(allow_concurrent_memtable_write),
2274 MYSQL_SYSVAR(enable_write_thread_adaptive_yield),
2275
2276 MYSQL_SYSVAR(block_cache_size),
2277 MYSQL_SYSVAR(sim_cache_size),
2278 MYSQL_SYSVAR(cache_high_pri_pool_ratio),
2279 MYSQL_SYSVAR(cache_dump),
2280 MYSQL_SYSVAR(cache_index_and_filter_blocks),
2281 MYSQL_SYSVAR(cache_index_and_filter_with_high_priority),
2282 MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
2283 MYSQL_SYSVAR(index_type),
2284 MYSQL_SYSVAR(hash_index_allow_collision),
2285 MYSQL_SYSVAR(no_block_cache),
2286 MYSQL_SYSVAR(block_size),
2287 MYSQL_SYSVAR(block_size_deviation),
2288 MYSQL_SYSVAR(block_restart_interval),
2289 MYSQL_SYSVAR(whole_key_filtering),
2290
2291 MYSQL_SYSVAR(default_cf_options),
2292 MYSQL_SYSVAR(override_cf_options),
2293 MYSQL_SYSVAR(update_cf_options),
2294 MYSQL_SYSVAR(use_default_sk_cf),
2295
2296 MYSQL_SYSVAR(flush_log_at_trx_commit),
2297 MYSQL_SYSVAR(write_disable_wal),
2298 MYSQL_SYSVAR(write_ignore_missing_column_families),
2299
2300 MYSQL_SYSVAR(skip_fill_cache),
2301 MYSQL_SYSVAR(unsafe_for_binlog),
2302
2303 MYSQL_SYSVAR(records_in_range),
2304 MYSQL_SYSVAR(force_index_records_in_range),
2305 MYSQL_SYSVAR(debug_optimizer_n_rows),
2306 MYSQL_SYSVAR(force_compute_memtable_stats),
2307 MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
2308 MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),
2309
2310 MYSQL_SYSVAR(compact_cf),
2311 MYSQL_SYSVAR(delete_cf),
2312 MYSQL_SYSVAR(signal_drop_index_thread),
2313 MYSQL_SYSVAR(pause_background_work),
2314 MYSQL_SYSVAR(ignore_unknown_options),
2315 MYSQL_SYSVAR(strict_collation_check),
2316 MYSQL_SYSVAR(strict_collation_exceptions),
2317 MYSQL_SYSVAR(collect_sst_properties),
2318 MYSQL_SYSVAR(force_flush_memtable_now),
2319 MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
2320 MYSQL_SYSVAR(enable_native_partition),
2321 MYSQL_SYSVAR(enable_ttl),
2322 MYSQL_SYSVAR(enable_ttl_read_filtering),
2323 MYSQL_SYSVAR(debug_ttl_rec_ts),
2324 MYSQL_SYSVAR(debug_ttl_snapshot_ts),
2325 MYSQL_SYSVAR(debug_ttl_read_filter_ts),
2326 MYSQL_SYSVAR(debug_ttl_ignore_pk),
2327 MYSQL_SYSVAR(reset_stats),
2328 MYSQL_SYSVAR(seconds_between_stat_computes),
2329
2330 MYSQL_SYSVAR(compaction_sequential_deletes),
2331 MYSQL_SYSVAR(compaction_sequential_deletes_window),
2332 MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
2333 MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
2334 MYSQL_SYSVAR(print_snapshot_conflict_queries),
2335
2336 MYSQL_SYSVAR(datadir),
2337 MYSQL_SYSVAR(create_checkpoint),
2338
2339 MYSQL_SYSVAR(checksums_pct),
2340 MYSQL_SYSVAR(store_row_debug_checksums),
2341 MYSQL_SYSVAR(verify_row_debug_checksums),
2342 MYSQL_SYSVAR(master_skip_tx_api),
2343
2344 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
2345 MYSQL_SYSVAR(validate_tables),
2346 #endif // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
2347 // ROCKSDB_INCLUDE_VALIDATE_TABLES
2348 MYSQL_SYSVAR(table_stats_sampling_pct),
2349 MYSQL_SYSVAR(table_stats_recalc_threshold_pct),
2350 MYSQL_SYSVAR(table_stats_recalc_threshold_count),
2351 MYSQL_SYSVAR(table_stats_max_num_rows_scanned),
2352 MYSQL_SYSVAR(table_stats_use_table_scan),
2353 MYSQL_SYSVAR(table_stats_background_thread_nice_value),
2354
2355 MYSQL_SYSVAR(large_prefix),
2356 MYSQL_SYSVAR(allow_to_start_after_corruption),
2357 MYSQL_SYSVAR(error_on_suboptimal_collation),
2358 MYSQL_SYSVAR(no_create_column_family),
2359 MYSQL_SYSVAR(stats_recalc_rate),
2360 MYSQL_SYSVAR(debug_manual_compaction_delay),
2361 MYSQL_SYSVAR(max_manual_compactions),
2362 MYSQL_SYSVAR(manual_compaction_threads),
2363 MYSQL_SYSVAR(manual_compaction_bottommost_level),
2364 MYSQL_SYSVAR(rollback_on_timeout),
2365
2366 MYSQL_SYSVAR(enable_insert_with_update_caching),
2367 MYSQL_SYSVAR(trace_block_cache_access),
2368 MYSQL_SYSVAR(trace_queries),
2369
2370 MYSQL_SYSVAR(skip_locks_if_skip_unique_check),
2371 MYSQL_SYSVAR(alter_column_default_inplace),
2372 nullptr};
2373
rdb_get_rocksdb_write_options(my_core::THD * const thd)2374 static rocksdb::WriteOptions rdb_get_rocksdb_write_options(
2375 my_core::THD *const thd) {
2376 rocksdb::WriteOptions opt;
2377
2378 opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
2379 opt.disableWAL = THDVAR(thd, write_disable_wal);
2380 opt.ignore_missing_column_families =
2381 THDVAR(thd, write_ignore_missing_column_families);
2382
2383 return opt;
2384 }
2385
/*
  Sysvar check handler for rocksdb_compact_cf: assigning a column family
  name to the variable schedules a manual compaction of that CF and blocks
  (polling every 100ms) until the compaction finishes or the client thread
  is killed. An empty string or "default" compacts the default CF.

  @return HA_EXIT_SUCCESS, or HA_EXIT_FAILURE when the compaction could not
          be scheduled
*/
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  assert(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    // Debug-sync hook used by tests to race CF drop against compaction.
    DBUG_EXECUTE_IF("rocksdb_compact_column_family", {
      static constexpr char act[] =
          "now signal ready_to_mark_cf_dropped_in_compact_column_family "
          "wait_for mark_cf_dropped_done_in_compact_column_family";
      assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
    });

    std::string cf_name = std::string(cf);
    // use rocksdb_compact_cf="" or "default" to compact default CF
    if (cf_name.empty()) cf_name = DEFAULT_CF_NAME;

    auto cfh = cf_manager.get_cf(cf_name);
    if (cfh != nullptr && rdb != nullptr) {
      // Session-configurable handling of the bottommost level.
      rocksdb::BottommostLevelCompaction bottommost_level_compaction =
          (rocksdb::BottommostLevelCompaction)THDVAR(
              thd, manual_compaction_bottommost_level);

      int mc_id = rdb_mc_thread.request_manual_compaction(
          cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads),
          bottommost_level_compaction);
      // -1 means the manual-compaction queue is full; other negative values
      // are internal scheduling failures reported without a client message.
      if (mc_id == -1) {
        my_error(ER_INTERNAL_ERROR, MYF(0),
                 "Can't schedule more manual compactions. "
                 "Increase rocksdb_max_manual_compactions or stop issuing "
                 "more manual compactions.");
        return HA_EXIT_FAILURE;
      } else if (mc_id < 0) {
        return HA_EXIT_FAILURE;
      }
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Manual compaction of column family: %s\n",
                            cf);
      // Checking thd state every short cycle (100ms). This is for allowing to
      // exiting this function without waiting for CompactRange to finish.
      do {
        my_sleep(100000);
      } while (!thd->killed &&
               !rdb_mc_thread.is_manual_compaction_finished(mc_id));

      if (thd->killed) {
        // This cancels if requested compaction state is INITED.
        // TODO(yoshinorim): Cancel running compaction as well once
        // it is supported in RocksDB.
        rdb_mc_thread.clear_manual_compaction_request(mc_id, true);
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
2445
2446 /*
2447 * Serializes an xid to a string so that it can
2448 * be used as a rocksdb transaction name
2449 */
rdb_xid_to_string(const XID & src)2450 static std::string rdb_xid_to_string(const XID &src) {
2451 assert(src.get_gtrid_length() >= 0);
2452 assert(src.get_gtrid_length() <= MAXGTRIDSIZE);
2453 assert(src.get_bqual_length() >= 0);
2454 assert(src.get_bqual_length() <= MAXBQUALSIZE);
2455
2456 std::string buf;
2457 buf.reserve(RDB_XIDHDR_LEN + src.get_gtrid_length() + src.get_bqual_length());
2458
2459 /*
2460 * expand formatID to fill 8 bytes if it doesn't already
2461 * then reinterpret bit pattern as unsigned and store in network order
2462 */
2463 uchar fidbuf[RDB_FORMATID_SZ];
2464 int64 signed_fid8 = src.get_format_id();
2465 const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
2466 rdb_netbuf_store_uint64(fidbuf, raw_fid8);
2467 buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);
2468
2469 buf.push_back(src.get_gtrid_length());
2470 buf.push_back(src.get_bqual_length());
2471 buf.append(src.get_data(),
2472 (src.get_gtrid_length()) + (src.get_bqual_length()));
2473 return buf;
2474 }
2475
2476 ///////////////////////////////////////////////////////////////////////////////////////////
2477
2478 /*
2479 Drop index thread's control
2480 */
2481
rocksdb_drop_index_wakeup_thread(my_core::THD * const thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * const var MY_ATTRIBUTE ((__unused__)),void * const var_ptr MY_ATTRIBUTE ((__unused__)),const void * const save)2482 static void rocksdb_drop_index_wakeup_thread(
2483 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
2484 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
2485 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
2486 if (*static_cast<const bool *>(save)) {
2487 rdb_drop_idx_thread.signal();
2488 }
2489 }
2490
rocksdb_perf_context_level(THD * const thd)2491 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
2492 assert(thd != nullptr);
2493
2494 const int session_perf_context_level = THDVAR(thd, perf_context_level);
2495 if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2496 return session_perf_context_level;
2497 }
2498
2499 /*
2500 Fallback to global thdvar, if session specific one was not set to a valid
2501 value.
2502 */
2503
2504 const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
2505 if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2506 return global_perf_context_level;
2507 }
2508
2509 return rocksdb::PerfLevel::kDisable;
2510 }
2511
2512 /*
2513 Very short (functor-like) interface to be passed to
2514 Rdb_transaction::walk_tx_list()
2515 */
2516
interface Rdb_tx_list_walker {
  virtual ~Rdb_tx_list_walker() {}
  // Called once per live transaction by Rdb_transaction::walk_tx_list(),
  // with the transaction-list mutex held.
  virtual void process_tran(const Rdb_transaction *const) = 0;
};
2521
2522 /*
2523 This is a helper class that is passed to RocksDB to get notifications when
2524 a snapshot gets created.
2525 */
2526
class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
  // Transaction to notify; nullptr once detach() has been called because
  // the owning transaction was destroyed first.
  Rdb_transaction *m_owning_tx;

  // RocksDB callback invoked when the snapshot is actually created.
  void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;

 public:
  Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
  Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;

  explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
      : m_owning_tx(owning_tx) {}

  // If the owning Rdb_transaction gets destructed we need to not reference
  // it anymore.
  void detach() { m_owning_tx = nullptr; }
};
2543
2544 /* This is the base class for transactions when interacting with rocksdb.
2545 */
2546 class Rdb_transaction {
2547 protected:
2548 ulonglong m_write_count = 0;
2549 // per row data
2550 ulonglong m_row_lock_count = 0;
2551 std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;
2552
2553 bool m_is_delayed_snapshot = false;
2554
2555 std::unordered_set<Rdb_tbl_def*> modified_tables;
2556
2557 private:
2558 /*
2559 Number of write operations this transaction had when we took the last
2560 savepoint (the idea is not to take another savepoint if we haven't made
2561 any changes)
2562 */
2563 ulonglong m_writes_at_last_savepoint;
2564
2565 protected:
2566 THD *m_thd = nullptr;
2567
2568 rocksdb::ReadOptions m_read_opts;
2569
2570 static std::multiset<Rdb_transaction *> s_tx_list;
2571 static mysql_mutex_t s_tx_list_mutex;
2572
2573 Rdb_io_perf *m_tbl_io_perf;
2574
2575 bool m_tx_read_only = false;
2576
2577 int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */
2578
2579 /* Maximum number of locks the transaction can have */
2580 ulonglong m_max_row_locks;
2581
2582 bool m_is_tx_failed = false;
2583 bool m_rollback_only = false;
2584
2585 std::shared_ptr<Rdb_snapshot_notifier> m_notifier;
2586
2587 // This should be used only when updating binlog information.
2588 virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
2589 virtual bool commit_no_binlog() = 0;
2590 virtual rocksdb::Iterator *get_iterator(
2591 const rocksdb::ReadOptions &options,
2592 rocksdb::ColumnFamilyHandle *column_family) = 0;
2593
2594 /*
2595 @detail
2596 This function takes in the WriteBatch of the transaction to add
2597 all the AUTO_INCREMENT merges. It does so by iterating through
2598 m_auto_incr_map and then constructing key/value pairs to call merge upon.
2599
2600 @param wb
2601 */
merge_auto_incr_map(rocksdb::WriteBatchBase * const wb)2602 rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) {
2603 DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", return rocksdb::Status::OK(););
2604
2605 // Iterate through the merge map merging all keys into data dictionary.
2606 rocksdb::Status s;
2607 for (auto &it : m_auto_incr_map) {
2608 s = dict_manager.put_auto_incr_val(wb, it.first, it.second);
2609 if (!s.ok()) {
2610 return s;
2611 }
2612 }
2613 m_auto_incr_map.clear();
2614 return s;
2615 }
2616
2617 protected:
2618 /*
2619 The following two are helper functions to be overloaded by child classes.
2620 They should provide RocksDB's savepoint semantics.
2621 */
2622 virtual void do_set_savepoint() = 0;
2623 virtual rocksdb::Status do_pop_savepoint() = 0;
2624 virtual void do_rollback_to_savepoint() = 0;
2625
2626 public:
2627 int64_t m_snapshot_timestamp = 0;
2628 bool m_ddl_transaction;
2629
2630 /*
2631 Tracks the number of tables in use through external_lock.
2632 This should not be reset during start_tx().
2633 */
2634 int64_t m_n_mysql_tables_in_use = 0;
2635
2636 /*
2637 for distinction between rdb_transaction_impl and rdb_writebatch_impl
2638 when using walk tx list
2639 */
2640 virtual bool is_writebatch_trx() const = 0;
2641
  // Initialize the mutex guarding the global transaction list (s_tx_list).
  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  // Destroy the transaction-list mutex; every transaction must already have
  // been removed from s_tx_list by this point.
  static void term_mutex() {
    assert(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }
2650
  // Invoke walker->process_tran() on every live transaction, holding the
  // transaction-list mutex for the duration of the walk.
  static void walk_tx_list(Rdb_tx_list_walker *walker) {
    assert(walker != nullptr);

    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);

    for (auto it : s_tx_list) walker->process_tran(it);

    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
2660
  /*
    Translate a failed rocksdb::Status into a handler error code, updating
    the relevant server status counters, and marking the statement or whole
    transaction for rollback where the SQL layer expects it.

    @param thd      connection issuing the statement
    @param s        RocksDB status; must not be ok()
    @param kd       key definition involved in the failed operation
    @param tbl_def  table involved in the failed operation (must be non-null)
    @return handler error code (HA_ERR_*)
  */
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def) {
    assert(!s.ok());
    assert(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because it's
        an IGNORE command!) but then will fail an assert, because "error code
        was returned, but no error happened". Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
      */
      thd->mark_transaction_to_rollback(
          static_cast<bool>(rocksdb_rollback_on_timeout));

      rocksdb_row_lock_wait_timeouts++;

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      // Deadlocks always roll back the whole transaction.
      thd->mark_transaction_to_rollback(true /* whole transaction */);
      rocksdb_row_lock_deadlocks++;
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      // Snapshot conflict; optionally log the offending user/query.
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd->security_context(), user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning("Got snapshot conflict errors: User: %s Query: %.*s",
                          user_host_buff, static_cast<int>(thd->query().length),
                          thd->query().str);
      }
      return HA_ERR_ROCKSDB_STATUS_BUSY;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }

    return ha_rocksdb::rdb_error_to_mysql(s);
  }
2706
  // Connection that owns this transaction.
  THD *get_thd() const { return m_thd; }
2708
2709 /* Used for tracking io_perf counters */
  // Begin attributing perf_context stats to io_perf, unless counters for
  // another table are already being tracked for this transaction.
  void io_perf_start(Rdb_io_perf *const io_perf) {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that hit
      multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd))) {
      m_tbl_io_perf = io_perf;
    }
  }
2731
io_perf_end_and_record(void)2732 void io_perf_end_and_record(void) {
2733 if (m_tbl_io_perf != nullptr) {
2734 m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
2735 m_tbl_io_perf = nullptr;
2736 }
2737 }
2738
io_perf_end_and_record(Rdb_io_perf * const io_perf)2739 void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
2740 if (m_tbl_io_perf == io_perf) {
2741 io_perf_end_and_record();
2742 }
2743 }
2744
  // Cache the lock-wait timeout and row-lock cap for this transaction and
  // push the timeout down to the underlying RocksDB transaction object.
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }
2750
2751 virtual void set_lock_timeout(int timeout_sec_arg) = 0;
2752
  // Number of write operations performed by this transaction so far.
  ulonglong get_write_count() const { return m_write_count; }

  // Number of row locks currently attributed to this transaction.
  ulonglong get_row_lock_count() const { return m_row_lock_count; }

  void incr_row_lock_count() { ++m_row_lock_count; }

  // Maximum number of row locks this transaction is allowed to hold.
  ulonglong get_max_row_lock_count() const { return m_max_row_locks; }

  // Cached value of @@rocksdb_lock_wait_timeout for this transaction.
  int get_timeout_sec() const { return m_timeout_sec; }
2762
2763 virtual void set_sync(bool sync) = 0;
2764
2765 virtual void release_lock(const Rdb_key_def &key_descr,
2766 const std::string &rowkey) = 0;
2767
2768 virtual bool prepare() = 0;
2769
commit_or_rollback()2770 bool commit_or_rollback() {
2771 bool res;
2772 if (m_is_tx_failed) {
2773 rollback();
2774 res = false;
2775 } else {
2776 res = commit();
2777 }
2778 return res;
2779 }
2780
  // Commit the transaction. An empty transaction is rolled back instead
  // (there is nothing to persist). Returns true on error, false on success.
  bool commit() {
    if (get_write_count() == 0) {
      rollback();
      return false;
    } else if (m_rollback_only) {
      /*
        Transactions marked as rollback_only are expected to be rolled back at
        prepare(). But there are some exceptions like below that prepare() is
        never called and commit() is called instead.
        1. Binlog is disabled
        2. No modification exists in binlog cache for the transaction (#195)
        In both cases, rolling back transaction is safe. Nothing is written to
        binlog.
      */
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      rollback();
      return true;
    } else {
      return commit_no_binlog();
    }
  }
2802
2803 virtual void rollback() = 0;
2804
  // Called (via Rdb_snapshot_notifier) when RocksDB creates a snapshot for
  // this transaction: install it in the read options and record its time.
  void snapshot_created(const rocksdb::Snapshot *const snapshot) {
    assert(snapshot != nullptr);

    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }
2812
2813 virtual void acquire_snapshot(bool acquire_now) = 0;
2814 virtual void release_snapshot() = 0;
2815
  // True when a read snapshot has been installed in m_read_opts.
  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
2817
2818 private:
2819 // The Rdb_sst_info structures we are currently loading. In a partitioned
2820 // table this can have more than one entry
2821 std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
2822 std::string m_curr_bulk_load_tablename;
2823
2824 /* External merge sorts for bulk load: key ID -> merge sort instance */
2825 std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;
2826
2827 public:
get_key_merge(GL_INDEX_ID kd_gl_id,rocksdb::ColumnFamilyHandle * cf,Rdb_index_merge ** key_merge)2828 int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf,
2829 Rdb_index_merge **key_merge) {
2830 int res;
2831 auto it = m_key_merge.find(kd_gl_id);
2832 if (it == m_key_merge.end()) {
2833 m_key_merge.emplace(
2834 std::piecewise_construct, std::make_tuple(kd_gl_id),
2835 std::make_tuple(
2836 get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size),
2837 THDVAR(get_thd(), merge_combine_read_size),
2838 THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf));
2839 it = m_key_merge.find(kd_gl_id);
2840 if ((res = it->second.init()) != 0) {
2841 return res;
2842 }
2843 }
2844 *key_merge = &it->second;
2845 return HA_EXIT_SUCCESS;
2846 }
2847
2848 /* Finish bulk loading for all table handlers belongs to one connection */
finish_bulk_load(bool * is_critical_error=nullptr,int print_client_error=true)2849 int finish_bulk_load(bool *is_critical_error = nullptr,
2850 int print_client_error = true) {
2851 Ensure_cleanup cleanup([&]() {
2852 // Always clear everything regardless of success/failure
2853 m_curr_bulk_load.clear();
2854 m_curr_bulk_load_tablename.clear();
2855 m_key_merge.clear();
2856 });
2857
2858 int rc = 0;
2859 if (is_critical_error) {
2860 *is_critical_error = true;
2861 }
2862
2863 // PREPARE phase: finish all on-going bulk loading Rdb_sst_info and
2864 // collect all Rdb_sst_commit_info containing (SST files, cf)
2865 int rc2 = 0;
2866 std::vector<Rdb_sst_info::Rdb_sst_commit_info> sst_commit_list;
2867 sst_commit_list.reserve(m_curr_bulk_load.size());
2868
2869 for (auto &sst_info : m_curr_bulk_load) {
2870 Rdb_sst_info::Rdb_sst_commit_info commit_info;
2871
2872 // Commit the list of SST files and move it to the end of
2873 // sst_commit_list, effectively transfer the ownership over
2874 rc2 = sst_info->finish(&commit_info, print_client_error);
2875 if (rc2 && rc == 0) {
2876 // Don't return yet - make sure we finish all the SST infos
2877 rc = rc2;
2878 }
2879
2880 // Make sure we have work to do - we might be losing the race
2881 if (rc2 == 0 && commit_info.has_work()) {
2882 sst_commit_list.emplace_back(std::move(commit_info));
2883 assert(!commit_info.has_work());
2884 }
2885 }
2886
2887 if (rc) {
2888 return rc;
2889 }
2890
2891 // MERGING Phase: Flush the index_merge sort buffers into SST files in
2892 // Rdb_sst_info and collect all Rdb_sst_commit_info containing
2893 // (SST files, cf)
2894 if (!m_key_merge.empty()) {
2895 Ensure_cleanup malloc_cleanup([]() {
2896 /*
2897 Explicitly tell jemalloc to clean up any unused dirty pages at this
2898 point.
2899 See https://reviews.facebook.net/D63723 for more details.
2900 */
2901 purge_all_jemalloc_arenas();
2902 });
2903
2904 rocksdb::Slice merge_key;
2905 rocksdb::Slice merge_val;
2906 for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
2907 GL_INDEX_ID index_id = it->first;
2908 std::shared_ptr<const Rdb_key_def> keydef =
2909 ddl_manager.safe_find(index_id);
2910 std::string table_name = ddl_manager.safe_get_table_name(index_id);
2911
2912 // Unable to find key definition or table name since the
2913 // table could have been dropped.
2914 // TODO(herman): there is a race here between dropping the table
2915 // and detecting a drop here. If the table is dropped while bulk
2916 // loading is finishing, these keys being added here may
2917 // be missed by the compaction filter and not be marked for
2918 // removal. It is unclear how to lock the sql table from the storage
2919 // engine to prevent modifications to it while bulk load is occurring.
2920 if (keydef == nullptr) {
2921 if (is_critical_error) {
2922 // We used to set the error but simply ignores it. This follows
2923 // current behavior and we should revisit this later
2924 *is_critical_error = false;
2925 }
2926 return HA_ERR_KEY_NOT_FOUND;
2927 } else if (table_name.empty()) {
2928 if (is_critical_error) {
2929 // We used to set the error but simply ignores it. This follows
2930 // current behavior and we should revisit this later
2931 *is_critical_error = false;
2932 }
2933 return HA_ERR_NO_SUCH_TABLE;
2934 }
2935 const std::string &index_name = keydef->get_name();
2936 Rdb_index_merge &rdb_merge = it->second;
2937
2938 // Rdb_sst_info expects a denormalized table name in the form of
2939 // "./database/table"
2940 std::replace(table_name.begin(), table_name.end(), '.', '/');
2941 table_name = "./" + table_name;
2942 auto sst_info = std::make_shared<Rdb_sst_info>(
2943 rdb, table_name, index_name, rdb_merge.get_cf(),
2944 *rocksdb_db_options, THDVAR(get_thd(), trace_sst_api));
2945
2946 while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
2947 if ((rc2 = sst_info->put(merge_key, merge_val)) != 0) {
2948 rc = rc2;
2949
2950 // Don't return yet - make sure we finish the sst_info
2951 break;
2952 }
2953 }
2954
2955 // -1 => no more items
2956 if (rc2 != -1 && rc != 0) {
2957 rc = rc2;
2958 }
2959
2960 Rdb_sst_info::Rdb_sst_commit_info commit_info;
2961 rc2 = sst_info->finish(&commit_info, print_client_error);
2962 if (rc2 != 0 && rc == 0) {
2963 // Only set the error from sst_info->finish if finish failed and we
2964 // didn't fail before. In other words, we don't have finish's
2965 // success mask earlier failures
2966 rc = rc2;
2967 }
2968
2969 if (rc) {
2970 return rc;
2971 }
2972
2973 if (commit_info.has_work()) {
2974 sst_commit_list.emplace_back(std::move(commit_info));
2975 assert(!commit_info.has_work());
2976 }
2977 }
2978 }
2979
2980 // Early return in case we lost the race completely and end up with no
2981 // work at all
2982 if (sst_commit_list.size() == 0) {
2983 return rc;
2984 }
2985
2986 // INGEST phase: Group all Rdb_sst_commit_info by cf (as they might
2987 // have the same cf across different indexes) and call out to RocksDB
2988 // to ingest all SST files in one atomic operation
2989 rocksdb::IngestExternalFileOptions options;
2990 options.move_files = true;
2991 options.snapshot_consistency = false;
2992 options.allow_global_seqno = false;
2993 options.allow_blocking_flush = false;
2994
2995 std::map<rocksdb::ColumnFamilyHandle *, rocksdb::IngestExternalFileArg>
2996 arg_map;
2997
2998 // Group by column_family
2999 for (auto &commit_info : sst_commit_list) {
3000 if (arg_map.find(commit_info.get_cf()) == arg_map.end()) {
3001 rocksdb::IngestExternalFileArg arg;
3002 arg.column_family = commit_info.get_cf(),
3003 arg.external_files = commit_info.get_committed_files(),
3004 arg.options = options;
3005
3006 arg_map.emplace(commit_info.get_cf(), arg);
3007 } else {
3008 auto &files = arg_map[commit_info.get_cf()].external_files;
3009 files.insert(files.end(), commit_info.get_committed_files().begin(),
3010 commit_info.get_committed_files().end());
3011 }
3012 }
3013
3014 std::vector<rocksdb::IngestExternalFileArg> args;
3015 size_t file_count = 0;
3016 for (auto &cf_files_pair : arg_map) {
3017 args.push_back(cf_files_pair.second);
3018 file_count += cf_files_pair.second.external_files.size();
3019 }
3020
3021 const rocksdb::Status s = rdb->IngestExternalFiles(args);
3022 if (THDVAR(m_thd, trace_sst_api)) {
3023 // NO_LINT_DEBUG
3024 sql_print_information(
3025 "SST Tracing: IngestExternalFile '%zu' files returned %s", file_count,
3026 s.ok() ? "ok" : "not ok");
3027 }
3028
3029 if (!s.ok()) {
3030 if (print_client_error) {
3031 Rdb_sst_info::report_error_msg(s, nullptr);
3032 }
3033 return HA_ERR_ROCKSDB_BULK_LOAD;
3034 }
3035
3036 // COMMIT phase: mark everything as completed. This avoids SST file
3037 // deletion kicking in. Otherwise SST files would get deleted if this
3038 // entire operation is aborted
3039 for (auto &commit_info : sst_commit_list) {
3040 commit_info.commit();
3041 }
3042
3043 return rc;
3044 }
3045
  /*
    Register an Rdb_sst_info for bulk loading on this transaction.

    @param bulk_load  handler performing the bulk load (must be non-null)
    @param sst_info   SST writer shared between the handler and this
                      transaction
    @return HA_EXIT_SUCCESS, or the error from finishing a previous table's
            bulk load
  */
  int start_bulk_load(ha_rocksdb *const bulk_load,
                      std::shared_ptr<Rdb_sst_info> sst_info) {
    /*
      If we already have an open bulk load of a table and the name doesn't
      match the current one, close out the currently running one. This allows
      multiple bulk loads to occur on a partitioned table, but then closes
      them all out when we switch to another table.
    */
    assert(bulk_load != nullptr);

    if (!m_curr_bulk_load.empty() &&
        bulk_load->get_table_basename() != m_curr_bulk_load_tablename) {
      const auto res = finish_bulk_load();
      if (res != HA_EXIT_SUCCESS) {
        return res;
      }
    }

    /*
      This used to track ha_rocksdb handler objects, but those can be
      freed by the table cache while this was referencing them. Instead
      of tracking ha_rocksdb handler objects, this now tracks the
      Rdb_sst_info allocated, and both the ha_rocksdb handler and the
      Rdb_transaction both have shared pointers to them.

      On transaction complete, it will commit each Rdb_sst_info structure found.
      If the ha_rocksdb object is freed, etc., it will also commit
      the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent.
    */
    m_curr_bulk_load.push_back(sst_info);
    m_curr_bulk_load_tablename = bulk_load->get_table_basename();
    return HA_EXIT_SUCCESS;
  }
3079
  // Number of Rdb_sst_info instances currently open for bulk load.
  int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
3081
get_rocksdb_tmpdir() const3082 const char *get_rocksdb_tmpdir() const {
3083 const char *tmp_dir = THDVAR(get_thd(), tmpdir);
3084
3085 /*
3086 We want to treat an empty string as nullptr, in these cases DDL operations
3087 will use the default --tmpdir passed to mysql instead.
3088 */
3089 if (tmp_dir != nullptr && *tmp_dir == '\0') {
3090 tmp_dir = nullptr;
3091 }
3092 return (tmp_dir);
3093 }
3094
3095 /*
3096 Flush the data accumulated so far. This assumes we're doing a bulk insert.
3097
3098 @detail
3099 This should work like transaction commit, except that we don't
3100 synchronize with the binlog (there is no API that would allow to have
3101 binlog flush the changes accumulated so far and return its current
3102 position)
3103
3104 @todo
3105 Add test coverage for what happens when somebody attempts to do bulk
3106 inserts while inside a multi-statement transaction.
3107 */
  // See the comment above: commit the writes accumulated so far during a
  // bulk insert and begin a fresh transaction. Returns true on commit
  // error, false otherwise (including the nothing-to-flush case).
  bool flush_batch() {
    if (get_write_count() == 0) return false;

    /* Commit the current transaction */
    if (commit_no_binlog()) return true;

    /* Start another one */
    start_tx();
    return false;
  }
3118
set_auto_incr(const GL_INDEX_ID & gl_index_id,ulonglong curr_id)3119 void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) {
3120 m_auto_incr_map[gl_index_id] =
3121 std::max(m_auto_incr_map[gl_index_id], curr_id);
3122 }
3123
3124 #ifndef NDEBUG
get_auto_incr(const GL_INDEX_ID & gl_index_id)3125 ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) {
3126 if (m_auto_incr_map.count(gl_index_id) > 0) {
3127 return m_auto_incr_map[gl_index_id];
3128 }
3129 return 0;
3130 }
3131 #endif
3132
  // Write a key/value through the transaction; assume_tracked tells RocksDB
  // that this transaction already tracks the key's lock.
  virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value,
                              const bool assume_tracked) = 0;
  // Delete a key through the transaction.
  virtual rocksdb::Status delete_key(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;
  // Issue a SingleDelete for the key through the transaction.
  virtual rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;

  // True once this transaction has accumulated any writes.
  virtual bool has_modifications() const = 0;

  // Batch whose writes skip transaction locking but ARE visible to the
  // transaction (see get_blind_write_batch() for the invisible variant).
  virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
3147 /*
3148 Return a WriteBatch that one can write to. The writes will skip any
3149 transaction locking. The writes will NOT be visible to the transaction.
3150 */
get_blind_write_batch()3151 rocksdb::WriteBatchBase *get_blind_write_batch() {
3152 return get_indexed_write_batch()->GetWriteBatch();
3153 }
3154
  // Snapshot-consistent point lookup into the given column family.
  virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              rocksdb::PinnableSlice *const value) const = 0;
  // Locking read of a key (shared or exclusive per 'exclusive'); do_validate
  // controls snapshot validation in the underlying implementation.
  virtual rocksdb::Status get_for_update(const Rdb_key_def &key_descr,
                                         const rocksdb::Slice &key,
                                         rocksdb::PinnableSlice *const value,
                                         bool exclusive,
                                         const bool do_validate) = 0;
3163
  /*
    Create an iterator over the given column family.

    @param skip_bloom_filter   use total-order seek (optionally bounded by the
                               eq_cond slices) instead of relying on the
                               prefix bloom filter
    @param eq_cond_lower_bound / eq_cond_upper_bound
                               iterate bounds applied when the bloom filter is
                               skipped; ReadOptions stores the ADDRESSES of
                               these slices, so they must outlive the iterator
    @param read_current        read latest data instead of the snapshot
    @param create_snapshot     acquire a snapshot before creating the iterator
  */
  rocksdb::Iterator *get_iterator(
      rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
      bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound,
      const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
      bool create_snapshot = true) {
    // Make sure we are not doing both read_current (which implies we don't
    // want a snapshot) and create_snapshot which makes sure we create
    // a snapshot
    assert(column_family != nullptr);
    assert(!read_current || !create_snapshot);

    if (create_snapshot) acquire_snapshot(true);

    rocksdb::ReadOptions options = m_read_opts;

    if (skip_bloom_filter) {
      const bool enable_iterate_bounds =
          THDVAR(get_thd(), enable_iterate_bounds);
      options.total_order_seek = true;
      options.iterate_lower_bound =
          enable_iterate_bounds ? &eq_cond_lower_bound : nullptr;
      options.iterate_upper_bound =
          enable_iterate_bounds ? &eq_cond_upper_bound : nullptr;
    } else {
      // With this option, Iterator::Valid() returns false if key
      // is outside of the prefix bloom filter range set at Seek().
      // Must not be set to true if not using bloom filter.
      options.prefix_same_as_start = true;
    }
    options.fill_cache = fill_cache;
    if (read_current) {
      options.snapshot = nullptr;
    }
    return get_iterator(options, column_family);
  }
3199
  // True once an underlying transaction/batch object exists.
  virtual bool is_tx_started() const = 0;
  // Begin (or re-begin) the underlying transaction.
  virtual void start_tx() = 0;
  // Per-statement begin hook inside a multi-statement transaction.
  virtual void start_stmt() = 0;
  // Assign the transaction its XA name (implementations may no-op).
  virtual void set_name() = 0;
3204
3205 protected:
3206 // Non-virtual functions with actions to be done on transaction start and
3207 // commit.
on_commit()3208 void on_commit() {
3209 time_t tm;
3210 tm = time(nullptr);
3211 for (auto &it : modified_tables) {
3212 it->m_update_time = tm;
3213 }
3214 modified_tables.clear();
3215 }
  // Rollback counterpart of on_commit(): drop the modified-table set
  // without touching the tables' update times.
  void on_rollback() {
    modified_tables.clear();
  }
 public:
  // Remember that this transaction wrote to 'tbl' so on_commit() can refresh
  // the table's update time.
  void log_table_write_op(Rdb_tbl_def *tbl) {
    modified_tables.insert(tbl);
  }
3223
  void set_initial_savepoint() {
    /*
      Set the initial savepoint. If the first statement in the transaction
      fails, we need something to roll back to, without rolling back the
      entire transaction.
    */
    do_set_savepoint();
    // Remember the write count so later code can tell whether the statement
    // actually changed anything since this savepoint.
    m_writes_at_last_savepoint = m_write_count;
  }
3233
3234 /*
3235 Called when a "top-level" statement inside a transaction completes
3236 successfully and its changes become part of the transaction's changes.
3237 */
make_stmt_savepoint_permanent()3238 int make_stmt_savepoint_permanent() {
3239 // Take another RocksDB savepoint only if we had changes since the last
3240 // one. This is very important for long transactions doing lots of
3241 // SELECTs.
3242 if (m_writes_at_last_savepoint != m_write_count) {
3243 rocksdb::Status status = rocksdb::Status::NotFound();
3244 while ((status = do_pop_savepoint()) == rocksdb::Status::OK()) {
3245 }
3246
3247 if (status != rocksdb::Status::NotFound()) {
3248 return HA_EXIT_FAILURE;
3249 }
3250
3251 do_set_savepoint();
3252 m_writes_at_last_savepoint = m_write_count;
3253 }
3254
3255 return HA_EXIT_SUCCESS;
3256 }
3257
3258 /*
3259 Rollback to the savepoint we've set before the last statement
3260 */
rollback_to_stmt_savepoint()3261 void rollback_to_stmt_savepoint() {
3262 if (m_writes_at_last_savepoint != m_write_count) {
3263 do_rollback_to_savepoint();
3264 /*
3265 RollbackToSavePoint "removes the most recent SetSavePoint()", so
3266 we need to set it again so that next statement can roll back to this
3267 stage.
3268 It's ok to do it here at statement end (instead of doing it at next
3269 statement start) because setting a savepoint is cheap.
3270 */
3271 do_set_savepoint();
3272 m_write_count = m_writes_at_last_savepoint;
3273 }
3274 }
3275
  // Roll back only the current statement's changes; the transaction lives on.
  virtual void rollback_stmt() = 0;

  // Record whether the transaction has failed.
  void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
3279
can_prepare() const3280 bool can_prepare() const {
3281 if (m_rollback_only) {
3282 my_error(ER_ROLLBACK_ONLY, MYF(0));
3283 return false;
3284 }
3285 return true;
3286 }
3287
rollback_to_savepoint(void * const savepoint)3288 int rollback_to_savepoint(void *const savepoint) {
3289 if (has_modifications()) {
3290 my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0));
3291 m_rollback_only = true;
3292 return HA_EXIT_FAILURE;
3293 }
3294 return HA_EXIT_SUCCESS;
3295 }
3296
3297 /*
3298 This is used by transactions started with "START TRANSACTION WITH "
3299 "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
3300 snapshot has to be created via DB::GetSnapshot(), not via Transaction
3301 API.
3302 */
  // Accessors for the read-only flag described in the comment above.
  bool is_tx_read_only() const { return m_tx_read_only; }

  void set_tx_read_only(bool val) { m_tx_read_only = val; }
3306
  // Register this transaction in the global s_tx_list under its mutex.
  explicit Rdb_transaction(THD *const thd)
      : m_thd(thd), m_tbl_io_perf(nullptr) {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.insert(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
3313
  // Unregister from the global transaction list.
  virtual ~Rdb_transaction() {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.erase(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
3319 };
3320
3321 #ifndef NDEBUG
3322 // simulate that RocksDB has reported corrupted data
// Debug-build fault injection: overwrite a status with Corruption.
static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
  *status = rocksdb::Status::Corruption();
}
// Debug-build fault injection: overwrite a status with IOError.
static void dbug_change_status_to_io_error(rocksdb::Status *status) {
  *status = rocksdb::Status::IOError();
}
// Debug-build fault injection: overwrite a status with Incomplete.
static void dbug_change_status_to_incomplete(rocksdb::Status *status) {
  *status = rocksdb::Status::Incomplete();
}
3332 #endif
3333
3334 /*
3335 This is a rocksdb transaction. Its members represent the current transaction,
3336 which consists of:
3337 - the snapshot
3338 - the changes we've made but are not seeing yet.
3339
3340 The changes are made to individual tables, which store them here and then
3341 this object commits them on commit.
3342 */
class Rdb_transaction_impl : public Rdb_transaction {
  // The live RocksDB transaction; nullptr when no transaction is active.
  rocksdb::Transaction *m_rocksdb_tx = nullptr;
  // A finished transaction object kept around so start_tx() can hand it
  // back to BeginTransaction() for reuse instead of allocating a new one.
  rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;

 public:
  // NOTE(review): applies m_timeout_sec (set earlier via set_params())
  // rather than timeout_sec_arg -- confirm the unused parameter is
  // intentional.
  void set_lock_timeout(int timeout_sec_arg) override {
    if (m_rocksdb_tx) {
      m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
    }
  }

  // Toggle WAL sync-on-commit in this transaction's write options.
  void set_sync(bool sync) override {
    m_rocksdb_tx->GetWriteOptions()->sync = sync;
  }

  // Undo the row lock taken by a previous GetForUpdate() on this key,
  // unless the session asked to keep locks on scanned rows.
  void release_lock(const Rdb_key_def &key_descr,
                    const std::string &rowkey) override {
    if (!THDVAR(m_thd, lock_scanned_rows)) {
      m_rocksdb_tx->UndoGetForUpdate(key_descr.get_cf(),
                                     rocksdb::Slice(rowkey));
      // row_lock_count track row(pk)
      assert(!key_descr.is_primary_key() ||
             (key_descr.is_primary_key() && m_row_lock_count > 0));
      // m_row_lock_count tracks per row data instead of per key data
      if (key_descr.is_primary_key() && m_row_lock_count > 0) {
        m_row_lock_count--;
      }
    }
  }

  virtual bool is_writebatch_trx() const override { return false; }

 private:
  // Hand the finished transaction object over to m_rocksdb_reuse_tx so the
  // next start_tx() can recycle it.
  void release_tx(void) {
    // We are done with the current active transaction object. Preserve it
    // for later reuse.
    assert(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }

  // XA prepare: merge the auto-increment map into the write batch, then run
  // RocksDB's Prepare().  Raises an error and returns false on failure.
  bool prepare() override {
    rocksdb::Status s;

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
#ifndef NDEBUG
    DBUG_EXECUTE_IF("myrocks_prepare_io_error",
                    dbug_change_status_to_io_error(&s););
    DBUG_EXECUTE_IF("myrocks_prepare_incomplete",
                    dbug_change_status_to_incomplete(&s););
#endif
    if (!s.ok()) {
      std::string msg =
          "RocksDB error on COMMIT (Prepare/merge): " + s.ToString();
      my_error(ER_INTERNAL_ERROR, MYF(0), msg.c_str());
      return false;
    }

    s = m_rocksdb_tx->Prepare();
    if (!s.ok()) {
      std::string msg = "RocksDB error on COMMIT (Prepare): " + s.ToString();
      my_error(ER_INTERNAL_ERROR, MYF(0), msg.c_str());
      return false;
    }
    return true;
  }

  // Commit without binlog coordination.  Returns true on error.  In every
  // case it resets per-transaction counters and recycles the transaction
  // object for reuse.
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
#ifndef NDEBUG
    DBUG_EXECUTE_IF("myrocks_commit_merge_io_error",
                    dbug_change_status_to_io_error(&s););
    DBUG_EXECUTE_IF("myrocks_commit_merge_incomplete",
                    dbug_change_status_to_incomplete(&s););
#endif
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();
    s = m_rocksdb_tx->Commit();
#ifndef NDEBUG
    DBUG_EXECUTE_IF("myrocks_commit_io_error",
                    dbug_change_status_to_io_error(&s););
    DBUG_EXECUTE_IF("myrocks_commit_incomplete",
                    dbug_change_status_to_incomplete(&s););
#endif
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    on_commit();
  error:
    // Running on_rollback() after a successful on_commit() is harmless: it
    // only clears the (already emptied) modified-table set.
    on_rollback();
    /* Save the transaction object to be reused */
    release_tx();

    m_write_count = 0;
    m_row_lock_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

 public:
  // Roll back the whole transaction, releasing its snapshot and (via
  // rocksdb::Transaction::Rollback) all row locks.
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_row_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }

  // Acquire a snapshot now, or (when acquire_now is false) arm delayed
  // acquisition on the next operation via the notifier.
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) {
      if (is_tx_read_only()) {
        // Read-only transactions take a DB-level snapshot (see the comment
        // on is_tx_read_only() about CONSISTENT SNAPSHOT).
        snapshot_created(rdb->GetSnapshot());
      } else if (acquire_now) {
        m_rocksdb_tx->SetSnapshot();
        snapshot_created(m_rocksdb_tx->GetSnapshot());
      } else if (!m_is_delayed_snapshot) {
        m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
        m_is_delayed_snapshot = true;
      }
    }
  }

  // Release the current snapshot, if any.  DB-level snapshots are returned
  // to the DB; transaction-level ones are cleared on the transaction.
  void release_snapshot() override {
    bool need_clear = m_is_delayed_snapshot;

    if (m_read_opts.snapshot != nullptr) {
      m_snapshot_timestamp = 0;
      if (is_tx_read_only()) {
        rdb->ReleaseSnapshot(m_read_opts.snapshot);
        need_clear = false;
      } else {
        need_clear = true;
      }
      m_read_opts.snapshot = nullptr;
    }

    if (need_clear && m_rocksdb_tx != nullptr) m_rocksdb_tx->ClearSnapshot();
  }

  bool has_snapshot() { return m_read_opts.snapshot != nullptr; }

  // Write through the transaction (takes/validates locks per RocksDB rules).
  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key, const rocksdb::Slice &value,
                      const bool assume_tracked) override {
    ++m_write_count;
    return m_rocksdb_tx->Put(column_family, key, value, assume_tracked);
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key,
                             const bool assume_tracked) override {
    ++m_write_count;
    return m_rocksdb_tx->Delete(column_family, key, assume_tracked);
  }

  rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) override {
    ++m_write_count;
    return m_rocksdb_tx->SingleDelete(column_family, key, assume_tracked);
  }

  // True if the transaction's write batch contains any entries.
  bool has_modifications() const override {
    return m_rocksdb_tx->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
  }

  // Commit-time write batch: applied at commit, bypassing conflict checks.
  rocksdb::WriteBatchBase *get_write_batch() override {
    return m_rocksdb_tx->GetCommitTimeWriteBatch();
  }

  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes WILL be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_rocksdb_tx->GetWriteBatch();
  }

  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    // clean PinnableSlice right before Get() for multiple gets per statement
    // the resources after the last Get in a statement are cleared in
    // handler::reset call
    value->Reset();
    global_stats.queries[QUERIES_POINT].inc();
    return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
  }

  // Locking read.  Fails with Aborted(kLockLimit) once the per-transaction
  // row-lock budget is exhausted.
  rocksdb::Status get_for_update(const Rdb_key_def &key_descr,
                                 const rocksdb::Slice &key,
                                 rocksdb::PinnableSlice *const value,
                                 bool exclusive,
                                 const bool do_validate) override {
    rocksdb::ColumnFamilyHandle *const column_family = key_descr.get_cf();
    /* check row lock limit in a trx */
    if (get_row_lock_count() >= get_max_row_lock_count()) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }

    if (value != nullptr) {
      value->Reset();
    }
    rocksdb::Status s;
    // If snapshot is null, pass it to GetForUpdate and snapshot is
    // initialized there. Snapshot validation is skipped in that case.
    if (m_read_opts.snapshot == nullptr || do_validate) {
      s = m_rocksdb_tx->GetForUpdate(
          m_read_opts, column_family, key, value, exclusive,
          m_read_opts.snapshot ? do_validate : false);
    } else {
      // If snapshot is set, and if skipping validation,
      // call GetForUpdate without validation and set back old snapshot
      auto saved_snapshot = m_read_opts.snapshot;
      m_read_opts.snapshot = nullptr;
      s = m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                     exclusive, false);
      m_read_opts.snapshot = saved_snapshot;
    }
    // row_lock_count is to track per row instead of per key
    if (key_descr.is_primary_key()) incr_row_lock_count();
    return s;
  }

  rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *const column_family) override {
    global_stats.queries[QUERIES_RANGE].inc();
    return m_rocksdb_tx->GetIterator(options, column_family);
  }

  const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }

  bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }

  // Begin (or re-begin) a transaction, pulling per-session options from the
  // THD and recycling m_rocksdb_reuse_tx when one is available.
  void start_tx() override {
    rocksdb::TransactionOptions tx_opts;
    rocksdb::WriteOptions write_opts;
    tx_opts.set_snapshot = false;
    tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
    tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
    tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
    // If this variable is set, this will write commit time write batch
    // information on recovery or memtable flush.
    tx_opts.use_only_the_last_commit_time_batch_for_recovery =
        THDVAR(m_thd, commit_time_batch_for_recovery);
    tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);
    tx_opts.write_batch_flush_threshold =
        THDVAR(m_thd, write_batch_flush_threshold);

    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);

    /*
      If m_rocksdb_reuse_tx is null this will create a new transaction object.
      Otherwise it will reuse the existing one.
    */
    m_rocksdb_tx =
        rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
    m_rocksdb_reuse_tx = nullptr;

    m_read_opts = rocksdb::ReadOptions();

    set_initial_savepoint();

    m_ddl_transaction = false;
  }

  // Name the transaction after its XID (used for two-phase commit); no-op
  // if it already carries the expected name.
  void set_name() override {
    XID xid;
    thd_get_xid(m_thd, reinterpret_cast<MYSQL_XID *>(&xid));
    auto name = m_rocksdb_tx->GetName();
    if (!name.empty()) {
      assert(name == rdb_xid_to_string(xid));
      return;
    }
    rocksdb::Status s = m_rocksdb_tx->SetName(rdb_xid_to_string(xid));
    assert(s.ok());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
    }
  }

  /* Implementations of do_*savepoint based on rocksdb::Transaction savepoints
   */
  void do_set_savepoint() override { m_rocksdb_tx->SetSavePoint(); }
  rocksdb::Status do_pop_savepoint() override {
    return m_rocksdb_tx->PopSavePoint();
  }

  void do_rollback_to_savepoint() override {
    m_rocksdb_tx->RollbackToSavePoint();
  }

  /*
    Start a statement inside a multi-statement transaction.

    @todo: are we sure this is called once (and not several times) per
    statement start?

    For hooking to start of statement that is its own transaction, see
    ha_rocksdb::external_lock().
  */
  void start_stmt() override {
    // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
    acquire_snapshot(false);
  }

  /*
    This must be called when last statement is rolled back, but the transaction
    continues
  */
  void rollback_stmt() override {
    /* TODO: here we must release the locks taken since the start_stmt() call */
    if (m_rocksdb_tx) {
      const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
      rollback_to_stmt_savepoint();

      // The statement rollback may have changed which snapshot the
      // transaction holds; keep m_read_opts and the timestamp in sync.
      const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
      if (org_snapshot != cur_snapshot) {
        if (org_snapshot != nullptr) m_snapshot_timestamp = 0;

        m_read_opts.snapshot = cur_snapshot;
        if (cur_snapshot != nullptr) {
          rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
        } else {
          m_is_delayed_snapshot = true;
        }
      }
    }
  }

  explicit Rdb_transaction_impl(THD *const thd)
      : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
    // Create a notifier that can be called when a snapshot gets generated.
    m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
  }

  virtual ~Rdb_transaction_impl() override {
    rollback();

    // Theoretically the notifier could outlive the Rdb_transaction_impl
    // (because of the shared_ptr), so let it know it can't reference
    // the transaction anymore.
    m_notifier->detach();

    // Free any transaction memory that is still hanging around.
    delete m_rocksdb_reuse_tx;
    assert(m_rocksdb_tx == nullptr);
  }
};
3722
3723 /* This is a rocksdb write batch. This class doesn't hold or wait on any
3724 transaction locks (skips rocksdb transaction API) thus giving better
3725 performance.
3726
3727 Currently this is only used for replication threads which are guaranteed
   to be non-conflicting. Any further usage of this class should be
   thought through carefully.
3730 */
class Rdb_writebatch_impl : public Rdb_transaction {
  // Indexed write batch holding all of this "transaction"'s changes.
  rocksdb::WriteBatchWithIndex *m_batch;
  rocksdb::WriteOptions write_opts;
  // Called after commit/rollback.
  void reset() {
    m_batch->Clear();
    m_read_opts = rocksdb::ReadOptions();
    m_ddl_transaction = false;
  }

 private:
  // Write batches don't participate in two-phase commit: nothing to prepare.
  bool prepare() override { return true; }

  // Apply the batch with a plain DB::Write() that skips concurrency control.
  // Returns true on error; always resets the batch state afterwards.
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;
    rocksdb::TransactionDBWriteOptimizations optimize;
    optimize.skip_concurrency_control = true;

    s = merge_auto_incr_map(m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();

    s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }
    on_commit();
  error:
    // Safe after a successful commit: on_rollback() only clears the
    // (already emptied) modified-table set.
    on_rollback();
    reset();

    m_write_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

  /* Implementations of do_*savepoint based on rocksdb::WriteBatch savepoints */
  void do_set_savepoint() override { m_batch->SetSavePoint(); }
  rocksdb::Status do_pop_savepoint() override {
    return m_batch->PopSavePoint();
  }

  void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); }

 public:
  bool is_writebatch_trx() const override { return true; }

  void set_lock_timeout(int timeout_sec_arg) override {
    // Nothing to do here.
  }

  void set_sync(bool sync) override { write_opts.sync = sync; }

  void release_lock(const Rdb_key_def &key_descr,
                    const std::string &rowkey) override {
    // Nothing to do here since we don't hold any row locks.
  }

  // Discard all buffered writes and release the snapshot.
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_row_lock_count = 0;
    release_snapshot();

    reset();
    set_tx_read_only(false);
    m_rollback_only = false;
  }

  // NOTE(review): acquire_now is ignored -- a DB-level snapshot is always
  // taken immediately here.  Confirm delayed acquisition is not needed for
  // write-batch "transactions".
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) snapshot_created(rdb->GetSnapshot());
  }

  void release_snapshot() override {
    if (m_read_opts.snapshot != nullptr) {
      rdb->ReleaseSnapshot(m_read_opts.snapshot);
      m_read_opts.snapshot = nullptr;
    }
  }

  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key, const rocksdb::Slice &value,
                      const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Put(column_family, key, value);
    // Note Put/Delete in write batch doesn't return any error code. We simply
    // return OK here.
    return rocksdb::Status::OK();
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key,
                             const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Delete(column_family, key);
    return rocksdb::Status::OK();
  }

  rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) override {
    ++m_write_count;
    m_batch->SingleDelete(column_family, key);
    return rocksdb::Status::OK();
  }

  bool has_modifications() const override {
    return m_batch->GetWriteBatch()->Count() > 0;
  }

  rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }

  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_batch;
  }

  // Point lookup that merges the batch's own pending changes with the DB.
  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    value->Reset();
    return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
                                      value);
  }

  // No locking in this implementation: degenerates to a plain get().
  rocksdb::Status get_for_update(const Rdb_key_def &key_descr,
                                 const rocksdb::Slice &key,
                                 rocksdb::PinnableSlice *const value,
                                 bool /* exclusive */,
                                 const bool /* do_validate */) override {
    rocksdb::ColumnFamilyHandle *const column_family = key_descr.get_cf();
    if (value == nullptr) {
      rocksdb::PinnableSlice pin_val;
      rocksdb::Status s = get(column_family, key, &pin_val);
      pin_val.Reset();
      return s;
    }

    return get(column_family, key, value);
  }

  // NOTE(review): the base iterator is created without passing
  // column_family to rdb->NewIterator() -- confirm this is intentional.
  rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *const column_family) override {
    const auto it = rdb->NewIterator(options);
    return m_batch->NewIteratorWithBase(it);
  }

  bool is_tx_started() const override { return (m_batch != nullptr); }

  // Clear the batch and refresh the per-session write options.
  void start_tx() override {
    reset();
    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);

    set_initial_savepoint();
  }

  // Write batches are not named XA transactions.
  void set_name() override {}

  void start_stmt() override {}

  void rollback_stmt() override {
    if (m_batch) rollback_to_stmt_savepoint();
  }

  explicit Rdb_writebatch_impl(THD *const thd)
      : Rdb_transaction(thd), m_batch(nullptr) {
    m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
                                               true);
  }

  virtual ~Rdb_writebatch_impl() override {
    rollback();
    delete m_batch;
  }
};
3919
SnapshotCreated(const rocksdb::Snapshot * const snapshot)3920 void Rdb_snapshot_notifier::SnapshotCreated(
3921 const rocksdb::Snapshot *const snapshot) {
3922 if (m_owning_tx != nullptr) {
3923 m_owning_tx->snapshot_created(snapshot);
3924 }
3925 }
3926
// Global registry of live transactions, guarded by s_tx_list_mutex.
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;

// Return a reference to this connection's transaction slot stored in the
// THD's per-handlerton data (the slot itself may contain nullptr).
static Rdb_transaction *&get_tx_from_thd(THD *const thd) {
  return *reinterpret_cast<Rdb_transaction **>(
      my_core::thd_ha_data(thd, rocksdb_hton));
}
3934
// RAII guard that starts perf-context recording on construction and
// records/stops it on destruction, either through a caller-supplied
// Rdb_io_perf or through a transaction's own tracking.
class Rdb_perf_context_guard {
  Rdb_io_perf m_io_perf;       // locally owned counters (transaction mode)
  Rdb_io_perf *m_io_perf_ptr;  // caller-supplied counters, may be nullptr
  Rdb_transaction *m_tx;       // transaction being tracked, may be nullptr
  uint m_level;                // recording level passed to start/end

 public:
  Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
  Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;

  // Record into an explicit Rdb_io_perf structure.
  explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
      : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
    m_io_perf_ptr->start(m_level);
  }

  // Record through a transaction's perf tracking.
  explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
      : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
    /*
      if perf_context information is already being recorded, this becomes a
      no-op
    */
    if (tx != nullptr) {
      tx->io_perf_start(&m_io_perf);
    }
  }

  ~Rdb_perf_context_guard() {
    if (m_tx != nullptr) {
      m_tx->io_perf_end_and_record();
    } else if (m_io_perf_ptr != nullptr) {
      m_io_perf_ptr->end_and_record(m_level);
    }
  }
};
3969
3970 /*
3971 TODO: maybe, call this in external_lock() and store in ha_rocksdb..
3972 */
3973
// Return the connection's transaction object, creating and starting one if
// needed.  Per-session parameters are refreshed on every call.
static Rdb_transaction *get_or_create_tx(THD *const thd) {
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  // TODO: this is called too many times.. O(#rows)
  if (tx == nullptr) {
    // Replication applier threads (and sessions opting in via
    // master_skip_tx_api) get the lock-free write-batch implementation;
    // everyone else gets the full transaction API.
    if ((rpl_skip_tx_api_var && thd->rli_slave) ||
        (THDVAR(thd, master_skip_tx_api) && !thd->rli_slave)) {
      tx = new Rdb_writebatch_impl(thd);
    } else {
      tx = new Rdb_transaction_impl(thd);
    }
    tx->set_params(THDVAR(thd, lock_wait_timeout), rocksdb_max_row_locks);
    tx->start_tx();
  } else {
    // Existing object: refresh parameters and restart only if the previous
    // transaction has already finished.
    tx->set_params(THDVAR(thd, lock_wait_timeout), rocksdb_max_row_locks);
    if (!tx->is_tx_started()) {
      tx->start_tx();
    }
  }

  return tx;
}
3995
rocksdb_close_connection(handlerton * const hton,THD * const thd)3996 static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
3997 Rdb_transaction *&tx = get_tx_from_thd(thd);
3998 if (tx != nullptr) {
3999 bool is_critical_error;
4000 int rc = tx->finish_bulk_load(&is_critical_error, false);
4001 if (rc != 0 && is_critical_error) {
4002 // NO_LINT_DEBUG
4003 sql_print_error(
4004 "RocksDB: Error %d finalizing last SST file while "
4005 "disconnecting",
4006 rc);
4007 }
4008
4009 delete tx;
4010 tx = nullptr;
4011 }
4012 return HA_EXIT_SUCCESS;
4013 }
4014
4015 /**
4016 Called by hton->flush_logs after MySQL group commit prepares a set of
4017 transactions.
4018 */
rocksdb_flush_wal(handlerton * const hton MY_ATTRIBUTE ((__unused__)),bool binlog_group_flush)4019 static bool rocksdb_flush_wal(handlerton *const hton MY_ATTRIBUTE((__unused__)),
4020 bool binlog_group_flush) {
4021 DBUG_ENTER("rocksdb_flush_wal");
4022 assert(rdb != nullptr);
4023
4024 /**
4025 If !binlog_group_flush, we got invoked by FLUSH LOGS or similar.
4026 Else, we got invoked by binlog group commit during flush stage.
4027 */
4028
4029 if (binlog_group_flush &&
4030 rocksdb_flush_log_at_trx_commit == FLUSH_LOG_NEVER) {
4031 /**
4032 rocksdb_flush_log_at_trx_commit=0
4033 (write and sync based on timer in Rdb_background_thread).
4034 Do not flush the redo log during binlog group commit.
4035 */
4036 DBUG_RETURN(false);
4037 }
4038
4039 if (!binlog_group_flush || !rocksdb_db_options->allow_mmap_writes ||
4040 rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
4041 /**
4042 Sync the WAL if we are in FLUSH LOGS, or if
4043 rocksdb_flush_log_at_trx_commit=1
4044 (write and sync at each commit).
4045 */
4046 rocksdb_wal_group_syncs++;
4047 const rocksdb::Status s =
4048 rdb->FlushWAL(rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
4049
4050 if (!s.ok()) {
4051 rdb_log_status_error(s);
4052 DBUG_RETURN(true);
4053 }
4054 }
4055
4056 DBUG_RETURN(false);
4057 }
4058
4059 /**
4060 For a slave, prepare() updates the slave_gtid_info table which tracks the
4061 replication progress.
4062 */
rocksdb_prepare(handlerton * const hton,THD * const thd,bool prepare_tx)4063 static int rocksdb_prepare(handlerton *const hton, THD *const thd,
4064 bool prepare_tx) {
4065 Rdb_transaction *&tx = get_tx_from_thd(thd);
4066 if (!tx->is_tx_started()) {
4067 // nothing to prepare
4068 return HA_EXIT_SUCCESS;
4069 }
4070 if (!tx->can_prepare()) {
4071 return HA_EXIT_FAILURE;
4072 }
4073 if (prepare_tx ||
4074 (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
4075 if (thd->durability_property == HA_IGNORE_DURABILITY) {
4076 tx->set_sync(false);
4077 }
4078 if (rocksdb_write_policy != rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED) {
4079 tx->set_name();
4080 }
4081 if (!tx->prepare()) {
4082 return HA_EXIT_FAILURE;
4083 }
4084
4085 DEBUG_SYNC(thd, "rocksdb.prepared");
4086 } else
4087 tx->make_stmt_savepoint_permanent();
4088
4089 return HA_EXIT_SUCCESS;
4090 }
4091
4092 /**
4093 do nothing for prepare/commit by xid
4094 this is needed to avoid crashes in XA scenarios
4095 */
rocksdb_commit_by_xid(handlerton * const hton,XID * const xid)4096 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
4097 DBUG_ENTER_FUNC();
4098
4099 assert(hton != nullptr);
4100 assert(xid != nullptr);
4101 assert(commit_latency_stats != nullptr);
4102
4103 auto clock = rocksdb::Env::Default()->GetSystemClock().get();
4104 rocksdb::StopWatchNano timer(clock, true);
4105
4106 const auto name = rdb_xid_to_string(*xid);
4107 assert(!name.empty());
4108
4109 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4110
4111 if (trx == nullptr) {
4112 DBUG_RETURN(HA_EXIT_FAILURE);
4113 }
4114
4115 const rocksdb::Status s = trx->Commit();
4116
4117 if (!s.ok()) {
4118 rdb_log_status_error(s);
4119 DBUG_RETURN(HA_EXIT_FAILURE);
4120 }
4121
4122 delete trx;
4123
4124 // `Add()` is implemented in a thread-safe manner.
4125 commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
4126
4127 DBUG_RETURN(HA_EXIT_SUCCESS);
4128 }
4129
rocksdb_rollback_by_xid(handlerton * const hton MY_ATTRIBUTE ((__unused__)),XID * const xid)4130 static int rocksdb_rollback_by_xid(
4131 handlerton *const hton MY_ATTRIBUTE((__unused__)), XID *const xid) {
4132 DBUG_ENTER_FUNC();
4133
4134 assert(hton != nullptr);
4135 assert(xid != nullptr);
4136 assert(rdb != nullptr);
4137
4138 const auto name = rdb_xid_to_string(*xid);
4139
4140 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4141
4142 if (trx == nullptr) {
4143 DBUG_RETURN(HA_EXIT_FAILURE);
4144 }
4145
4146 const rocksdb::Status s = trx->Rollback();
4147
4148 if (!s.ok()) {
4149 rdb_log_status_error(s);
4150 DBUG_RETURN(HA_EXIT_FAILURE);
4151 }
4152
4153 delete trx;
4154
4155 DBUG_RETURN(HA_EXIT_SUCCESS);
4156 }
4157
4158 /**
4159 Rebuilds an XID from a serialized version stored in a string.
4160 */
static void rdb_xid_from_string(const std::string &src, XID *const dst) {
  assert(dst != nullptr);
  uint offset = 0;
  // Layout of the serialized XID: an 8-byte format id (network byte order),
  // then one byte each for the gtrid and bqual lengths, then the payload.
  uint64 raw_fid8 =
      rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
  // Reinterpret the raw 8 bytes as a signed value, preserving the bit
  // pattern of a negative format id.
  const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
  dst->set_format_id(signed_fid8);
  offset += RDB_FORMATID_SZ;
  dst->set_gtrid_length(src.at(offset));
  offset += RDB_GTRID_SZ;
  dst->set_bqual_length(src.at(offset));
  offset += RDB_BQUAL_SZ;

  assert(dst->get_gtrid_length() >= 0);
  assert(dst->get_gtrid_length() <= MAXGTRIDSIZE);
  assert(dst->get_bqual_length() >= 0);
  assert(dst->get_bqual_length() <= MAXBQUALSIZE);

  // The payload (gtrid immediately followed by bqual) starts right after the
  // fixed-size header.
  const std::string &tmp_data = src.substr(
      RDB_XIDHDR_LEN, (dst->get_gtrid_length()) + (dst->get_bqual_length()));
  dst->set_data(tmp_data.data(), tmp_data.length());
}
4183
4184 /**
4185 Reading last committed binary log info from RocksDB system row.
4186 The info is needed for crash safe slave/master to work.
4187 */
rocksdb_recover(handlerton * const hton,XID * const xid_list,uint len)4188 static int rocksdb_recover(handlerton *const hton, XID *const xid_list,
4189 uint len) {
4190 if (len == 0 || xid_list == nullptr) {
4191 return HA_EXIT_SUCCESS;
4192 }
4193
4194 std::vector<rocksdb::Transaction *> trans_list;
4195 rdb->GetAllPreparedTransactions(&trans_list);
4196
4197 uint count = 0;
4198 for (auto &trans : trans_list) {
4199 if (count >= len) {
4200 break;
4201 }
4202 auto name = trans->GetName();
4203 rdb_xid_from_string(name, &xid_list[count]);
4204 count++;
4205 }
4206 return count;
4207 }
4208
rocksdb_commit(handlerton * const hton,THD * const thd,bool commit_tx)4209 static int rocksdb_commit(handlerton *const hton, THD *const thd,
4210 bool commit_tx) {
4211 DBUG_ENTER_FUNC();
4212
4213 assert(hton != nullptr);
4214 assert(thd != nullptr);
4215 assert(commit_latency_stats != nullptr);
4216
4217 auto clock = rocksdb::Env::Default()->GetSystemClock().get();
4218 rocksdb::StopWatchNano timer(clock, true);
4219
4220 /* note: h->external_lock(F_UNLCK) is called after this function is called) */
4221 Rdb_transaction *&tx = get_tx_from_thd(thd);
4222
4223 /* this will trigger saving of perf_context information */
4224 Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4225
4226 if (tx != nullptr) {
4227 if (commit_tx || (!my_core::thd_test_options(
4228 thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
4229 /*
4230 We get here
4231 - For a COMMIT statement that finishes a multi-statement transaction
4232 - For a statement that has its own transaction
4233 */
4234 if (tx->commit()) {
4235 DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
4236 }
4237 } else {
4238 /*
4239 We get here when committing a statement within a transaction.
4240 */
4241 tx->set_tx_failed(false);
4242 tx->make_stmt_savepoint_permanent();
4243 }
4244
4245 if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4246 // For READ_COMMITTED, we release any existing snapshot so that we will
4247 // see any changes that occurred since the last statement.
4248 tx->release_snapshot();
4249 }
4250 }
4251
4252 // `Add()` is implemented in a thread-safe manner.
4253 commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
4254
4255 DBUG_RETURN(HA_EXIT_SUCCESS);
4256 }
4257
rocksdb_rollback(handlerton * const hton,THD * const thd,bool rollback_tx)4258 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
4259 bool rollback_tx) {
4260 Rdb_transaction *&tx = get_tx_from_thd(thd);
4261 Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4262
4263 if (tx != nullptr) {
4264 if (rollback_tx) {
4265 /*
4266 We get here, when
4267 - ROLLBACK statement is issued.
4268
4269 Discard the changes made by the transaction
4270 */
4271 tx->rollback();
4272 } else {
4273 /*
4274 We get here when
4275 - a statement with AUTOCOMMIT=1 is being rolled back (because of some
4276 error)
4277 - a statement inside a transaction is rolled back
4278 */
4279
4280 tx->rollback_stmt();
4281 tx->set_tx_failed(true);
4282 }
4283
4284 if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4285 // For READ_COMMITTED, we release any existing snapshot so that we will
4286 // see any changes that occurred since the last statement.
4287 tx->release_snapshot();
4288 }
4289 }
4290 return HA_EXIT_SUCCESS;
4291 }
4292
print_stats(THD * const thd,std::string const & type,std::string const & name,std::string const & status,stat_print_fn * stat_print)4293 static bool print_stats(THD *const thd, std::string const &type,
4294 std::string const &name, std::string const &status,
4295 stat_print_fn *stat_print) {
4296 return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
4297 status.c_str(), status.size());
4298 }
4299
/*
  printf-style formatting into a std::string.

  Does a first vsnprintf pass to size the output, then formats into a stack
  buffer for short results or a heap buffer for longer ones. Returns
  "<format error>" if vsnprintf reports a formatting failure.
*/
static std::string format_string(const char *const format, ...) {
  assert(format != nullptr);

  va_list args;
  va_list args_copy;

  va_start(args, format);
  va_copy(args_copy, args);

  // First pass: compute the required length (excluding the terminator).
  const int len = vsnprintf(nullptr, 0, format, args);
  va_end(args);

  std::string res;
  if (len < 0) {
    res = "<format error>";
  } else if (len > 0) {
    char static_buff[256];
    char *buff = static_buff;
    std::unique_ptr<char[]> dynamic_buff;

    const size_t needed = static_cast<size_t>(len) + 1;  // +1 for '\0'
    if (needed > sizeof(static_buff)) {
      // Too long for the stack buffer; allocate one of the right size.
      dynamic_buff.reset(new char[needed]);
      buff = dynamic_buff.get();
    }

    // Second pass: format into the (now large enough) buffer, then copy the
    // result into a std::string. Building directly into a pre-sized string
    // would also work but feels like a hack; this path is not hot.
    (void)vsnprintf(buff, needed, format, args_copy);
    res.assign(buff);
  }
  // len == 0: res stays the empty string.

  va_end(args_copy);

  return res;
}
4348
/*
  Builder for the "SNAPSHOTS" section of SHOW ENGINE ROCKSDB STATUS.
  Implements Rdb_tx_list_walker so walk_tx_list() can feed it every live
  transaction; also exposes recent deadlock information for
  information_schema.rocksdb_deadlock.
*/
class Rdb_snapshot_status : public Rdb_tx_list_walker {
 private:
  std::string m_data;  // accumulated report text; seeded with get_header()

  // Format the current local time as "YYYY-MM-DD HH:MM:SS".
  static std::string current_timestamp(void) {
    static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
    time_t currtime;
    struct tm currtm;

    time(&currtime);

    localtime_r(&currtime, &currtm);

    return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
                         currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
                         currtm.tm_sec);
  }

  // Banner emitted before any per-transaction rows.
  static std::string get_header(void) {
    return "\n============================================================\n" +
           current_timestamp() +
           " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
           "============================================================\n"
           "---------\n"
           "SNAPSHOTS\n"
           "---------\n"
           "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
  }

  // Banner appended by getResult() after all rows.
  static std::string get_footer(void) {
    return "-----------------------------------------\n"
           "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
           "=========================================\n";
  }

  // Build one per-transaction entry for a deadlock report. Table, index and
  // column-family names are resolved best-effort: if the dictionary no
  // longer knows the id, a "NOT FOUND; ...: <id>" placeholder is used.
  static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
      const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
    Rdb_deadlock_info::Rdb_dl_trx_info txn_data;

    txn_data.trx_id = txn.m_txn_id;

    txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id);
    if (txn_data.table_name.empty()) {
      txn_data.table_name =
          "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
    }

    const auto &kd = ddl_manager.safe_find(gl_index_id);
    txn_data.index_name =
        (kd) ? kd->get_name()
             : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);

    std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
        cf_manager.get_cf(txn.m_cf_id);

    // Retrieve CF name from CF handle object, and it is safe if the CF is
    // removed from cf_manager at this point.
    txn_data.cf_name = (cfh)
                           ? cfh->GetName()
                           : "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id);

    txn_data.waiting_key =
        rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());

    txn_data.exclusive_lock = txn.m_exclusive;

    return txn_data;
  }

  // Convert one RocksDB deadlock path (a cycle of waiting transactions) into
  // an Rdb_deadlock_info. The index id of each entry is decoded from the
  // first 4 bytes of the waited-on key.
  static Rdb_deadlock_info get_dl_path_trx_info(
      const rocksdb::DeadlockPath &path_entry) {
    Rdb_deadlock_info deadlock_info;
    deadlock_info.path.reserve(path_entry.path.size());

    for (const auto &txn : path_entry.path) {
      const GL_INDEX_ID gl_index_id = {
          txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
                           txn.m_waiting_key.c_str()))};
      deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
    }
    assert_IFF(path_entry.limit_exceeded, path_entry.path.empty());
    /* print the first txn in the path to display the full deadlock cycle */
    if (!path_entry.path.empty() && !path_entry.limit_exceeded) {
      const auto &deadlocking_txn = *(path_entry.path.end() - 1);
      deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id;
      deadlock_info.deadlock_time = path_entry.deadlock_time;
    }
    return deadlock_info;
  }

 public:
  Rdb_snapshot_status() : m_data(get_header()) {}

  // Full report: header + one section per transaction seen + footer.
  std::string getResult() { return m_data + get_footer(); }

  /* Implement Rdb_transaction interface */
  /* Create one row in the snapshot status table */
  void process_tran(const Rdb_transaction *const tx) override {
    assert(tx != nullptr);

    /* Calculate the duration the snapshot has existed */
    int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
    if (snapshot_timestamp != 0) {
      int64_t curr_time;
      rdb->GetEnv()->GetCurrentTime(&curr_time);

      THD *thd = tx->get_thd();
      char buffer[1024];
      // Session/user details of the owning connection.
      thd_security_context(thd, buffer, sizeof buffer, 0);
      m_data += format_string(
          "---SNAPSHOT, ACTIVE %lld sec\n"
          "%s\n"
          "lock count %llu, write count %llu\n",
          curr_time - snapshot_timestamp, buffer, tx->get_row_lock_count(),
          tx->get_write_count());
    }
  }

  // Collect all recorded deadlock paths, skipping those truncated because
  // the path length limit was exceeded.
  std::vector<Rdb_deadlock_info> get_deadlock_info() {
    std::vector<Rdb_deadlock_info> deadlock_info;
    const auto &dlock_buffer = rdb->GetDeadlockInfoBuffer();
    for (const auto &path_entry : dlock_buffer) {
      if (!path_entry.limit_exceeded) {
        deadlock_info.push_back(get_dl_path_trx_info(path_entry));
      }
    }
    return deadlock_info;
  }
};
4478
4479 /**
4480 * @brief
4481 * walks through all non-replication transactions and copies
4482 * out relevant information for information_schema.rocksdb_trx
4483 */
4484 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
4485 private:
4486 std::vector<Rdb_trx_info> *m_trx_info;
4487
4488 public:
Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> * const trx_info)4489 explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
4490 : m_trx_info(trx_info) {}
4491
process_tran(const Rdb_transaction * const tx)4492 void process_tran(const Rdb_transaction *const tx) override {
4493 static const std::map<int, std::string> state_map = {
4494 {rocksdb::Transaction::STARTED, "STARTED"},
4495 {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
4496 {rocksdb::Transaction::PREPARED, "PREPARED"},
4497 {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
4498 {rocksdb::Transaction::COMMITED, "COMMITED"},
4499 {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
4500 {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
4501 };
4502 static const size_t trx_query_max_len = 1024; // length stolen from InnoDB
4503
4504 assert(tx != nullptr);
4505
4506 THD *const thd = tx->get_thd();
4507 const my_thread_id thread_id = thd->thread_id();
4508
4509 if (tx->is_writebatch_trx()) {
4510 const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
4511 assert(wb_impl);
4512 m_trx_info->push_back(
4513 {"", /* name */
4514 0, /* trx_id */
4515 wb_impl->get_write_count(), 0, /* lock_count */
4516 0, /* timeout_sec */
4517 "", /* state */
4518 "", /* waiting_key */
4519 0, /* waiting_cf_id */
4520 1, /*is_replication */
4521 1, /* skip_trx_api */
4522 wb_impl->is_tx_read_only(), 0, /* deadlock detection */
4523 wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
4524 } else {
4525 const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
4526 assert(tx_impl);
4527 const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
4528
4529 if (rdb_trx == nullptr) {
4530 return;
4531 }
4532
4533 std::string query_str;
4534 query_str.reserve(trx_query_max_len + 1);
4535 size_t query_len = thd_query_safe(thd, &query_str[0], trx_query_max_len);
4536 query_str.resize(query_len);
4537
4538 const auto state_it = state_map.find(rdb_trx->GetState());
4539 assert(state_it != state_map.end());
4540 const int is_replication = (thd->rli_slave != nullptr);
4541 uint32_t waiting_cf_id;
4542 std::string waiting_key;
4543 rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key),
4544
4545 m_trx_info->push_back(
4546 {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
4547 tx_impl->get_row_lock_count(), tx_impl->get_timeout_sec(),
4548 state_it->second, waiting_key, waiting_cf_id, is_replication,
4549 0, /* skip_trx_api */
4550 tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
4551 tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
4552 }
4553 }
4554 };
4555
4556 /*
4557 returns a vector of info for all non-replication threads
4558 for use by information_schema.rocksdb_trx
4559 */
rdb_get_all_trx_info()4560 std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
4561 std::vector<Rdb_trx_info> trx_info;
4562 Rdb_trx_info_aggregator trx_info_agg(&trx_info);
4563 Rdb_transaction::walk_tx_list(&trx_info_agg);
4564 return trx_info;
4565 }
4566
4567 /*
4568 returns a vector of info of recent deadlocks
4569 for use by information_schema.rocksdb_deadlock
4570 */
rdb_get_deadlock_info()4571 std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() {
4572 Rdb_snapshot_status showStatus;
4573 Rdb_transaction::walk_tx_list(&showStatus);
4574 return showStatus.get_deadlock_info();
4575 }
4576
4577 /*
4578 This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc.
4579
4580 For now, produce info about live files (which gives an imprecise idea about
4581 what column families are there).
4582 */
rocksdb_show_status(handlerton * const hton,THD * const thd,stat_print_fn * const stat_print,enum ha_stat_type stat_type)4583 static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
4584 stat_print_fn *const stat_print,
4585 enum ha_stat_type stat_type) {
4586 assert(hton != nullptr);
4587 assert(thd != nullptr);
4588 assert(stat_print != nullptr);
4589
4590 bool res = false;
4591 char buf[100] = {'\0'};
4592
4593 if (stat_type == HA_ENGINE_STATUS) {
4594 assert(rdb != nullptr);
4595
4596 std::string str;
4597
4598 /* Global DB Statistics */
4599 if (rocksdb_stats) {
4600 str = rocksdb_stats->ToString();
4601
4602 // Use the same format as internal RocksDB statistics entries to make
4603 // sure that output will look unified.
4604 assert(commit_latency_stats != nullptr);
4605
4606 snprintf(buf, sizeof(buf),
4607 "rocksdb.commit_latency statistics "
4608 "Percentiles :=> 50 : %.2f 95 : %.2f "
4609 "99 : %.2f 100 : %.2f\n",
4610 commit_latency_stats->Percentile(50),
4611 commit_latency_stats->Percentile(95),
4612 commit_latency_stats->Percentile(99),
4613 commit_latency_stats->Percentile(100));
4614 str.append(buf);
4615
4616 uint64_t v = 0;
4617
4618 // Retrieve additional stalling related numbers from RocksDB and append
4619 // them to the buffer meant for displaying detailed statistics. The intent
4620 // here is to avoid adding another row to the query output because of
4621 // just two numbers.
4622 //
4623 // NB! We're replacing hyphens with underscores in output to better match
4624 // the existing naming convention.
4625 if (rdb->GetIntProperty("rocksdb.is-write-stopped", &v)) {
4626 snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %lu\n", v);
4627 str.append(buf);
4628 }
4629
4630 if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) {
4631 snprintf(buf, sizeof(buf),
4632 "rocksdb.actual_delayed_write_rate "
4633 "COUNT : %lu\n",
4634 v);
4635 str.append(buf);
4636 }
4637
4638 res |= print_stats(thd, "STATISTICS", "rocksdb", str, stat_print);
4639 }
4640
4641 /* Per DB stats */
4642 if (rdb->GetProperty("rocksdb.dbstats", &str)) {
4643 res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
4644 }
4645
4646 /* Per column family stats */
4647 for (const auto &cf_name : cf_manager.get_cf_names()) {
4648 std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
4649 cf_manager.get_cf(cf_name);
4650 if (!cfh) {
4651 continue;
4652 }
4653
4654 // Retrieve information from CF handle object.
4655 // Even if the CF is removed from CF_manager, the handle object
4656 // is valid.
4657 if (!rdb->GetProperty(cfh.get(), "rocksdb.cfstats", &str)) {
4658 continue;
4659 }
4660
4661 res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
4662 }
4663
4664 /* Memory Statistics */
4665 std::vector<rocksdb::DB *> dbs;
4666 std::unordered_set<const rocksdb::Cache *> cache_set;
4667 size_t internal_cache_count = 0;
4668 size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
4669
4670 dbs.push_back(rdb);
4671 cache_set.insert(rocksdb_tbl_options->block_cache.get());
4672
4673 for (const auto &cf_handle : cf_manager.get_all_cf()) {
4674 // It is safe if the CF handle is removed from cf_manager
4675 // at this point.
4676 rocksdb::ColumnFamilyDescriptor cf_desc;
4677 cf_handle->GetDescriptor(&cf_desc);
4678 auto *const table_factory = cf_desc.options.table_factory.get();
4679
4680 if (table_factory != nullptr) {
4681 std::string tf_name = table_factory->Name();
4682
4683 if (tf_name.find("BlockBasedTable") != std::string::npos) {
4684 const auto bbt_opt =
4685 table_factory->GetOptions<rocksdb::BlockBasedTableOptions>();
4686
4687 if (bbt_opt != nullptr) {
4688 if (bbt_opt->block_cache.get() != nullptr) {
4689 cache_set.insert(bbt_opt->block_cache.get());
4690 } else {
4691 internal_cache_count++;
4692 }
4693 cache_set.insert(bbt_opt->block_cache_compressed.get());
4694 }
4695 }
4696 }
4697 }
4698
4699 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
4700 str.clear();
4701 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
4702 &temp_usage_by_type);
4703
4704 snprintf(buf, sizeof(buf), "\nMemTable Total: %lu",
4705 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
4706 str.append(buf);
4707 snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %lu",
4708 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
4709 str.append(buf);
4710 snprintf(buf, sizeof(buf), "\nTable Readers Total: %lu",
4711 temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
4712 str.append(buf);
4713 snprintf(buf, sizeof(buf), "\nCache Total: %lu",
4714 temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
4715 str.append(buf);
4716 snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %lu",
4717 internal_cache_count * kDefaultInternalCacheSize);
4718 str.append(buf);
4719 snprintf(buf, sizeof(buf), "\nCache Capacity: %lu",
4720 (uint64_t)rocksdb_block_cache_size);
4721 str.append(buf);
4722 res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print);
4723
4724 /* Show the background thread status */
4725 std::vector<rocksdb::ThreadStatus> thread_list;
4726 rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list);
4727
4728 if (!s.ok()) {
4729 // NO_LINT_DEBUG
4730 sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n",
4731 s.ToString().c_str());
4732 res |= true;
4733 } else {
4734 /* For each background thread retrieved, print out its information */
4735 for (auto &it : thread_list) {
4736 /* Only look at background threads. Ignore user threads, if any. */
4737 if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) {
4738 continue;
4739 }
4740
4741 str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) +
4742 "\ncf_name: " + it.cf_name +
4743 "\noperation_type: " + it.GetOperationName(it.operation_type) +
4744 "\noperation_stage: " +
4745 it.GetOperationStageName(it.operation_stage) +
4746 "\nelapsed_time_ms: " + it.MicrosToString(it.op_elapsed_micros);
4747
4748 for (auto &it_props : it.InterpretOperationProperties(
4749 it.operation_type, it.op_properties)) {
4750 str += "\n" + it_props.first + ": " + std::to_string(it_props.second);
4751 }
4752
4753 str += "\nstate_type: " + it.GetStateName(it.state_type);
4754
4755 res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), str,
4756 stat_print);
4757 }
4758 }
4759 }
4760
4761 return res;
4762 }
4763
rocksdb_register_tx(handlerton * const hton,THD * const thd,Rdb_transaction * const tx)4764 static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
4765 Rdb_transaction *const tx) {
4766 assert(tx != nullptr);
4767
4768 trans_register_ha(thd, false, rocksdb_hton, nullptr);
4769 if (rocksdb_write_policy == rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED) {
4770 // Some internal operations will call trans_register_ha, but they do not
4771 // go through 2pc. In this case, the xid is set with query_id == 0, which
4772 // means that rocksdb will receive transactions with duplicate names.
4773 //
4774 // Skip setting name in these cases.
4775 if (thd->query_id != 0) {
4776 tx->set_name();
4777 }
4778 }
4779 if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
4780 tx->start_stmt();
4781 trans_register_ha(thd, true, rocksdb_hton, nullptr);
4782 }
4783 }
4784
4785 /*
4786 Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
4787
4788 - START TRANSACTION WITH CONSISTENT SNAPSHOT
4789 takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
4790 participate in transaction. When executing COMMIT, both InnoDB and
4791 RocksDB modifications are committed. Remember that XA is not supported yet,
4792 so mixing engines is not recommended anyway.
4793 */
rocksdb_start_tx_and_assign_read_view(handlerton * const hton,THD * const thd)4794 static int rocksdb_start_tx_and_assign_read_view(
4795 handlerton *const hton, /*!< in: RocksDB handlerton */
4796 THD *const thd) /*!< in: MySQL thread handle of the
4797 user for whom the transaction should
4798 be committed */
4799 {
4800 ulong const tx_isolation = my_core::thd_tx_isolation(thd);
4801
4802 Rdb_transaction *tx = get_or_create_tx(thd);
4803 Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4804
4805 assert(!tx->has_snapshot());
4806 tx->set_tx_read_only(true);
4807 rocksdb_register_tx(hton, thd, tx);
4808
4809 if (tx_isolation == ISO_REPEATABLE_READ) {
4810 tx->acquire_snapshot(true);
4811 } else {
4812 push_warning_printf(thd, Sql_condition::SL_WARNING, HA_ERR_UNSUPPORTED,
4813 "RocksDB: Only REPEATABLE READ isolation level is "
4814 "supported for START TRANSACTION WITH CONSISTENT "
4815 "SNAPSHOT in RocksDB Storage Engine. Snapshot has not "
4816 "been taken.");
4817 }
4818 return HA_EXIT_SUCCESS;
4819 }
4820
4821 /* Dummy SAVEPOINT support. This is needed for long running transactions
4822 * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
4823 * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
4824 * errors. This needs to be addressed in future versions (Issue#96).
4825 */
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
                             void *const savepoint) {
  // Intentionally a no-op (dummy SAVEPOINT support, see comment above);
  // always reports success.
  return HA_EXIT_SUCCESS;
}
4830
static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
                                         void *const savepoint) {
  // NOTE(review): assumes the server only invokes this when a transaction
  // exists for the connection (tx != nullptr) — confirm with callers.
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  return tx->rollback_to_savepoint(savepoint);
}
4836
static bool rocksdb_rollback_to_savepoint_can_release_mdl(
    handlerton *const hton, THD *const thd) {
  // Unconditionally tell the server that metadata locks may be released
  // after a rollback-to-savepoint.
  return true;
}
4841
/*
  Verify that the current RocksDB options are compatible with the options
  persisted in the data directory. A missing options file (fresh datadir)
  is treated as compatible.
*/
static rocksdb::Status check_rocksdb_options_compatibility(
    const char *const dbpath, const rocksdb::Options &main_opts,
    const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
  assert(rocksdb_datadir != nullptr);

  rocksdb::DBOptions loaded_db_opt;
  std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
  const rocksdb::Status load_status =
      LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt,
                        &loaded_cf_descs, rocksdb_ignore_unknown_options);

  // Starting from scratch: there are no saved options to compare against,
  // which is a valid state.
  if (load_status.IsNotFound()) {
    return rocksdb::Status::OK();
  }
  if (!load_status.ok()) {
    return load_status;
  }

  if (loaded_cf_descs.size() != cf_descr.size()) {
    return rocksdb::Status::NotSupported(
        "Mismatched size of column family "
        "descriptors.");
  }

  // User-defined functions and pointer-typed options cannot be restored from
  // the options file, so copy them over from the current configuration before
  // comparing (see RocksDB documentation for context).
  for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
    auto &dst = loaded_cf_descs[i].options;
    const auto &src = cf_descr[i].options;
    dst.compaction_filter = src.compaction_filter;
    dst.compaction_filter_factory = src.compaction_filter_factory;
    dst.comparator = src.comparator;
    dst.memtable_factory = src.memtable_factory;
    dst.merge_operator = src.merge_operator;
    dst.prefix_extractor = src.prefix_extractor;
    dst.table_factory = src.table_factory;
  }

  // The essence of the function: determine whether it is safe to open the
  // database with the current options.
  return CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
                                   loaded_cf_descs,
                                   rocksdb_ignore_unknown_options);
}
4896
rocksdb_partition_flags()4897 static uint rocksdb_partition_flags() { return (HA_CANNOT_PARTITION_FK); }
4898
4899 /* Clean up tables leftover from truncation */
rocksdb_truncation_table_cleanup(void)4900 void rocksdb_truncation_table_cleanup(void) {
4901 /* Scan for tables that have the truncation prefix */
4902 struct Rdb_truncate_tbls : public Rdb_tables_scanner {
4903 public:
4904 std::vector<Rdb_tbl_def *> m_tbl_list;
4905 int add_table(Rdb_tbl_def *tdef) override {
4906 assert(tdef != nullptr);
4907 if (tdef->base_tablename().find(TRUNCATE_TABLE_PREFIX) !=
4908 std::string::npos) {
4909 m_tbl_list.push_back(tdef);
4910 }
4911 return HA_EXIT_SUCCESS;
4912 }
4913 } collector;
4914 ddl_manager.scan_for_tables(&collector);
4915
4916 /*
4917 For now, delete any table found. It's possible to rename them back,
4918 but there's a risk the rename can potentially lead to other inconsistencies.
4919 Removing the old table (which is being truncated anyway) seems to be the
4920 safest solution.
4921 */
4922 ha_rocksdb table(rocksdb_hton, nullptr);
4923 for (Rdb_tbl_def *tbl_def : collector.m_tbl_list) {
4924 // NO_LINT_DEBUG
4925 sql_print_warning("MyRocks: Removing truncated leftover table %s",
4926 tbl_def->full_tablename().c_str());
4927 table.delete_table(tbl_def);
4928 }
4929 }
4930
4931 /*
4932 Storage Engine initialization function, invoked when plugin is loaded.
4933 */
4934
rocksdb_init_func(void * const p)4935 static int rocksdb_init_func(void *const p) {
4936 DBUG_ENTER_FUNC();
4937
4938 if (rdb_check_rocksdb_corruption()) {
4939 sql_print_error(
4940 "RocksDB: There was corruption detected in the RockDB data"
4941 "files. Check error log emitted earlier for more details.");
4942 if (rocksdb_allow_to_start_after_corruption) {
4943 sql_print_information(
4944 "RocksDB: Set rocksdb_allow_to_start_after_corruption=0 to prevent "
4945 "server from starting when RocksDB data corruption is detected.");
4946 } else {
4947 sql_print_error(
4948 "RocksDB: The server will exit normally and stop restart "
4949 "attempts. Remove %s file from data directory and "
4950 "start mysqld manually.",
4951 rdb_corruption_marker_file_name().c_str());
4952 exit(0);
4953 }
4954 }
4955
4956 // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
4957 static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");
4958
4959 // Lock the handlertons initialized status flag for writing
4960 Rdb_hton_init_state::Scoped_lock state_lock(*rdb_get_hton_init_state(), true);
4961 SHIP_ASSERT(!rdb_get_hton_init_state()->initialized());
4962
4963 init_rocksdb_psi_keys();
4964
4965 rocksdb_hton = (handlerton *)p;
4966
4967 rdb_open_tables.init();
4968 Ensure_cleanup rdb_open_tables_cleanup([]() { rdb_open_tables.free(); });
4969
4970 #ifdef HAVE_PSI_INTERFACE
4971 rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
4972 rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
4973 rdb_signal_drop_idx_psi_cond_key);
4974 rdb_is_thread.init(rdb_signal_is_psi_mutex_key, rdb_signal_is_psi_cond_key);
4975 rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key);
4976 #else
4977 rdb_bg_thread.init();
4978 rdb_drop_idx_thread.init();
4979 rdb_is_thread.init();
4980 rdb_mc_thread.init();
4981 #endif
4982 mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
4983 MY_MUTEX_INIT_FAST);
4984 mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
4985 MY_MUTEX_INIT_FAST);
4986
4987 #if defined(HAVE_PSI_INTERFACE)
4988 rdb_collation_exceptions = new Regex(key_rwlock_collation_exception_list);
4989 #else
4990 rdb_collation_exceptions = new Regex();
4991 #endif
4992
4993 mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
4994 MY_MUTEX_INIT_FAST);
4995 mysql_mutex_init(rdb_block_cache_resize_mutex_key,
4996 &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST);
4997 mysql_mutex_init(rdb_bottom_pri_background_compactions_resize_mutex_key,
4998 &rdb_bottom_pri_background_compactions_resize_mutex,
4999 MY_MUTEX_INIT_FAST);
5000 Rdb_transaction::init_mutex();
5001
5002 rocksdb_hton->state = SHOW_OPTION_YES;
5003 rocksdb_hton->create = rocksdb_create_handler;
5004 rocksdb_hton->close_connection = rocksdb_close_connection;
5005 rocksdb_hton->prepare = rocksdb_prepare;
5006 rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
5007 rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
5008 rocksdb_hton->recover = rocksdb_recover;
5009 rocksdb_hton->commit = rocksdb_commit;
5010 rocksdb_hton->rollback = rocksdb_rollback;
5011 rocksdb_hton->db_type = DB_TYPE_ROCKSDB;
5012 rocksdb_hton->show_status = rocksdb_show_status;
5013 rocksdb_hton->start_consistent_snapshot =
5014 rocksdb_start_tx_and_assign_read_view;
5015 rocksdb_hton->savepoint_set = rocksdb_savepoint;
5016 rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
5017 rocksdb_hton->savepoint_rollback_can_release_mdl =
5018 rocksdb_rollback_to_savepoint_can_release_mdl;
5019 rocksdb_hton->flush_logs = rocksdb_flush_wal;
5020
5021 rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
5022 HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;
5023
5024 if (rocksdb_enable_native_partition)
5025 rocksdb_hton->partition_flags = rocksdb_partition_flags;
5026
5027 assert(!mysqld_embedded);
5028
5029 if (rocksdb_db_options->max_open_files > (long)open_files_limit) {
5030 sql_print_information(
5031 "RocksDB: rocksdb_max_open_files should not be "
5032 "greater than the open_files_limit, effective value "
5033 "of rocksdb_max_open_files is being set to "
5034 "open_files_limit / 2.");
5035 rocksdb_db_options->max_open_files = open_files_limit / 2;
5036 } else if (rocksdb_db_options->max_open_files == -2) {
5037 rocksdb_db_options->max_open_files = open_files_limit / 2;
5038 }
5039
5040 rdb_read_free_regex_handler.compile(DEFAULT_READ_FREE_RPL_TABLES,
5041 get_regex_flags(), table_alias_charset);
5042
5043 rocksdb_stats = rocksdb::CreateDBStatistics();
5044 rocksdb_stats->set_stats_level(
5045 static_cast<rocksdb::StatsLevel>(rocksdb_stats_level));
5046 rocksdb_stats_level = rocksdb_stats->get_stats_level();
5047 rocksdb_db_options->statistics = rocksdb_stats;
5048
5049 if (rocksdb_rate_limiter_bytes_per_sec != 0) {
5050 rocksdb_rate_limiter.reset(
5051 rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
5052 rocksdb_db_options->rate_limiter = rocksdb_rate_limiter;
5053 }
5054
5055 rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate;
5056
5057 std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
5058 rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
5059 rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log);
5060 if (s.ok()) {
5061 myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log);
5062 }
5063
5064 rocksdb_db_options->info_log = myrocks_logger;
5065 myrocks_logger->SetInfoLogLevel(
5066 static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
5067 rocksdb_db_options->wal_dir = rocksdb_wal_dir;
5068
5069 rocksdb_db_options->wal_recovery_mode =
5070 static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);
5071
5072 rocksdb_db_options->track_and_verify_wals_in_manifest =
5073 rocksdb_track_and_verify_wals_in_manifest;
5074
5075 rocksdb_db_options->access_hint_on_compaction_start =
5076 static_cast<rocksdb::Options::AccessHint>(
5077 rocksdb_access_hint_on_compaction_start);
5078
5079 if (rocksdb_db_options->allow_mmap_reads &&
5080 rocksdb_db_options->use_direct_reads) {
5081 // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
5082 // mmap_reads and direct_reads are both on. (NO_LINT_DEBUG)
5083 sql_print_error(
5084 "RocksDB: Can't enable both use_direct_reads "
5085 "and allow_mmap_reads\n");
5086 DBUG_RETURN(HA_EXIT_FAILURE);
5087 }
5088
5089 // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT
5090 if (rocksdb_db_options->use_direct_reads ||
5091 rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5092 rocksdb::EnvOptions soptions;
5093 rocksdb::Status check_status;
5094 rocksdb::Env *const env = rocksdb_db_options->env;
5095
5096 std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir);
5097 if (env->FileExists(fname).ok()) {
5098 std::unique_ptr<rocksdb::SequentialFile> file;
5099 soptions.use_direct_reads = true;
5100 check_status = env->NewSequentialFile(fname, &file, soptions);
5101 } else {
5102 std::unique_ptr<rocksdb::WritableFile> file;
5103 soptions.use_direct_writes = true;
5104 check_status = env->ReopenWritableFile(fname, &file, soptions);
5105 if (file != nullptr) {
5106 file->Close();
5107 }
5108 env->DeleteFile(fname);
5109 }
5110
5111 if (!check_status.ok()) {
5112 sql_print_error(
5113 "RocksDB: Unable to use direct io in rocksdb-datadir:"
5114 "(%s)",
5115 check_status.getState());
5116 DBUG_RETURN(HA_EXIT_FAILURE);
5117 }
5118 }
5119
5120 if (rocksdb_db_options->allow_mmap_writes &&
5121 rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5122 // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
5123 sql_print_error(
5124 "RocksDB: Can't enable both "
5125 "use_direct_io_for_flush_and_compaction and "
5126 "allow_mmap_writes\n");
5127 DBUG_RETURN(HA_EXIT_FAILURE);
5128 }
5129
5130 if (rocksdb_db_options->allow_mmap_writes &&
5131 rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
5132 // NO_LINT_DEBUG
5133 sql_print_error(
5134 "RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 "
5135 "to use allow_mmap_writes");
5136 DBUG_RETURN(HA_EXIT_FAILURE);
5137 }
5138
5139 // sst_file_manager will move deleted rocksdb sst files to trash_dir
5140 // to be deleted in a background thread.
5141 std::string trash_dir = std::string(rocksdb_datadir) + "/trash";
5142 rocksdb_db_options->sst_file_manager.reset(NewSstFileManager(
5143 rocksdb_db_options->env, myrocks_logger, trash_dir,
5144 rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */));
5145
5146 std::vector<std::string> cf_names;
5147 rocksdb::Status status;
5148 status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir,
5149 &cf_names);
5150 if (!status.ok()) {
5151 /*
5152 When we start on an empty datadir, ListColumnFamilies returns IOError,
5153 and RocksDB doesn't provide any way to check what kind of error it was.
5154 Checking system errno happens to work right now.
5155 */
5156 if (status.IsIOError() && errno == ENOENT) {
5157 // NO_LINT_DEBUG
5158 sql_print_information("RocksDB: Got ENOENT when listing column families");
5159
5160 // NO_LINT_DEBUG
5161 sql_print_information(
5162 "RocksDB: assuming that we're creating a new database");
5163 } else {
5164 rdb_log_status_error(status, "Error listing column families");
5165 DBUG_RETURN(HA_EXIT_FAILURE);
5166 }
5167 } else {
5168 // NO_LINT_DEBUG
5169 sql_print_information("RocksDB: %ld column families found",
5170 cf_names.size());
5171 }
5172
5173 std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
5174 std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;
5175
5176 rocksdb_tbl_options->index_type =
5177 (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;
5178
5179 if (!rocksdb_tbl_options->no_block_cache) {
5180 std::shared_ptr<rocksdb::MemoryAllocator> memory_allocator;
5181 if (!rocksdb_cache_dump) {
5182 #ifdef HAVE_JEMALLOC
5183 size_t block_size = rocksdb_tbl_options->block_size;
5184 rocksdb::JemallocAllocatorOptions alloc_opt;
5185 // Limit jemalloc tcache memory usage. The range
5186 // [block_size/4, block_size] should be enough to cover most of
5187 // block cache allocation sizes.
5188 alloc_opt.limit_tcache_size = true;
5189 alloc_opt.tcache_size_lower_bound = block_size / 4;
5190 alloc_opt.tcache_size_upper_bound = block_size;
5191 rocksdb::Status new_alloc_status =
5192 rocksdb::NewJemallocNodumpAllocator(alloc_opt, &memory_allocator);
5193 if (!new_alloc_status.ok()) {
5194 // Fallback to use default malloc/free.
5195 rdb_log_status_error(new_alloc_status,
5196 "Error excluding block cache from core dump");
5197 memory_allocator = nullptr;
5198 DBUG_RETURN(HA_EXIT_FAILURE);
5199 }
5200 #else
5201 // NO_LINT_DEBUG
5202 sql_print_warning(
5203 "Ignoring rocksdb_cache_dump because jemalloc is missing.");
5204 #endif // HAVE_JEMALLOC
5205 }
5206 std::shared_ptr<rocksdb::Cache> block_cache = rocksdb::NewLRUCache(
5207 rocksdb_block_cache_size, -1 /*num_shard_bits*/,
5208 false /*strict_capcity_limit*/, rocksdb_cache_high_pri_pool_ratio,
5209 memory_allocator);
5210 if (rocksdb_sim_cache_size > 0) {
5211 // Simulated cache enabled
5212 // Wrap block cache inside a simulated cache and pass it to RocksDB
5213 rocksdb_tbl_options->block_cache =
5214 rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6);
5215 } else {
5216 // Pass block cache to RocksDB
5217 rocksdb_tbl_options->block_cache = block_cache;
5218 }
5219 }
5220
5221 if (rocksdb_collect_sst_properties) {
5222 properties_collector_factory =
5223 std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);
5224
5225 rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);
5226
5227 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
5228
5229 assert(rocksdb_table_stats_sampling_pct <=
5230 RDB_TBL_STATS_SAMPLE_PCT_MAX);
5231 properties_collector_factory->SetTableStatsSamplingPct(
5232 rocksdb_table_stats_sampling_pct);
5233
5234 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
5235 }
5236
5237 if (rocksdb_persistent_cache_size_mb > 0) {
5238 std::shared_ptr<rocksdb::PersistentCache> pcache;
5239 uint64_t cache_size_bytes = rocksdb_persistent_cache_size_mb * 1024 * 1024;
5240 status = rocksdb::NewPersistentCache(
5241 rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
5242 cache_size_bytes, myrocks_logger, true, &pcache);
5243 if (!status.ok()) {
5244 // NO_LINT_DEBUG
5245 sql_print_error("RocksDB: Persistent cache returned error: (%s)",
5246 status.getState());
5247 DBUG_RETURN(HA_EXIT_FAILURE);
5248 }
5249 rocksdb_tbl_options->persistent_cache = pcache;
5250 } else if (strlen(rocksdb_persistent_cache_path)) {
5251 // NO_LINT_DEBUG
5252 sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb");
5253 DBUG_RETURN(HA_EXIT_FAILURE);
5254 }
5255
5256 std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options());
5257 if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory,
5258 rocksdb_default_cf_options,
5259 rocksdb_override_cf_options)) {
5260 // NO_LINT_DEBUG
5261 sql_print_error("RocksDB: Failed to initialize CF options map.");
5262 DBUG_RETURN(HA_EXIT_FAILURE);
5263 }
5264
5265 /*
5266 If there are no column families, we're creating the new database.
5267 Create one column family named "default".
5268 */
5269 if (cf_names.size() == 0) cf_names.push_back(DEFAULT_CF_NAME);
5270
5271 std::vector<int> compaction_enabled_cf_indices;
5272
5273 // NO_LINT_DEBUG
5274 sql_print_information("RocksDB: Column Families at start:");
5275 for (size_t i = 0; i < cf_names.size(); ++i) {
5276 rocksdb::ColumnFamilyOptions opts;
5277 cf_options_map->get_cf_options(cf_names[i], &opts);
5278
5279 // NO_LINT_DEBUG
5280 sql_print_information(" cf=%s", cf_names[i].c_str());
5281
5282 // NO_LINT_DEBUG
5283 sql_print_information(" write_buffer_size=%ld", opts.write_buffer_size);
5284
5285 // NO_LINT_DEBUG
5286 sql_print_information(" target_file_size_base=%" PRIu64,
5287 opts.target_file_size_base);
5288
5289 /*
5290 Temporarily disable compactions to prevent a race condition where
5291 compaction starts before compaction filter is ready.
5292 */
5293 if (!opts.disable_auto_compactions) {
5294 compaction_enabled_cf_indices.push_back(i);
5295 opts.disable_auto_compactions = true;
5296 }
5297 cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
5298 }
5299
5300 rocksdb::Options main_opts(*rocksdb_db_options,
5301 cf_options_map->get_defaults());
5302
5303 rocksdb::TransactionDBOptions tx_db_options;
5304 tx_db_options.transaction_lock_timeout = 2000; // 2 seconds
5305 tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
5306 tx_db_options.write_policy =
5307 static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
5308
5309 status =
5310 check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
5311
5312 // We won't start if we'll determine that there's a chance of data corruption
5313 // because of incompatible options.
5314 if (!status.ok()) {
5315 rdb_log_status_error(
5316 status, "Compatibility check against existing database options failed");
5317 DBUG_RETURN(HA_EXIT_FAILURE);
5318 }
5319
5320 // NO_LINT_DEBUG
5321 sql_print_information("RocksDB: Opening TransactionDB...");
5322
5323 status = rocksdb::TransactionDB::Open(
5324 main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);
5325
5326 if (!status.ok()) {
5327 rdb_log_status_error(status, "Error opening instance");
5328 DBUG_RETURN(HA_EXIT_FAILURE);
5329 }
5330 cf_manager.init(std::move(cf_options_map), &cf_handles);
5331
5332 // NO_LINT_DEBUG
5333 sql_print_information("RocksDB: Initializing data dictionary...");
5334
5335 if (st_rdb_exec_time.exec("Rdb_dict_manager::init", [&]() {
5336 return dict_manager.init(rdb, &cf_manager,
5337 rocksdb_enable_remove_orphaned_dropped_cfs);
5338 })) {
5339 // NO_LINT_DEBUG
5340 sql_print_error("RocksDB: Failed to initialize data dictionary.");
5341 DBUG_RETURN(HA_EXIT_FAILURE);
5342 }
5343
5344 sql_print_information("RocksDB: Initializing DDL Manager...");
5345
5346 if (st_rdb_exec_time.exec("Rdb_ddl_manager::init", [&]() {
5347 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
5348 return ddl_manager.init(&dict_manager, &cf_manager,
5349 rocksdb_validate_tables);
5350 #else
5351 return ddl_manager.init(&dict_manager, &cf_manager);
5352 #endif // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
5353 })) {
5354 // NO_LINT_DEBUG
5355 sql_print_error("RocksDB: Failed to initialize DDL manager.");
5356 DBUG_RETURN(HA_EXIT_FAILURE);
5357 }
5358
5359 for (const auto &cf_handle : cf_manager.get_all_cf()) {
5360 uint flags;
5361 if (!dict_manager.get_cf_flags(cf_handle->GetID(), &flags)) {
5362 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
5363 rocksdb::WriteBatch *const batch = wb.get();
5364 dict_manager.add_cf_flags(batch, cf_handle->GetID(), 0);
5365 dict_manager.commit(batch);
5366 }
5367 }
5368
5369 Rdb_sst_info::init(rdb);
5370
5371 /*
5372 Enable auto compaction, things needed for compaction filter are finished
5373 initializing
5374 */
5375 std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
5376 compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
5377 for (const auto &index : compaction_enabled_cf_indices) {
5378 compaction_enabled_cf_handles.push_back(cf_handles[index]);
5379 }
5380
5381 status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);
5382
5383 if (!status.ok()) {
5384 rdb_log_status_error(status, "Error enabling compaction");
5385 DBUG_RETURN(HA_EXIT_FAILURE);
5386 }
5387
5388 auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME
5389 #ifdef HAVE_PSI_INTERFACE
5390 ,
5391 rdb_background_psi_thread_key
5392 #endif
5393 );
5394 if (err != 0) {
5395 // NO_LINT_DEBUG
5396 sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
5397 err);
5398 DBUG_RETURN(HA_EXIT_FAILURE);
5399 }
5400
5401 err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME
5402 #ifdef HAVE_PSI_INTERFACE
5403 ,
5404 rdb_drop_idx_psi_thread_key
5405 #endif
5406 );
5407 if (err != 0) {
5408 sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
5409 err);
5410 DBUG_RETURN(HA_EXIT_FAILURE);
5411 }
5412
5413 #ifndef HAVE_PSI_INTERFACE
5414 err = rdb_is_thread.create_thread(INDEX_STATS_THREAD_NAME);
5415 #else
5416 err = rdb_is_thread.create_thread(INDEX_STATS_THREAD_NAME,
5417 rdb_is_psi_thread_key);
5418 #endif
5419 if (err != 0) {
5420 // NO_LINT_DEBUG
5421 sql_print_error(
5422 "RocksDB: Couldn't start the index stats calculation thread: "
5423 "(errno=%d)",
5424 err);
5425 DBUG_RETURN(HA_EXIT_FAILURE);
5426 }
5427
5428 err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME
5429 #ifdef HAVE_PSI_INTERFACE
5430 ,
5431 rdb_mc_psi_thread_key
5432 #endif
5433 );
5434 if (err != 0) {
5435 // NO_LINT_DEBUG
5436 sql_print_error(
5437 "RocksDB: Couldn't start the manual compaction thread: (errno=%d)",
5438 err);
5439 DBUG_RETURN(HA_EXIT_FAILURE);
5440 }
5441
5442 rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);
5443
5444 if (rocksdb_pause_background_work) {
5445 rdb->PauseBackgroundWork();
5446 }
5447
5448 err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST,
5449 HA_ERR_ROCKSDB_LAST);
5450 if (err != 0) {
5451 // NO_LINT_DEBUG
5452 sql_print_error("RocksDB: Couldn't initialize error messages");
5453 DBUG_RETURN(HA_EXIT_FAILURE);
5454 }
5455
5456 // Creating an instance of HistogramImpl should only happen after RocksDB
5457 // has been successfully initialized.
5458 commit_latency_stats = new rocksdb::HistogramImpl();
5459
5460 // succeeded, set the init status flag
5461 rdb_get_hton_init_state()->set_initialized(true);
5462
5463 // Remove tables that may have been leftover during truncation
5464 rocksdb_truncation_table_cleanup();
5465
5466 // NO_LINT_DEBUG
5467 sql_print_information(
5468 "MyRocks storage engine plugin has been successfully "
5469 "initialized.");
5470
5471 st_rdb_exec_time.report();
5472
5473 // Skip cleaning up rdb_open_tables as we've succeeded
5474 rdb_open_tables_cleanup.skip();
5475
5476 rocksdb_set_max_bottom_pri_background_compactions_internal(
5477 rocksdb_max_bottom_pri_background_compactions);
5478
5479 DBUG_RETURN(HA_EXIT_SUCCESS);
5480 }
5481
5482 /*
5483 Storage Engine deinitialization function, invoked when plugin is unloaded.
5484 */
5485
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  // Return code reported to the plugin framework; non-zero indicates that
  // some tables were still open at unload time.
  int error = 0;

  // If we finalize the storage engine plugin, it is no longer initialized.
  // Grab a writer lock for the duration of the call, so we can clear the flag
  // and destroy the handlerton and global state in isolation.
  Rdb_hton_init_state::Scoped_lock state_lock(*rdb_get_hton_init_state(), true);
  SHIP_ASSERT(rdb_get_hton_init_state()->initialized());

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // signal the index stats calculation thread to stop
  rdb_is_thread.signal(true);

  // signal the manual compaction thread to stop
  rdb_mc_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
  }

  // Wait for the index stats calculation thread to finish.
  err = rdb_is_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't stop the index stats calculation thread: (errno=%d)",
        err);
  }

  // Wait for the manual compaction thread to finish.
  err = rdb_mc_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err);
  }

  if (rdb_open_tables.count()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind.
    error = 1;
  }

  // Destroy global synchronization primitives and per-module state in
  // roughly the reverse order of their creation in rocksdb_init_func().
  rdb_open_tables.free();
  mysql_mutex_destroy(&rdb_sysvars_mutex);
  mysql_mutex_destroy(&rdb_block_cache_resize_mutex);
  mysql_mutex_destroy(&rdb_bottom_pri_background_compactions_resize_mutex);

  delete rdb_collation_exceptions;
  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  // Release the lazily-built per-collation lookup data.
  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  ddl_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  // Destroying the TransactionDB instance closes the database.
  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

  // Disown the cache data since we're shutting down.
  // This results in memory leaks but it improved the shutdown time.
  // Don't disown when running under valgrind or ASAN
#if !defined(HAVE_VALGRIND) && !defined(HAVE_ASAN)
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif  // !defined(HAVE_VALGRIND) && !defined(HAVE_ASAN)

  rocksdb_db_options = nullptr;
  rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  // clear the initialized flag and unlock
  rdb_get_hton_init_state()->set_initialized(false);

  DBUG_RETURN(error);
}
5605
rocksdb_smart_seek(bool seek_backward,rocksdb::Iterator * const iter,const rocksdb::Slice & key_slice)5606 static inline void rocksdb_smart_seek(bool seek_backward,
5607 rocksdb::Iterator *const iter,
5608 const rocksdb::Slice &key_slice) {
5609 if (seek_backward) {
5610 iter->SeekForPrev(key_slice);
5611 } else {
5612 iter->Seek(key_slice);
5613 }
5614 }
5615
rocksdb_smart_next(bool seek_backward,rocksdb::Iterator * const iter)5616 static inline void rocksdb_smart_next(bool seek_backward,
5617 rocksdb::Iterator *const iter) {
5618 if (seek_backward) {
5619 iter->Prev();
5620 } else {
5621 iter->Next();
5622 }
5623 }
5624
5625 // If the iterator is not valid it might be because of EOF but might be due
5626 // to IOError or corruption. The good practice is always check it.
5627 // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
is_valid(rocksdb::Iterator * scan_it)5628 static inline bool is_valid(rocksdb::Iterator *scan_it) {
5629 if (scan_it->Valid()) {
5630 return true;
5631 } else {
5632 rocksdb::Status s = scan_it->status();
5633 DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
5634 dbug_change_status_to_corrupted(&s););
5635 if (s.IsIOError() || s.IsCorruption()) {
5636 if (s.IsCorruption()) {
5637 rdb_persist_corruption_marker();
5638 }
5639 rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
5640 }
5641 return false;
5642 }
5643 }
5644
5645 /**
5646 @brief
5647 Example of simple lock controls. The "table_handler" it creates is a
5648 structure we will pass to each ha_rocksdb handler. Do you have to have
5649 one of these? Well, you have pieces that are used for locking, and
5650 they are needed to function.
5651 */
5652
Rdb_table_handler *Rdb_open_tables_map::get_table_handler(
    const char *const table_name) {
  assert(table_name != nullptr);

  Rdb_table_handler *table_handler;

  const std::string table_name_str(table_name);

  // First, look up the table in the hash map.
  RDB_MUTEX_LOCK_CHECK(m_mutex);
  const auto &it = m_table_map.find(table_name_str);
  if (it != m_table_map.end()) {
    // Found it
    table_handler = it->second;
  } else {
    char *tmp_name;

    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    // my_multi_malloc carves the Rdb_table_handler struct and the table-name
    // buffer (length + 1 for the NUL terminator) out of a single zero-filled
    // allocation; NullS terminates the argument list.
#ifdef HAVE_PSI_INTERFACE
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(
              my_multi_malloc(rdb_handler_memory_key, MYF(MY_WME | MY_ZEROFILL),
                              &table_handler, sizeof(*table_handler), &tmp_name,
                              table_name_str.length() + 1, NullS)))) {
#else
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(
              my_multi_malloc(PSI_NOT_INSTRUMENTED, MYF(MY_WME | MY_ZEROFILL),
                              &table_handler, sizeof(*table_handler), &tmp_name,
                              table_name_str.length() + 1, NullS)))) {
#endif
      // Allocating a new Rdb_table_handler and a new table name failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      return nullptr;
    }

    // Populate the freshly allocated handler and register it in the map.
    table_handler->m_ref_count = 0;
    table_handler->m_table_name_length = table_name_str.length();
    table_handler->m_table_name = tmp_name;
    my_stpmov(table_handler->m_table_name, table_name_str.c_str());

    m_table_map.emplace(table_name_str, table_handler);

    thr_lock_init(&table_handler->m_thr_lock);
    table_handler->m_io_perf_read.init();
  }
  // Bump the reference count while still holding the map mutex.
  assert(table_handler->m_ref_count >= 0);
  table_handler->m_ref_count++;

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);

  return table_handler;
}
5705
5706 std::vector<std::string> rdb_get_open_table_names(void) {
5707 return rdb_open_tables.get_table_names();
5708 }
5709
5710 std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
5711 const Rdb_table_handler *table_handler;
5712 std::vector<std::string> names;
5713
5714 RDB_MUTEX_LOCK_CHECK(m_mutex);
5715 for (const auto &kv : m_table_map) {
5716 table_handler = kv.second;
5717 assert(table_handler != nullptr);
5718 names.push_back(table_handler->m_table_name);
5719 }
5720 RDB_MUTEX_UNLOCK_CHECK(m_mutex);
5721
5722 return names;
5723 }
5724
5725 /*
5726 Inspired by innobase_get_int_col_max_value from InnoDB. This returns the
5727 maximum value a type can take on.
5728 */
5729 static ulonglong rdb_get_int_col_max_value(const Field *field) {
5730 ulonglong max_value = 0;
5731 switch (field->key_type()) {
5732 case HA_KEYTYPE_BINARY:
5733 max_value = 0xFFULL;
5734 break;
5735 case HA_KEYTYPE_INT8:
5736 max_value = 0x7FULL;
5737 break;
5738 case HA_KEYTYPE_USHORT_INT:
5739 max_value = 0xFFFFULL;
5740 break;
5741 case HA_KEYTYPE_SHORT_INT:
5742 max_value = 0x7FFFULL;
5743 break;
5744 case HA_KEYTYPE_UINT24:
5745 max_value = 0xFFFFFFULL;
5746 break;
5747 case HA_KEYTYPE_INT24:
5748 max_value = 0x7FFFFFULL;
5749 break;
5750 case HA_KEYTYPE_ULONG_INT:
5751 max_value = 0xFFFFFFFFULL;
5752 break;
5753 case HA_KEYTYPE_LONG_INT:
5754 max_value = 0x7FFFFFFFULL;
5755 break;
5756 case HA_KEYTYPE_ULONGLONG:
5757 max_value = 0xFFFFFFFFFFFFFFFFULL;
5758 break;
5759 case HA_KEYTYPE_LONGLONG:
5760 max_value = 0x7FFFFFFFFFFFFFFFULL;
5761 break;
5762 case HA_KEYTYPE_FLOAT:
5763 max_value = 0x1000000ULL;
5764 break;
5765 case HA_KEYTYPE_DOUBLE:
5766 max_value = 0x20000000000000ULL;
5767 break;
5768 default:
5769 abort();
5770 }
5771
5772 return max_value;
5773 }
5774
/*
  Initialize m_tbl_def's auto_increment counter: prefer the value persisted
  in the data dictionary, fall back to scanning the index for the current
  maximum, and default to 1 if neither source yields a value.
*/
void ha_rocksdb::load_auto_incr_value() {
  ulonglong auto_incr = 0;
  bool validate_last = false, use_datadic = true;
#if !defined(NDEBUG)
  // Debug builds always cross-check against the index; the debug sync point
  // can also simulate an upgrade from a server without persisted autoinc.
  DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;);
  validate_last = true;
#endif  // !defined(NDEBUG)

  if (use_datadic && dict_manager.get_auto_incr_val(
                         m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
    update_auto_incr_val(auto_incr);
  }

  // If we find nothing in the data dictionary, or if we are in debug mode,
  // then call index_last to get the last value.
  //
  // This is needed when upgrading from a server that did not support
  // persistent auto_increment, of if the table is empty.
  //
  // For debug mode, we are just verifying that the data dictionary value is
  // greater than or equal to the maximum value in the table.
  if (auto_incr == 0 || validate_last) {
    auto_incr = load_auto_incr_value_from_index();
    update_auto_incr_val(auto_incr);
  }

  // If we failed to find anything from the data dictionary and index, then
  // initialize auto_increment to 1.
  if (m_tbl_def->m_auto_incr_val == 0) {
    update_auto_incr_val(1);
  }
}
5807
/*
  Read the last (largest) value of the auto-increment column via an
  index-only scan of the auto-increment index, and return that value plus
  one (capped at the column type's maximum). Returns 0 for an empty index.

  Saves and restores active_index / table->status and the keyread flag
  around the lookup, since this runs outside the normal
  index_init()/index_end() protocol.
*/
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_converter->set_is_key_requested(true);

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    // Temporarily mark all columns readable so val_int() is allowed.
    my_bitmap_map *const old_map =
        dbug_tmp_use_all_columns(table, table->read_set);
    last_val = field->val_int();
    // Next usable value is one past the stored maximum, unless that would
    // overflow the column type.
    if (last_val != max_val) {
      last_val++;
    }
#ifndef NDEBUG
    // Debug cross-check: the persisted dictionary value must never lag
    // behind what is actually stored in the table (unless this transaction
    // has pending auto-incr updates of its own).
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        assert(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(table->read_set, old_map);
  }

  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();

  return last_val;
}
5867
5868 void ha_rocksdb::update_auto_incr_val(ulonglong val) {
5869 ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
5870 while (
5871 auto_incr_val < val &&
5872 !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
5873 // Do nothing - just loop until auto_incr_val is >= val or we successfully
5874 // set it
5875 }
5876 }
5877
5878 void ha_rocksdb::update_auto_incr_val_from_field() {
5879 Field *field;
5880 ulonglong new_val, max_val;
5881 field = table->key_info[table->s->next_number_index].key_part[0].field;
5882 max_val = rdb_get_int_col_max_value(field);
5883
5884 my_bitmap_map *const old_map =
5885 dbug_tmp_use_all_columns(table, table->read_set);
5886 new_val = field->val_int();
5887 // don't increment if we would wrap around
5888 if (new_val != max_val) {
5889 new_val++;
5890 }
5891
5892 dbug_tmp_restore_column_map(table->read_set, old_map);
5893
5894 // Only update if positive value was set for auto_incr column.
5895 if (new_val <= max_val) {
5896 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
5897 tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val);
5898
5899 // Update the in memory auto_incr value in m_tbl_def.
5900 update_auto_incr_val(new_val);
5901 }
5902 }
5903
5904 int ha_rocksdb::load_hidden_pk_value() {
5905 const int save_active_index = active_index;
5906 active_index = m_tbl_def->m_key_count - 1;
5907 const uint8 save_table_status = table->status;
5908
5909 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
5910 const bool is_new_snapshot = !tx->has_snapshot();
5911
5912 longlong hidden_pk_id = 1;
5913 // Do a lookup.
5914 if (!index_last(table->record[0])) {
5915 /*
5916 Decode PK field from the key
5917 */
5918 auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
5919 if (err) {
5920 if (is_new_snapshot) {
5921 tx->release_snapshot();
5922 }
5923 return err;
5924 }
5925
5926 hidden_pk_id++;
5927 }
5928
5929 longlong old = m_tbl_def->m_hidden_pk_val;
5930 while (old < hidden_pk_id &&
5931 !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
5932 }
5933
5934 if (is_new_snapshot) {
5935 tx->release_snapshot();
5936 }
5937
5938 table->status = save_table_status;
5939 active_index = save_active_index;
5940
5941 release_scan_iterator();
5942
5943 return HA_EXIT_SUCCESS;
5944 }
5945
5946 /* Get PK value from m_tbl_def->m_hidden_pk_info. */
5947 longlong ha_rocksdb::update_hidden_pk_val() {
5948 assert(has_hidden_pk(table));
5949 const longlong new_val = m_tbl_def->m_hidden_pk_val++;
5950 return new_val;
5951 }
5952
5953 /* Get the id of the hidden pk id from m_last_rowkey */
5954 int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
5955 assert(hidden_pk_id != nullptr);
5956 assert(table != nullptr);
5957 assert(has_hidden_pk(table));
5958
5959 rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
5960
5961 // Get hidden primary key from old key slice
5962 Rdb_string_reader reader(&rowkey_slice);
5963 if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) {
5964 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5965 }
5966
5967 const int length = Field_longlong::PACK_LENGTH;
5968 const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
5969 if (from == nullptr) {
5970 /* Mem-comparable image doesn't have enough bytes */
5971 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5972 }
5973
5974 *hidden_pk_id = rdb_netbuf_read_uint64(&from);
5975 return HA_EXIT_SUCCESS;
5976 }
5977
5978 /**
5979 @brief
5980 Free lock controls. We call this whenever we close a table. If the table had
5981 the last reference to the table_handler, then we free the memory associated
5982 with it.
5983 */
5984
5985 void Rdb_open_tables_map::release_table_handler(
5986 Rdb_table_handler *const table_handler) {
5987 RDB_MUTEX_LOCK_CHECK(m_mutex);
5988
5989 assert(table_handler != nullptr);
5990 assert(table_handler->m_ref_count > 0);
5991 if (!--table_handler->m_ref_count) {
5992 const auto ret MY_ATTRIBUTE((__unused__)) =
5993 m_table_map.erase(std::string(table_handler->m_table_name));
5994 assert(ret == 1); // the hash entry must actually be found and deleted
5995 my_core::thr_lock_delete(&table_handler->m_thr_lock);
5996 my_free(table_handler);
5997 }
5998
5999 RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6000 }
6001
6002 static handler *rocksdb_create_handler(my_core::handlerton *const hton,
6003 my_core::TABLE_SHARE *const table_arg,
6004 my_core::MEM_ROOT *const mem_root) {
6005 if (rocksdb_enable_native_partition && table_arg &&
6006 table_arg->db_type() == rocksdb_hton && table_arg->partition_info_str &&
6007 table_arg->partition_info_str_len) {
6008 ha_rockspart *file = new (mem_root) ha_rockspart(hton, table_arg);
6009 if (file && file->init_partitioning(mem_root)) {
6010 delete file;
6011 return nullptr;
6012 }
6013 return (file);
6014 }
6015
6016 return new (mem_root) ha_rocksdb(hton, table_arg);
6017 }
6018
/*
  Construct a handler instance. All buffers and descriptors stay null until
  ::open() allocates them; the member-initializer order must follow the
  declaration order in ha_rocksdb.h.
*/
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg),
      m_table_handler(nullptr),
      m_scan_it(nullptr),
      m_scan_it_skips_bloom(false),
      m_scan_it_snapshot(nullptr),
      m_scan_it_lower_bound(nullptr),
      m_scan_it_upper_bound(nullptr),
      m_tbl_def(nullptr),
      m_pk_descr(nullptr),
      m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(true),
      m_pk_tuple(nullptr),
      m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr),
      m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr),
      m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr),
      m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr),
      m_pack_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE),
      m_keyread_only(false),
      m_insert_with_update(false),
      m_dup_key_found(false),
#if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
      m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false),
#endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
      m_need_build_decoder(false) {
}
6052
6053 ha_rocksdb::~ha_rocksdb() {
6054 int err MY_ATTRIBUTE((__unused__));
6055 err = finalize_bulk_load(false);
6056 if (err != 0) {
6057 // NO_LINT_DEBUG
6058 sql_print_error(
6059 "RocksDB: Error %d finalizing bulk load while closing "
6060 "handler.",
6061 err);
6062 }
6063 }
6064
// MyRocks keeps all data inside RocksDB itself, so tables have no
// per-table files and the extension list is empty (NullS-terminated).
static const char *ha_rocksdb_exts[] = {NullS};

// Return the list of file-name extensions this engine uses (none).
const char **ha_rocksdb::bas_ext() const {
  DBUG_ENTER_FUNC();

  DBUG_RETURN(ha_rocksdb_exts);
}
6072
// Return the table name portion (without the database prefix) of the
// currently opened table. Only valid after ::open() has set m_tbl_def.
const std::string &ha_rocksdb::get_table_basename() const {
  return m_tbl_def->base_tablename();
}
6076
6077 /**
6078 @return
6079 false OK
6080 other Error inpacking the data
6081 */
6082 bool ha_rocksdb::init_with_fields() {
6083 DBUG_ENTER_FUNC();
6084
6085 const uint pk = table_share->primary_key;
6086 if (pk != MAX_KEY) {
6087 const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
6088 check_keyread_allowed(m_pk_can_be_decoded, table_share, pk /*PK*/,
6089 key_parts - 1, true);
6090 } else {
6091 m_pk_can_be_decoded = false;
6092 }
6093 cached_table_flags = table_flags();
6094
6095 DBUG_RETURN(false); /* Ok */
6096 }
6097
6098 bool ha_rocksdb::rpl_can_handle_stm_event() const {
6099 return !(rpl_skip_tx_api_var && !super_read_only);
6100 }
6101
6102 /*
6103 If the key is a TTL key, we may need to filter it out.
6104
6105 The purpose of read filtering for tables with TTL is to ensure that
6106 during a transaction a key which has expired already but not removed by
6107 compaction yet is not returned to the user.
6108
6109 Without this the user might be hit with problems such as disappearing
6110 rows within a transaction, etc, because the compaction filter ignores
6111 snapshots when filtering keys.
6112 */
6113 bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
6114 const rocksdb::Slice &ttl_rec_val,
6115 const int64_t curr_ts) {
6116 assert(kd.has_ttl());
6117 assert(kd.m_ttl_rec_offset != UINT_MAX);
6118
6119 /*
6120 Curr_ts can only be 0 if there are no snapshots open.
6121 should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
6122 we are filtering on the write path (single INSERT/UPDATE) in which case
6123 we are passed in the current time as curr_ts.
6124
6125 In the event curr_ts is 0, we always decide not to filter the record. We
6126 also log a warning and increment a diagnostic counter.
6127 */
6128 if (curr_ts == 0) {
6129 update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
6130 return false;
6131 }
6132
6133 if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
6134 return false;
6135 }
6136
6137 Rdb_string_reader reader(&ttl_rec_val);
6138
6139 /*
6140 Find where the 8-byte ttl is for each record in this index.
6141 */
6142 uint64 ts;
6143 if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
6144 /*
6145 This condition should never be reached since all TTL records have an
6146 8 byte ttl field in front. Don't filter the record out, and log an error.
6147 */
6148 std::string buf;
6149 buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
6150 RDB_MAX_HEXDUMP_LEN);
6151 const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
6152 // NO_LINT_DEBUG
6153 sql_print_error(
6154 "Decoding ttl from PK value failed, "
6155 "for index (%u,%u), val: %s",
6156 gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
6157 assert(0);
6158 return false;
6159 }
6160
6161 /* Hide record if it has expired before the current snapshot time. */
6162 uint64 read_filter_ts = 0;
6163 #if !defined(NDEBUG)
6164 read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
6165 #endif // !defined(NDEBUG)
6166 bool is_hide_ttl =
6167 ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
6168 if (is_hide_ttl) {
6169 update_row_stats(ROWS_FILTERED);
6170
6171 /* increment examined row count when rows are skipped */
6172 THD *thd = ha_thd();
6173 thd->inc_examined_row_count(1);
6174 DEBUG_SYNC(thd, "rocksdb.ttl_rows_examined");
6175 }
6176 return is_hide_ttl;
6177 }
6178
/*
  Advance `iter` past any TTL-expired records, in the scan direction given by
  seek_backward. No-op for indexes without TTL.

  @return HA_EXIT_SUCCESS, or HA_ERR_QUERY_INTERRUPTED if the statement was
          killed while skipping.
*/
int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
                                             rocksdb::Iterator *const iter,
                                             bool seek_backward) {
  if (kd.has_ttl()) {
    THD *thd = ha_thd();
    while (iter->Valid() &&
           should_hide_ttl_rec(
               kd, iter->value(),
               get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
      // NOTE(review): thd is null-checked below but passed to DEBUG_SYNC
      // first; presumably ha_thd() cannot return nullptr on this path, or
      // DEBUG_SYNC tolerates it — confirm.
      DEBUG_SYNC(thd, "rocksdb.check_flags_ser");
      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }
      rocksdb_smart_next(seek_backward, iter);
    }
  }
  return HA_EXIT_SUCCESS;
}
6197
#ifndef NDEBUG
// Debug-only helpers that deliberately corrupt an on-disk record image so
// tests can exercise the corruption-detection paths in the decoder.

// Append three garbage bytes to the record.
void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
  std::string str(on_disk_rec->data(), on_disk_rec->size());
  on_disk_rec->Reset();
  str.append("abc");
  on_disk_rec->PinSelf(rocksdb::Slice(str));
}

// Truncate the record to zero length.
void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) {
  on_disk_rec->remove_suffix(on_disk_rec->size());
}

// Replace the record with a NULL byte followed by a length-12 VARCHAR
// payload, to simulate a value longer than the declared VARCHAR(10).
void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
  std::string res;
  // The record is NULL-byte followed by VARCHAR(10).
  // Put the NULL-byte
  res.append("\0", 1);
  // Then, add a valid VARCHAR(12) value.
  // NOTE(review): "123456789ab" is 11 characters; appending 12 bytes also
  // includes the literal's terminating NUL — presumably intentional, since
  // only the declared length (0xC) matters for the decoder. Confirm.
  res.append("\xC", 1);
  res.append("123456789ab", 12);

  on_disk_rec->Reset();
  on_disk_rec->PinSelf(rocksdb::Slice(res));
}

// Raise the error injected by inplace-ALTER failure tests.
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred.", MYF(0));
}
#endif  // !defined(NDEBUG)
6228
/*
  Convenience overload: decode this->m_retrieved_record into buf.
  Debug builds may first corrupt the record image via the dbug_* helpers to
  test the corruption-handling paths.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
6240
6241 /*
6242 @brief
6243 Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
6244 storage format into buf (which can be table->record[0] or table->record[1]).
6245
6246 @param key Table record's key in mem-comparable form.
6247 @param buf Store record in table->record[0] format here
6248
6249 @detail
6250 If the table has blobs, the unpacked data in buf may keep pointers to the
6251 data in this->m_retrieved_record.
6252
6253 The key is only needed to check its checksum value (the checksum is in
6254 m_retrieved_record).
6255
6256 @seealso
6257 rdb_converter::setup_read_decoders() Sets up data structures which tell
6258 which columns to decode.
6259
6260 @return
6261 0 OK
6262 other Error inpacking the data
6263 */
6264
6265 int ha_rocksdb::convert_record_from_storage_format(
6266 const rocksdb::Slice *const key, const rocksdb::Slice *const value,
6267 uchar *const buf) {
6268 assert(key != nullptr);
6269 assert(buf != nullptr);
6270
6271 return m_converter->decode(m_pk_descr, buf, key, value);
6272 }
6273
6274 int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
6275 const Rdb_tbl_def *const tbl_def_arg,
6276 bool alloc_alter_buffers) {
6277 DBUG_ENTER_FUNC();
6278
6279 assert(m_pk_tuple == nullptr);
6280 assert(tbl_def_arg != nullptr);
6281
6282 std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;
6283
6284 uint key_len = 0;
6285 uint max_packed_sk_len = 0;
6286 uint pack_key_len = 0;
6287
6288 m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
6289 if (has_hidden_pk(table_arg)) {
6290 m_pk_key_parts = 1;
6291 } else {
6292 m_pk_key_parts =
6293 table->key_info[table->s->primary_key].user_defined_key_parts;
6294 key_len = table->key_info[table->s->primary_key].key_length;
6295 }
6296
6297 // move this into get_table_handler() ??
6298 m_pk_descr->setup(table_arg, tbl_def_arg);
6299
6300 #ifdef HAVE_PSI_INTERFACE
6301 m_pk_tuple =
6302 static_cast<uchar *>(my_malloc(rdb_handler_memory_key, key_len, MYF(0)));
6303 #else
6304 m_pk_tuple =
6305 static_cast<uchar *>(my_malloc(PSI_NOT_INSTRUMENTED, key_len, MYF(0)));
6306 #endif
6307 if (m_pk_tuple == nullptr) {
6308 goto error;
6309 }
6310
6311 pack_key_len = m_pk_descr->max_storage_fmt_length();
6312 #ifdef HAVE_PSI_INTERFACE
6313 m_pk_packed_tuple = static_cast<uchar *>(
6314 my_malloc(rdb_handler_memory_key, pack_key_len, MYF(0)));
6315 #else
6316 m_pk_packed_tuple = static_cast<uchar *>(
6317 my_malloc(PSI_NOT_INSTRUMENTED, pack_key_len, MYF(0)));
6318 #endif
6319 if (m_pk_packed_tuple == nullptr) {
6320 goto error;
6321 }
6322
6323 /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
6324 max_packed_sk_len = pack_key_len;
6325 for (uint i = 0; i < table_arg->s->keys; i++) {
6326 /* Primary key was processed above */
6327 if (i == table_arg->s->primary_key) continue;
6328
6329 // TODO: move this into get_table_handler() ??
6330 kd_arr[i]->setup(table_arg, tbl_def_arg);
6331
6332 const uint packed_len = kd_arr[i]->max_storage_fmt_length();
6333 if (packed_len > max_packed_sk_len) {
6334 max_packed_sk_len = packed_len;
6335 }
6336 }
6337
6338 #ifdef HAVE_PSI_INTERFACE
6339 if (!(m_sk_packed_tuple = static_cast<uchar *>(
6340 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6341 !(m_sk_match_prefix_buf = static_cast<uchar *>(
6342 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6343 !(m_sk_packed_tuple_old = static_cast<uchar *>(
6344 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6345 !(m_end_key_packed_tuple = static_cast<uchar *>(
6346 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6347 !(m_pack_buffer = static_cast<uchar *>(
6348 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6349 !(m_scan_it_lower_bound = static_cast<uchar *>(
6350 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6351 !(m_scan_it_upper_bound = static_cast<uchar *>(
6352 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0))))) {
6353 #else
6354 if (!(m_sk_packed_tuple = static_cast<uchar *>(
6355 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6356 !(m_sk_match_prefix_buf = static_cast<uchar *>(
6357 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6358 !(m_sk_packed_tuple_old = static_cast<uchar *>(
6359 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6360 !(m_end_key_packed_tuple = static_cast<uchar *>(
6361 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6362 !(m_pack_buffer = static_cast<uchar *>(
6363 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6364 !(m_scan_it_lower_bound = static_cast<uchar *>(
6365 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6366 !(m_scan_it_upper_bound = static_cast<uchar *>(
6367 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0))))) {
6368 #endif
6369 goto error;
6370 }
6371
6372 /*
6373 If inplace alter is happening, allocate special buffers for unique
6374 secondary index duplicate checking.
6375 */
6376 #ifdef HAVE_PSI_INTERFACE
6377 if (alloc_alter_buffers &&
6378 (!(m_dup_sk_packed_tuple = static_cast<uchar *>(
6379 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))) ||
6380 !(m_dup_sk_packed_tuple_old = static_cast<uchar *>(
6381 my_malloc(rdb_handler_memory_key, max_packed_sk_len, MYF(0)))))) {
6382 #else
6383 if (alloc_alter_buffers &&
6384 (!(m_dup_sk_packed_tuple = static_cast<uchar *>(
6385 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))) ||
6386 !(m_dup_sk_packed_tuple_old = static_cast<uchar *>(
6387 my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)))))) {
6388 #endif
6389 goto error;
6390 }
6391
6392 DBUG_RETURN(HA_EXIT_SUCCESS);
6393
6394 error:
6395 // If we're here then this means that at some point above an allocation may
6396 // have failed. To avoid any resource leaks and maintain a clear contract
6397 // we'll clean up before returning the error code.
6398 free_key_buffers();
6399
6400 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
6401 }
6402
6403 void ha_rocksdb::free_key_buffers() {
6404 my_free(m_pk_tuple);
6405 m_pk_tuple = nullptr;
6406
6407 my_free(m_pk_packed_tuple);
6408 m_pk_packed_tuple = nullptr;
6409
6410 my_free(m_sk_packed_tuple);
6411 m_sk_packed_tuple = nullptr;
6412
6413 my_free(m_sk_match_prefix_buf);
6414 m_sk_match_prefix_buf = nullptr;
6415
6416 my_free(m_sk_packed_tuple_old);
6417 m_sk_packed_tuple_old = nullptr;
6418
6419 my_free(m_end_key_packed_tuple);
6420 m_end_key_packed_tuple = nullptr;
6421
6422 my_free(m_pack_buffer);
6423 m_pack_buffer = nullptr;
6424
6425 my_free(m_dup_sk_packed_tuple);
6426 m_dup_sk_packed_tuple = nullptr;
6427
6428 my_free(m_dup_sk_packed_tuple_old);
6429 m_dup_sk_packed_tuple_old = nullptr;
6430
6431 my_free(m_scan_it_lower_bound);
6432 m_scan_it_lower_bound = nullptr;
6433
6434 my_free(m_scan_it_upper_bound);
6435 m_scan_it_upper_bound = nullptr;
6436 }
6437
6438 /**
6439 @return
6440 HA_EXIT_SUCCESS OK
6441 other HA_ERR error code (can be SE-specific)
6442 */
6443 int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
6444 DBUG_ENTER_FUNC();
6445
6446 int err = close();
6447 if (err) {
6448 DBUG_RETURN(err);
6449 }
6450
6451 m_table_handler = rdb_open_tables.get_table_handler(name);
6452
6453 if (m_table_handler == nullptr) {
6454 DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6455 }
6456
6457 my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
6458 nullptr);
6459 m_io_perf.init(&m_table_handler->m_table_perf_context,
6460 &m_table_handler->m_io_perf_read, &stats);
6461
6462 Rdb_perf_context_guard guard(&m_io_perf,
6463 rocksdb_perf_context_level(ha_thd()));
6464
6465 std::string fullname;
6466 err = rdb_normalize_tablename(name, &fullname);
6467 if (err != HA_EXIT_SUCCESS) {
6468 DBUG_RETURN(err);
6469 }
6470
6471 m_tbl_def = ddl_manager.find(fullname);
6472 if (m_tbl_def == nullptr) {
6473 my_error(ER_INTERNAL_ERROR, MYF(0),
6474 "Attempt to open a table that is not present in RocksDB-SE data "
6475 "dictionary");
6476 DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6477 }
6478
6479 m_lock_rows = RDB_LOCK_NONE;
6480 m_key_descr_arr = m_tbl_def->m_key_descr_arr;
6481
6482 /*
6483 Full table scan actually uses primary key
6484 (UPDATE needs to know this, otherwise it will go into infinite loop on
6485 queries like "UPDATE tbl SET pk=pk+100")
6486 */
6487 key_used_on_scan = table->s->primary_key;
6488
6489 // close() above has already called free_key_buffers(). No need to do it here.
6490 err = alloc_key_buffers(table, m_tbl_def);
6491
6492 if (err) {
6493 DBUG_RETURN(err);
6494 }
6495
6496 /*
6497 init_with_fields() is used to initialize table flags based on the field
6498 definitions in table->field[].
6499 It is called by open_binary_frm(), but that function calls the method for
6500 a temporary ha_rocksdb object which is later destroyed.
6501
6502 If we are here in ::open(), then init_with_fields() has not been called
6503 for this object. Call it ourselves, we want all member variables to be
6504 properly initialized.
6505 */
6506 init_with_fields();
6507
6508 /* Initialize decoder */
6509 m_converter.reset(new Rdb_converter(ha_thd(), m_tbl_def, table));
6510
6511 /*
6512 Update m_ttl_bytes address to same as Rdb_converter's m_ttl_bytes.
6513 Remove this code after moving convert_record_to_storage_format() into
6514 Rdb_converter class.
6515 */
6516 m_ttl_bytes = m_converter->get_ttl_bytes_buffer();
6517
6518 info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
6519
6520 /*
6521 The following load_XXX code calls row decode functions, and they do
6522 that without having done ::external_lock() or index_init()/rnd_init().
6523 (Note: this also means we're doing a read when there was no
6524 rdb_converter::setup_field_encoders() call)
6525
6526 Initialize the necessary variables for them:
6527 */
6528
6529 /* Load auto_increment value only once on first use. */
6530 if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
6531 load_auto_incr_value();
6532 }
6533
6534 /* Load hidden pk only once on first use. */
6535 if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
6536 (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
6537 free_key_buffers();
6538 DBUG_RETURN(err);
6539 }
6540
6541 /* Index block size in MyRocks: used by MySQL in query optimization */
6542 stats.block_size = rocksdb_tbl_options->block_size;
6543
6544 DBUG_RETURN(HA_EXIT_SUCCESS);
6545 }
6546
/*
  Close the table: drop references to shared descriptors, free the key
  buffers, release our reference on the shared table handler, and free the
  per-handler string buffers. Safe to call on a never-opened handler.
*/
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  // These are shared objects owned elsewhere (ddl_manager); just drop the
  // references.
  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;
  m_converter = nullptr;
  free_key_buffers();

  if (m_table_handler != nullptr) {
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_last_rowkey.mem_free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6568
// Human-readable messages for the MyRocks-specific handler error codes.
// Order must exactly match the HA_ERR_ROCKSDB_FIRST..HA_ERR_ROCKSDB_LAST
// code range — enforced by the static_assert below.
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY.",
    "Specifying DATA DIRECTORY for an individual table is not supported.",
    "Specifying INDEX DIRECTORY for an individual table is not supported.",
    "RocksDB commit failed.",
    "Failure during bulk load operation.",
    "Found data corruption.",
    "CRC checksum mismatch.",
    "Invalid table.",
    "Could not access RocksDB properties.",
    "File I/O error during merge/sort operation.",
    "RocksDB status: not found.",
    "RocksDB status: corruption.",
    "RocksDB status: not supported.",
    "RocksDB status: invalid argument.",
    "RocksDB status: io error.",
    "RocksDB status: no space.",
    "RocksDB status: merge in progress.",
    "RocksDB status: incomplete.",
    "RocksDB status: shutdown in progress.",
    "RocksDB status: timed out.",
    "RocksDB status: aborted.",
    "RocksDB status: lock limit reached.",
    "RocksDB status: busy.",
    "RocksDB status: deadlock.",
    "RocksDB status: expired.",
    "RocksDB status: try again.",
};

static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes");
6601
6602 static const char *rdb_get_error_messages(int error) {
6603 if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
6604 return rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST];
6605 }
6606 return "";
6607 }
6608
/*
  Append the message text for a MyRocks error code to buf (empty string for
  non-MyRocks codes). Returns false, meaning the error is not temporary.
*/
bool ha_rocksdb::get_error_message(const int error, String *const buf) {
  DBUG_ENTER_FUNC();

  // MyRocks codes must live outside the server's built-in HA_ERR range so
  // they never collide with it.
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");

  assert(buf != nullptr);

  buf->append(rdb_get_error_messages(error));

  // We can be called with the values which are < HA_ERR_FIRST because most
  // MySQL internal functions will just return HA_EXIT_FAILURE in case of
  // an error.

  DBUG_RETURN(false);
}
6627
6628 /*
6629 Generalized way to convert RocksDB status errors into MySQL error code, and
6630 print error message.
6631
6632 Each error code below maps to a RocksDB status code found in:
6633 rocksdb/include/rocksdb/status.h
6634 */
6635 int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s,
6636 const char *opt_msg) {
6637 assert(!s.ok());
6638
6639 int err;
6640 switch (s.code()) {
6641 case rocksdb::Status::Code::kOk:
6642 err = HA_EXIT_SUCCESS;
6643 break;
6644 case rocksdb::Status::Code::kNotFound:
6645 err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND;
6646 break;
6647 case rocksdb::Status::Code::kCorruption:
6648 err = HA_ERR_ROCKSDB_STATUS_CORRUPTION;
6649 break;
6650 case rocksdb::Status::Code::kNotSupported:
6651 err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED;
6652 break;
6653 case rocksdb::Status::Code::kInvalidArgument:
6654 err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT;
6655 break;
6656 case rocksdb::Status::Code::kIOError:
6657 err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE
6658 : HA_ERR_ROCKSDB_STATUS_IO_ERROR;
6659 break;
6660 case rocksdb::Status::Code::kMergeInProgress:
6661 err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS;
6662 break;
6663 case rocksdb::Status::Code::kIncomplete:
6664 err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE;
6665 break;
6666 case rocksdb::Status::Code::kShutdownInProgress:
6667 err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS;
6668 break;
6669 case rocksdb::Status::Code::kTimedOut:
6670 err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT;
6671 break;
6672 case rocksdb::Status::Code::kAborted:
6673 err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT
6674 : HA_ERR_ROCKSDB_STATUS_ABORTED;
6675 break;
6676 case rocksdb::Status::Code::kBusy:
6677 err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK
6678 : HA_ERR_ROCKSDB_STATUS_BUSY;
6679 break;
6680 case rocksdb::Status::Code::kExpired:
6681 err = HA_ERR_ROCKSDB_STATUS_EXPIRED;
6682 break;
6683 case rocksdb::Status::Code::kTryAgain:
6684 err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN;
6685 break;
6686 default:
6687 assert(0);
6688 return -1;
6689 }
6690
6691 std::string errMsg;
6692 if (s.IsLockLimit()) {
6693 errMsg =
6694 "Operation aborted: Failed to acquire lock due to "
6695 "rocksdb_max_row_locks limit";
6696 } else {
6697 errMsg = s.ToString();
6698 }
6699
6700 if (opt_msg) {
6701 std::string concatenated_error = errMsg + " (" + std::string(opt_msg) + ")";
6702 my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(),
6703 rocksdb_hton_name);
6704 } else {
6705 my_error(ER_GET_ERRMSG, MYF(0), s.code(), errMsg.c_str(),
6706 rocksdb_hton_name);
6707 }
6708
6709 return err;
6710 }
6711
/* MyRocks supports only the following collations for indexed columns */
static const std::set<const my_core::CHARSET_INFO *> RDB_INDEX_COLLATIONS = {
    &my_charset_bin, &my_charset_utf8_bin, &my_charset_latin1_bin};

// Returns true if the field's collation can be used in a MyRocks index.
// Only character-ish types are restricted; all other types always qualify.
static bool rdb_is_index_collation_supported(
    const my_core::Field *const field) {
  const my_core::enum_field_types type = field->real_type();
  /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
  if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
      type == MYSQL_TYPE_BLOB || type == MYSQL_TYPE_JSON) {
    // Accept either a binary collation from the fixed list or any collation
    // the rdb_is_collation_supported() check allows.
    return (RDB_INDEX_COLLATIONS.find(field->charset()) !=
            RDB_INDEX_COLLATIONS.end()) ||
           rdb_is_collation_supported(field->charset());
  }
  return true;
}
6728
6729 /*
6730 Create structures needed for storing data in rocksdb. This is called when the
6731 table is created. The structures will be shared by all TABLE* objects.
6732
6733 @param
6734 table_arg Table with definition
6735 db_table "dbname.tablename"
6736 len strlen of the above
6737 tbl_def_arg tbl_def whose key_descr is being created/populated
6738 old_tbl_def_arg tbl_def from which keys are being copied over from
6739 (for use during inplace alter)
6740
6741 @return
6742 0 - Ok
6743 other - error, either given table ddl is not supported by rocksdb or OOM.
6744 */
int ha_rocksdb::create_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg /* = nullptr */,
    const Rdb_tbl_def *const old_tbl_def_arg
    /* = nullptr */) const {
  DBUG_ENTER_FUNC();

  assert(table_arg != nullptr);
  assert(table_arg->s != nullptr);

  // Test-only hooks used to simulate a failure (or a crash) while
  // recreating key definitions during TRUNCATE TABLE.
  DBUG_EXECUTE_IF("rocksdb_truncate_failure", {
    my_error(ER_INTERNAL_ERROR, MYF(0), "Simulated truncation failure.");
    DBUG_RETURN(HA_EXIT_FAILURE);
  });

  DBUG_EXECUTE_IF("rocksdb_truncate_failure_crash", DBUG_SUICIDE(););

  /*
    These need to be one greater than MAX_INDEXES since the user can create
    MAX_INDEXES secondary keys and no primary key which would cause us
    to generate a hidden one.
  */
  std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;

  /*
    NOTE: All new column families must be created before new index numbers are
    allocated to each key definition. See below for more details.
    http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
  */
  if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  uint64 ttl_duration = 0;
  std::string ttl_column;
  uint ttl_field_offset;

  uint err;
  // Pull TTL settings (duration and optional TTL column) out of the table
  // definition; a malformed TTL specification fails the whole operation.
  if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg,
                                               &ttl_duration))) {
    DBUG_RETURN(err);
  }

  if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column,
                                          &ttl_field_offset))) {
    DBUG_RETURN(err);
  }

  /* We don't currently support TTL on tables with hidden primary keys. */
  if (ttl_duration > 0 && has_hidden_pk(table_arg)) {
    my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0));
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  /*
    If TTL duration is not specified but TTL column was specified, throw an
    error because TTL column requires duration.
  */
  if (ttl_duration == 0 && !ttl_column.empty()) {
    my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str());
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (!old_tbl_def_arg) {
    /*
      old_tbl_def doesn't exist. this means we are in the process of creating
      a new table.

      Get the index numbers (this will update the next_index_number)
      and create Rdb_key_def structures.
    */
    for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
      if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], cfs[i],
                         ttl_duration, ttl_column)) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }
    }
  } else {
    /*
      old_tbl_def exists. This means we are creating a new tbl_def as part of
      in-place alter table. Copy over existing keys from the old_tbl_def and
      generate the necessary new key definitions if any.
    */
    if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
                                old_tbl_def_arg, cfs, ttl_duration,
                                ttl_column)) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6837
6838 /*
6839 Checks index parameters and creates column families needed for storing data
6840 in rocksdb if necessary.
6841
  @param in
    table_arg     Table with definition
    tbl_def_arg   Table def structure being populated
6846
6847 @param out
6848 cfs CF info for each key definition in 'key_info' order
6849
6850 @return
6851 0 - Ok
6852 other - error
6853 */
int ha_rocksdb::create_cfs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
  DBUG_ENTER_FUNC();

  assert(table_arg != nullptr);
  assert(table_arg->s != nullptr);
  assert(tbl_def_arg != nullptr);

  char tablename_sys[NAME_LEN + 1];

  // Convert the filesystem-encoded name to the user-visible table name;
  // used below to match against the collation exception list.
  my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
                                 tablename_sys, sizeof(tablename_sys));

  uint primary_key_index = pk_index(table_arg, tbl_def_arg);
  /*
    The first loop checks the index parameters and creates
    column families if necessary.
  */
  THD *const thd = my_core::thd_get_current_thd();
  for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
    std::shared_ptr<rocksdb::ColumnFamilyHandle> cf_handle;

    /*
      Skip collation checks on truncation since we might be recreating the
      table that had unsupported collations and we don't want to fail the
      truncation.
      Also skipped for the hidden PK (no user columns) and for temporary
      tables created by the server (tmp_file_prefix).
    */
    if (rocksdb_strict_collation_check &&
        thd->lex->sql_command != SQLCOM_TRUNCATE &&
        !is_hidden_pk(i, table_arg, tbl_def_arg) &&
        tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
      for (uint part = 0; part < table_arg->key_info[i].actual_key_parts;
           part++) {
        if (!rdb_is_index_collation_supported(
                table_arg->key_info[i].key_part[part].field) &&
            !rdb_collation_exceptions->match(tablename_sys)) {
          // Build a comma-separated list of supported collations for the
          // error/warning text.
          std::string collation_err;
          for (const auto &coll : RDB_INDEX_COLLATIONS) {
            if (collation_err != "") {
              collation_err += ", ";
            }
            collation_err += coll->name;
          }

          if (rocksdb_error_on_suboptimal_collation) {
            // Strict mode: fail the DDL outright.
            my_error(ER_UNSUPPORTED_COLLATION, MYF(0),
                     tbl_def_arg->full_tablename().c_str(),
                     table_arg->key_info[i].key_part[part].field->field_name,
                     collation_err.c_str());
            DBUG_RETURN(HA_EXIT_FAILURE);
          } else {
            // Lenient mode: tolerate the collation but warn the user.
            push_warning_printf(
                ha_thd(), Sql_condition::SL_WARNING, HA_ERR_INTERNAL_ERROR,
                "Indexed column %s.%s uses a collation that does not allow "
                "index-only access in secondary key and has reduced disk space "
                "efficiency in primary key.",
                tbl_def_arg->full_tablename().c_str(),
                table_arg->key_info[i].key_part[part].field->field_name);
          }
        }
      }
    }

    // Internal consistency check to make sure that data in TABLE and
    // Rdb_tbl_def structures matches. Either both are missing or both are
    // specified. Yes, this is critical enough to make it into SHIP_ASSERT.
    SHIP_ASSERT(!table_arg->part_info == tbl_def_arg->base_partition().empty());

    // Generate the name for the column family to use.
    bool per_part_match_found = false;
    std::string cf_name =
        generate_cf_name(i, table_arg, tbl_def_arg, &per_part_match_found);

    // Prevent create from using the system column family.
    if (cf_name == DEFAULT_SYSTEM_CF_NAME) {
      my_error(ER_WRONG_ARGUMENTS, MYF(0),
               "column family not valid for storing index data.");
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    // Test-only sync points used to coordinate with a concurrent CF drop.
    DBUG_EXECUTE_IF("rocksdb_create_primary_cf", {
      if (cf_name == "cf_primary_key") {
        THD *const thd = my_core::thd_get_current_thd();
        static constexpr char act[] =
            "now signal ready_to_mark_cf_dropped_in_create_cfs "
            "wait_for mark_cf_dropped_done_in_create_cfs";
        assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
      }
    });

    DBUG_EXECUTE_IF("rocksdb_create_secondary_cf", {
      if (cf_name == "cf_secondary_key") {
        THD *const thd = my_core::thd_get_current_thd();
        static constexpr char act[] =
            "now signal ready_to_mark_cf_dropped_in_create_cfs "
            "wait_for mark_cf_dropped_done_in_create_cfs";
        assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
      }
    });

    // if not specified, use default CF name
    if (cf_name.empty()) {
      if (i != primary_key_index && rocksdb_use_default_sk_cf)
        cf_name = DEFAULT_SK_CF_NAME;
      else
        cf_name = DEFAULT_CF_NAME;
    }

    // Here's how `get_or_create_cf` will use the input parameters:
    //
    // `cf_name` - will be used as a CF name.
    {
      // Hold the dict manager lock so the CF cannot be dropped between the
      // lookup/creation and the dropped-state / flags checks below.
      std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
      cf_handle = cf_manager.get_or_create_cf(rdb, cf_name, !rocksdb_no_create_column_family);
      if (!cf_handle) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      uint32 cf_id = cf_handle->GetID();

      // If the cf is marked as dropped, we fail it here.
      // The cf can be dropped after this point, we will
      // check again when committing metadata changes.
      if (dict_manager.get_dropped_cf(cf_id)) {
        my_error(ER_CF_DROPPED, MYF(0), cf_name.c_str());
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      if (cf_manager.create_cf_flags_if_needed(&dict_manager,
                                               cf_handle->GetID(), cf_name,
                                               per_part_match_found)) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }
    }

    // The CF can be dropped from cf_manager at this point. This is part of
    // create table or alter table. If the drop happens before metadata are
    // written, create table or alter table will fail.
    auto &cf = (*cfs)[i];

    cf.cf_handle = cf_handle;
    cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str());
    cf.is_per_partition_cf = per_part_match_found;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7002
7003 /*
7004 Create key definition needed for storing data in rocksdb during ADD index
7005 inplace operations.
7006
  @param in
    table_arg       Table with definition
    tbl_def_arg     New table def structure being populated
    old_table_arg   Old(current) table with definition
    old_tbl_def_arg Old(current) table def structure
    cfs             Struct array which contains column family information
    ttl_duration    TTL duration to use for newly created key definitions
    ttl_column      Name of the TTL column, if any
7012
7013 @return
7014 0 - Ok
7015 other - error, either given table ddl is not supported by rocksdb or OOM.
7016 */
int ha_rocksdb::create_inplace_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
    const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs,
    uint64 ttl_duration, const std::string &ttl_column) const {
  DBUG_ENTER_FUNC();

  assert(table_arg != nullptr);
  assert(tbl_def_arg != nullptr);
  assert(old_tbl_def_arg != nullptr);

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::shared_ptr<Rdb_key_def> *const new_key_descr =
      tbl_def_arg->m_key_descr_arr;
  // Maps key name -> position in the old key array, but only for keys whose
  // definition is unchanged and can therefore be carried over as-is.
  const std::unordered_map<std::string, uint> old_key_pos =
      get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
                            old_tbl_def_arg);

  uint i;
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));

    if (it != old_key_pos.end()) {
      /*
        Found matching index in old table definition, so copy it over to the
        new one created.
      */
      const Rdb_key_def &okd = *old_key_descr[it->second];

      const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
      struct Rdb_index_info index_info;
      if (!dict_manager.get_index_info(gl_index_id, &index_info)) {
        // NO_LINT_DEBUG
        sql_print_error(
            "RocksDB: Could not get index information "
            "for Index Number (%u,%u), table %s",
            gl_index_id.cf_id, gl_index_id.index_id,
            old_tbl_def_arg->full_tablename().c_str());
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      // Preserve the existing index's TTL record offset when it carries
      // the TTL flag; UINT_MAX means "no TTL record offset".
      uint32 ttl_rec_offset =
          Rdb_key_def::has_index_flag(index_info.m_index_flags,
                                      Rdb_key_def::TTL_FLAG)
              ? Rdb_key_def::calculate_index_flag_offset(
                    index_info.m_index_flags, Rdb_key_def::TTL_FLAG)
              : UINT_MAX;

      /*
        We can't use the copy constructor because we need to update the
        keynr within the pack_info for each field and the keyno of the keydef
        itself.
      */
      new_key_descr[i] = std::make_shared<Rdb_key_def>(
          okd.get_index_number(), i, okd.get_shared_cf(),
          index_info.m_index_dict_version, index_info.m_index_type,
          index_info.m_kv_version, okd.m_is_reverse_cf,
          okd.m_is_per_partition_cf, okd.m_name.c_str(),
          dict_manager.get_stats(gl_index_id), index_info.m_index_flags,
          ttl_rec_offset, index_info.m_ttl_duration);
    } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
                              cfs[i], ttl_duration, ttl_column)) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    assert(new_key_descr[i] != nullptr);
    new_key_descr[i]->setup(table_arg, tbl_def_arg);
  }

  // NOTE(review): indexes new_key_descr[0] — assumes m_key_count > 0, which
  // holds since every table has at least a (possibly hidden) primary key.
  tbl_def_arg->m_tbl_stats.set(new_key_descr[0]->m_stats.m_rows, 0, 0);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7091
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  assert(table_arg != nullptr);
  assert(old_table_arg != nullptr);
  assert(tbl_def_arg != nullptr);
  assert(old_tbl_def_arg != nullptr);

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  // Result: key name -> position in the OLD key array, only for keys whose
  // definition is identical in the new table (i.e. safe to reuse).
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  // Index the new table's key names by position for quick lookup below.
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      // The hidden PK has no user-visible definition that could have
      // changed, so it is always carried over.
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    old_key_pos[old_key->name] = i;
  }

  DBUG_RETURN(old_key_pos);
}
7162
7163 /* Check to see if two keys are identical. */
7164 int ha_rocksdb::compare_keys(const KEY *const old_key,
7165 const KEY *const new_key) const {
7166 DBUG_ENTER_FUNC();
7167
7168 assert(old_key != nullptr);
7169 assert(new_key != nullptr);
7170
7171 /* Check index name. */
7172 if (strcmp(old_key->name, new_key->name) != 0) {
7173 DBUG_RETURN(HA_EXIT_FAILURE);
7174 }
7175
7176 /* If index algorithms are different then keys are different. */
7177 if (old_key->algorithm != new_key->algorithm) {
7178 DBUG_RETURN(HA_EXIT_FAILURE);
7179 }
7180
7181 /* Check that the key is identical between old and new tables. */
7182 if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) {
7183 DBUG_RETURN(HA_EXIT_FAILURE);
7184 }
7185
7186 /* Check index comment. (for column family changes) */
7187 std::string old_comment(old_key->comment.str, old_key->comment.length);
7188 std::string new_comment(new_key->comment.str, new_key->comment.length);
7189 if (old_comment.compare(new_comment) != 0) {
7190 DBUG_RETURN(HA_EXIT_FAILURE);
7191 }
7192
7193 DBUG_RETURN(HA_EXIT_SUCCESS);
7194 }
7195
7196 /* Check two keys to ensure that key parts within keys match */
7197 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
7198 const KEY *const new_key) const {
7199 DBUG_ENTER_FUNC();
7200
7201 assert(old_key != nullptr);
7202 assert(new_key != nullptr);
7203
7204 /* Skip if key parts do not match, as it is a different key */
7205 if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
7206 DBUG_RETURN(HA_EXIT_FAILURE);
7207 }
7208
7209 /* Check to see that key parts themselves match */
7210 for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
7211 if (strcmp(old_key->key_part[i].field->field_name,
7212 new_key->key_part[i].field->field_name) != 0) {
7213 DBUG_RETURN(HA_EXIT_FAILURE);
7214 }
7215
7216 /* Check if prefix index key part length has changed */
7217 if (old_key->key_part[i].length != new_key->key_part[i].length) {
7218 DBUG_RETURN(HA_EXIT_FAILURE);
7219 }
7220 }
7221
7222 DBUG_RETURN(HA_EXIT_SUCCESS);
7223 }
7224
7225 /*
7226 Create key definition needed for storing data in rocksdb.
7227 This can be called either during CREATE table or doing ADD index operations.
7228
7229 @param in
7230 table_arg Table with definition
7231 i Position of index being created inside table_arg->key_info
7232 tbl_def_arg Table def structure being populated
7233 cf_info Struct which contains column family information
7234
7235 @param out
7236 new_key_def Newly created index definition.
7237
7238 @return
7239 0 - Ok
7240 other - error, either given table ddl is not supported by rocksdb or OOM.
7241 */
7242 int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint i,
7243 const Rdb_tbl_def *const tbl_def_arg,
7244 std::shared_ptr<Rdb_key_def> *const new_key_def,
7245 const struct key_def_cf_info &cf_info,
7246 uint64 ttl_duration,
7247 const std::string &ttl_column) const {
7248 DBUG_ENTER_FUNC();
7249
7250 assert(new_key_def != nullptr);
7251 assert(*new_key_def == nullptr);
7252
7253 const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
7254 const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
7255 uchar index_type;
7256 uint16_t kv_version;
7257
7258 if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
7259 index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
7260 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7261 } else if (i == table_arg->s->primary_key) {
7262 index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
7263 uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7264 kv_version = pk_latest_version;
7265 } else {
7266 index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
7267 uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
7268 kv_version = sk_latest_version;
7269 }
7270
7271 // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as
7272 // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a
7273 // primary key or secondary key.
7274 DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT", {
7275 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1;
7276 });
7277
7278 DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", {
7279 if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) {
7280 kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2;
7281 }
7282 });
7283
7284 uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0);
7285
7286 uint32 ttl_rec_offset =
7287 Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG)
7288 ? Rdb_key_def::calculate_index_flag_offset(index_flags,
7289 Rdb_key_def::TTL_FLAG)
7290 : UINT_MAX;
7291
7292 const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
7293 *new_key_def = std::make_shared<Rdb_key_def>(
7294 index_id, i, cf_info.cf_handle, index_dict_version, index_type,
7295 kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name,
7296 Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration);
7297
7298 if (!ttl_column.empty()) {
7299 (*new_key_def)->m_ttl_column = ttl_column;
7300 }
7301 // initialize key_def
7302 (*new_key_def)->setup(table_arg, tbl_def_arg);
7303 DBUG_RETURN(HA_EXIT_SUCCESS);
7304 }
7305
/*
  Check whether a table name is already in normalized "<db>.<table>" form.

  Names coming from the server are in filesystem form "./<db>/<table>"
  (see rdb_normalize_tablename()); a name is normalized exactly when it
  does NOT start with the "./" prefix.

  FIX: the predicate is !(starts with "./"), i.e. by De Morgan
  size < 2 || name[0] != '.' || name[1] != '/'. The previous version used
  '&&' instead of '||', which wrongly classified names such as ".foo" or
  "a/b" as non-normalized.
*/
bool rdb_is_tablename_normalized(const std::string &tablename) {
  return tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/';
}
7309
7310 int rdb_normalize_tablename(const std::string &tablename,
7311 std::string *const strbuf) {
7312 assert(strbuf != nullptr);
7313
7314 if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/') {
7315 assert(0); // We were not passed table name?
7316 return HA_ERR_ROCKSDB_INVALID_TABLE;
7317 }
7318
7319 size_t pos = tablename.find_first_of('/', 2);
7320 if (pos == std::string::npos) {
7321 assert(0); // We were not passed table name?
7322 return HA_ERR_ROCKSDB_INVALID_TABLE;
7323 }
7324
7325 *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
7326
7327 return HA_EXIT_SUCCESS;
7328 }
7329
7330 int rdb_make_canonical_tablename(const std::string &tablename,
7331 std::string *const strbuf) {
7332 assert(strbuf != nullptr);
7333
7334 size_t pos = tablename.find_first_of('.');
7335 if (pos == std::string::npos) {
7336 assert(0);
7337 return HA_ERR_ROCKSDB_INVALID_TABLE;
7338 }
7339
7340 *strbuf = "./" + tablename.substr(0, pos) + "/" + tablename.substr(pos + 1);
7341
7342 return HA_EXIT_SUCCESS;
7343 }
7344
7345 /*
7346 Check to see if the user's original statement includes foreign key
7347 references
7348 */
bool ha_rocksdb::contains_foreign_key(THD *const thd) {
  bool success;
  // Scan the raw SQL text of the user's statement; there is no parse-tree
  // information about foreign keys available to the storage engine here.
  const char *str = thd->query().str;

  assert(str != nullptr);

  while (*str != '\0') {
    // Scan from our current pos looking for 'FOREIGN'
    str = rdb_find_in_string(str, "FOREIGN", &success);
    if (!success) {
      return false;
    }

    // Skip past the found "FOREIGN"
    str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
    assert(success);

    // "FOREIGN" must be followed by whitespace to be the keyword (and not,
    // e.g., a prefix of an identifier).
    if (!my_isspace(&my_charset_bin, *str)) {
      return false;
    }

    // See if the next token is 'KEY'
    str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
    if (!success) {
      continue;
    }

    // See if the next token is '('
    str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    if (!success) {
      // There is an optional index id after 'FOREIGN KEY', skip it
      str = rdb_skip_id(&my_charset_bin, str);

      // Now check for '(' again
      str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    }

    // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
    // a foreign key clause.
    return success;
  }

  // We never found a valid foreign key clause
  return false;
}
7394
7395 /**
7396 @brief
7397 splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
7398 the <dbname>, <tablename> and <part_no> components.
7399
  @param db returns database name/table_schema
  @param table returns tablename
  @param partition returns partition suffix if there is one
7403 @return HA_EXIT_SUCCESS on success, non-zero on failure to split
7404 */
7405 int rdb_split_normalized_tablename(const std::string &fullname,
7406 std::string *const db,
7407 std::string *const table,
7408 std::string *const partition) {
7409 assert(!fullname.empty());
7410
7411 #define RDB_PARTITION_STR "#P#"
7412
7413 /* Normalize returns dbname.tablename. */
7414 size_t dotpos = fullname.find('.');
7415
7416 /* Invalid table name? */
7417 if (dotpos == std::string::npos) {
7418 return HA_ERR_ROCKSDB_INVALID_TABLE;
7419 }
7420
7421 // Table must have a database name associated with it.
7422 assert(dotpos > 0);
7423
7424 if (db != nullptr) {
7425 *db = fullname.substr(0, dotpos);
7426 }
7427
7428 dotpos++;
7429
7430 const size_t partpos =
7431 fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
7432
7433 if (partpos != std::string::npos) {
7434 assert(partpos >= dotpos);
7435
7436 if (table != nullptr) {
7437 *table = fullname.substr(dotpos, partpos - dotpos);
7438 }
7439
7440 if (partition != nullptr) {
7441 *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
7442 }
7443 } else if (table != nullptr) {
7444 *table = fullname.substr(dotpos);
7445 }
7446
7447 return HA_EXIT_SUCCESS;
7448 }
7449
7450 /*
7451 Generates the normalized tablename using as many of the given arguments as
7452 possible. Any of the three arguments to <db>.<table>#P#<partition> can be
7453 null/empty, but return string will only ever be of the form
7454 <db>
7455 <db>.<table>
7456 <db>.<table>#P#<partition>
7457 <table>
7458 <table>#P#<partition>
7459 */
7460 void rdb_gen_normalized_tablename(const std::string *db,
7461 const std::string *table,
7462 const std::string *partition,
7463 std::string *fullname) {
7464 if (!fullname) return;
7465 fullname->clear();
7466 if (db && db->length() > 0) *fullname += *db;
7467
7468 /* If table was not passed in, the partition will be ignored too */
7469 if (!table || table->length() == 0) return;
7470
7471 if (fullname->length() > 0) *fullname += ".";
7472 *fullname += *table;
7473
7474 if (partition && partition->length() > 0) {
7475 *fullname += std::string(RDB_PARTITION_STR) + *partition;
7476 }
7477 }
7478
7479 /*
7480 Create a table's Rdb_tbl_def and its Rdb_key_defs and store table information
7481 into MyRocks Data Dictionary
7482 The method is called during create table/partition, truncate table/partition
7483
7484 @param table_name IN table's name formated as
7485 'dbname.tablename'
7486 @param table_arg IN sql table
7487 @param auto_increment_value IN specified table's auto increment value
7488
7489 @return
7490 HA_EXIT_SUCCESS OK
7491 other HA_ERR error code (can be SE-specific)
7492 */
7493 int ha_rocksdb::create_table(const std::string &table_name,
7494 const TABLE *table_arg,
7495 ulonglong auto_increment_value) {
7496 DBUG_ENTER_FUNC();
7497
7498 int err;
7499
7500 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
7501 rocksdb::WriteBatch *const batch = wb.get();
7502
7503 /* Create table/key descriptions and put them into the data dictionary */
7504 m_tbl_def = new Rdb_tbl_def(table_name);
7505
7506 uint n_keys = table_arg->s->keys;
7507
7508 /*
7509 If no primary key found, create a hidden PK and place it inside table
7510 definition
7511 */
7512 if (has_hidden_pk(table_arg)) {
7513 n_keys += 1;
7514 // reset hidden pk id
7515 // the starting valid value for hidden pk is 1
7516 m_tbl_def->m_hidden_pk_val = 1;
7517 }
7518
7519 m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
7520 m_tbl_def->m_key_count = n_keys;
7521 m_tbl_def->m_key_descr_arr = m_key_descr_arr;
7522
7523 err = create_key_defs(table_arg, m_tbl_def);
7524 if (err != HA_EXIT_SUCCESS) {
7525 goto error;
7526 }
7527
7528 m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
7529
7530 if (auto_increment_value) {
7531 bool autoinc_upgrade_test = false;
7532 m_tbl_def->m_auto_incr_val = auto_increment_value;
7533 DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;);
7534 if (!autoinc_upgrade_test) {
7535 auto s = dict_manager.put_auto_incr_val(
7536 batch, m_tbl_def->get_autoincr_gl_index_id(),
7537 m_tbl_def->m_auto_incr_val);
7538 if (!s.ok()) {
7539 goto error;
7540 }
7541 }
7542 }
7543
7544 DBUG_EXECUTE_IF("rocksdb_create_table", {
7545 THD *const thd = my_core::thd_get_current_thd();
7546 static constexpr char act[] =
7547 "now signal ready_to_mark_cf_dropped_in_create_table "
7548 "wait_for mark_cf_dropped_done_in_create_table";
7549 assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
7550 });
7551
7552 {
7553 std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
7554 err = ddl_manager.put_and_write(m_tbl_def, batch);
7555 if (err != HA_EXIT_SUCCESS) {
7556 goto error;
7557 }
7558
7559 err = dict_manager.commit(batch);
7560 if (err != HA_EXIT_SUCCESS) {
7561 goto error;
7562 }
7563 }
7564
7565 DBUG_RETURN(HA_EXIT_SUCCESS);
7566
7567 error:
7568 /* Delete what we have allocated so far */
7569 delete m_tbl_def;
7570 m_tbl_def = nullptr;
7571 m_key_descr_arr = nullptr;
7572
7573 DBUG_RETURN(err);
7574 }
7575
7576 /**
7577 @brief
7578 create() is called to create a table. The variable name will have the name
7579 of the table.
7580
7581 @details
7582 When create() is called you do not need to worry about
7583 opening the table. Also, the .frm file will have already been
7584 created so adjusting create_info is not necessary. You can overwrite
7585 the .frm file at this point if you wish to change the table
7586 definition, but there are no methods currently provided for doing
7587 so.
7588
7589 Called from handle.cc by ha_create_table().
7590
7591 @return
7592 HA_EXIT_SUCCESS OK
7593 other HA_ERR error code (can be SE-specific)
7594
7595 @see
7596 ha_create_table() in handle.cc
7597 */
7598
int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
                       HA_CREATE_INFO *const create_info) {
  DBUG_ENTER_FUNC();

  assert(table_arg != nullptr);
  assert(create_info != nullptr);

  // Reject unsupported table options up front so the user gets a clear
  // error instead of silently ignored options.
  if (unlikely(create_info->data_file_name)) {
    // DATA DIRECTORY is used to create tables under a specific location
    // outside the MySQL data directory. We don't support this for MyRocks.
    // The `rocksdb_datadir` setting should be used to configure RocksDB data
    // directory.
    DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED);
  }

  if (unlikely(create_info->index_file_name)) {
    // Similar check for INDEX DIRECTORY as well.
    DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED);
  }

  if (unlikely(create_info->encrypt_type.length)) {
    my_error(ER_NOT_SUPPORTED_YET, MYF(0),
             "ENCRYPTION for the RocksDB storage engine");
    DBUG_RETURN(HA_WRONG_CREATE_OPTION);
  }

  if (unlikely(create_info->tablespace)) {
    my_error(ER_NOT_SUPPORTED_YET, MYF(0),
             "TABLESPACEs for the RocksDB storage engine");
    DBUG_RETURN(HA_WRONG_CREATE_OPTION);
  }

  if (unlikely(create_info->compress.length)) {
    my_error(ER_NOT_SUPPORTED_YET, MYF(0),
             "InnoDB page COMPRESSION for the RocksDB storage engine");
    DBUG_RETURN(HA_WRONG_CREATE_OPTION);
  }

  int err;
  /*
    Construct dbname.tablename ourselves, because partitioning
    passes strings like "./test/t14#P#p0" for individual partitions,
    while table_arg->s->table_name has none of that.
  */
  std::string str;
  err = rdb_normalize_tablename(name, &str);
  if (err != HA_EXIT_SUCCESS) {
    DBUG_RETURN(err);
  }

  // FOREIGN KEY isn't supported yet
  THD *const thd = my_core::thd_get_current_thd();
  if (contains_foreign_key(thd)) {
    my_error(ER_NOT_SUPPORTED_YET, MYF(0),
             "FOREIGN KEY for the RocksDB storage engine");
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }

  // Check whether Data Dictionary contain information
  Rdb_tbl_def *old_tbl = ddl_manager.find(str);
  if (old_tbl != nullptr) {
    if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
      // TRUNCATE arrives here as a create over an existing definition; use
      // the dedicated fast-truncate path.
      DBUG_RETURN(truncate_table(old_tbl, table_arg,
                                 create_info->auto_increment_value));
    } else {
      // A non-TRUNCATE create over an already-known MyRocks table means the
      // SQL layer and the MyRocks dictionary disagree.
      my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name);
      DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
    }
  }

  DBUG_RETURN(create_table(str, table_arg, create_info->auto_increment_value));
}
7671
/*
  Fast-truncates a table by renaming the old table, creating a new one and
  restoring or deleting the old table based on the results from creation.

  @param tbl_def_arg          IN  MyRocks's table structure
  @param table_arg            IN  sql table
  @param auto_increment_value IN  specified table's auto increment value

  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
int ha_rocksdb::truncate_table(Rdb_tbl_def *tbl_def_arg, TABLE *table_arg,
                               ulonglong auto_increment_value) {
  DBUG_ENTER_FUNC();

  /*
    Fast table truncation involves deleting the table and then recreating
    it. However, it is possible recreating the table fails. In this case, a
    table inconsistency might result between SQL and MyRocks where MyRocks is
    missing a table. Since table creation involves modifying keys with the
    original table name, renaming the original table first, and then renaming
    it back in case of creation failure can help restore the pre-truncation
    state.

    If the server were to crash during truncation, the system will end up with
    an inconsistency. Future changes for atomic ddl will resolve this. For now,
    if there are any truncation-renamed tables found during startup, MyRocks
    will automatically remove them.
  */
  std::string orig_tablename = tbl_def_arg->full_tablename();
  std::string dbname, tblname, partition;

  /*
    Rename the table in the data dictionary. Since this thread holds the MDL
    on this table name, the renames are safe: no other thread can access the
    table while they are in progress.
  */
  int err = rdb_split_normalized_tablename(orig_tablename, &dbname, &tblname,
                                           &partition);
  assert(err == 0);
  if (err != HA_EXIT_SUCCESS) DBUG_RETURN(err);
  // Prefix marks the table as "being truncated" so a crash leaves a
  // recognizable leftover that startup can clean up.
  tblname = std::string(TRUNCATE_TABLE_PREFIX) + tblname;

  std::string tmp_tablename;
  rdb_gen_normalized_tablename(&dbname, &tblname, &partition, &tmp_tablename);

  err = rename_table(orig_tablename.c_str(), tmp_tablename.c_str());
  if (err != HA_EXIT_SUCCESS) DBUG_RETURN(err);

  /*
    Attempt to create the table. If this succeeds, then drop the old table.
    Otherwise, try to restore it.
  */
  err = create_table(orig_tablename, table_arg, auto_increment_value);
  bool should_remove_old_table = true;

  /* Restore the old table being truncated if creating the new table failed */
  if (err != HA_EXIT_SUCCESS) {
    int rename_err =
        rename_table(tmp_tablename.c_str(), orig_tablename.c_str());

    /*
      If the rename also fails, we are out of options, but at least try to drop
      the old table contents.
    */
    if (rename_err == HA_EXIT_SUCCESS) {
      should_remove_old_table = false;
    } else {
      // NO_LINT_DEBUG
      sql_print_error(
          "MyRocks: Failure during truncation of table %s "
          "being renamed from %s",
          orig_tablename.c_str(), tmp_tablename.c_str());
      err = rename_err;
    }
  }

  /*
    Since the table was successfully truncated or the name restore failed, no
    error should be returned at this point from trying to delete the old
    table. If the delete_table fails, log it instead.
  */
  Rdb_tbl_def *old_tbl_def = ddl_manager.find(tmp_tablename);
  if (should_remove_old_table && old_tbl_def) {
    // delete_table() operates on m_tbl_def, so point it at the renamed copy
    // before dropping; m_tbl_def is re-resolved below in either case.
    m_tbl_def = old_tbl_def;
    if (delete_table(old_tbl_def) != HA_EXIT_SUCCESS) {
      // NO_LINT_DEBUG
      sql_print_error(
          "Failure when trying to drop table %s during "
          "truncation of table %s",
          tmp_tablename.c_str(), orig_tablename.c_str());
    }
  }

  /* Update the local m_tbl_def reference */
  m_tbl_def = ddl_manager.find(orig_tablename);
  // Rebuild the row converter since the table definition object changed.
  m_converter.reset(new Rdb_converter(ha_thd(), m_tbl_def, table_arg));
  DBUG_RETURN(err);
}
7773
7774 /**
7775 @note
7776 This function is used only when the table has not yet been opened, and
7777 keyread_allowed bitmap doesn't have the correct values yet.
7778
7779 See comment in ha_rocksdb::index_flags() for details.
7780 */
7781
7782 bool ha_rocksdb::check_keyread_allowed(bool &pk_can_be_decoded,
7783 const TABLE_SHARE *table_share, uint inx,
7784 uint part, bool all_parts) {
7785 bool res = true;
7786 KEY *const key_info = &table_share->key_info[inx];
7787
7788 Rdb_field_packing dummy1;
7789 res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
7790 key_info->key_part[part].length);
7791
7792 if (res && all_parts) {
7793 for (uint i = 0; i < part; i++) {
7794 Field *field;
7795 if ((field = key_info->key_part[i].field)) {
7796 Rdb_field_packing dummy;
7797 if (!dummy.setup(nullptr, field, inx, i,
7798 key_info->key_part[i].length)) {
7799 /* Cannot do index-only reads for this column */
7800 res = false;
7801 break;
7802 }
7803 }
7804 }
7805 }
7806
7807 const uint pk = table_share->primary_key;
7808 if (inx == pk && all_parts &&
7809 part + 1 == table_share->key_info[pk].user_defined_key_parts) {
7810 pk_can_be_decoded = res;
7811 }
7812
7813 return res;
7814 }
7815
7816 int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
7817 rocksdb::Iterator *const iter,
7818 const bool /* unused */,
7819 const rocksdb::Slice &key_slice,
7820 const int64_t ttl_filter_ts) {
7821 assert(iter != nullptr);
7822
7823 THD *thd = ha_thd();
7824 /*
7825 We are looking for the first record such that
7826 index_tuple= lookup_tuple.
7827 lookup_tuple may be a prefix of the index.
7828 */
7829 rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);
7830
7831 while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
7832 if (thd && thd->killed) {
7833 return HA_ERR_QUERY_INTERRUPTED;
7834 }
7835 /*
7836 If TTL is enabled we need to check if the given key has already expired
7837 from the POV of the current transaction. If it has, try going to the next
7838 key.
7839 */
7840 if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
7841 rocksdb_smart_next(kd.m_is_reverse_cf, iter);
7842 continue;
7843 }
7844
7845 return HA_EXIT_SUCCESS;
7846 }
7847
7848 /*
7849 Got a record that is not equal to the lookup value, or even a record
7850 from another table.index.
7851 */
7852 return HA_ERR_KEY_NOT_FOUND;
7853 }
7854
7855 int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
7856 const bool full_key_match,
7857 const rocksdb::Slice &key_slice,
7858 const int64_t ttl_filter_ts) {
7859 THD *thd = ha_thd();
7860 /*
7861 We are looking for record with the biggest t.key such that
7862 t.key < lookup_tuple.
7863 */
7864 rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);
7865
7866 while (is_valid(m_scan_it)) {
7867 if (thd && thd->killed) {
7868 return HA_ERR_QUERY_INTERRUPTED;
7869 }
7870 /*
7871 We are using full key and we've hit an exact match, or...
7872
7873 If TTL is enabled we need to check if the given key has already expired
7874 from the POV of the current transaction. If it has, try going to the next
7875 key.
7876 */
7877 if ((full_key_match &&
7878 kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
7879 (kd.has_ttl() &&
7880 should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
7881 rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
7882 continue;
7883 }
7884
7885 return HA_EXIT_SUCCESS;
7886 }
7887
7888 return HA_ERR_KEY_NOT_FOUND;
7889 }
7890
7891 int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
7892 const rocksdb::Slice &key_slice,
7893 const int64_t ttl_filter_ts) {
7894 THD *thd = ha_thd();
7895 /*
7896 We are looking for the first record such that
7897
7898 index_tuple $GT lookup_tuple
7899
7900 with HA_READ_AFTER_KEY, $GT = '>',
7901 with HA_READ_KEY_OR_NEXT, $GT = '>='
7902 */
7903 rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
7904
7905 /*
7906 If TTL is enabled we need to check if the given key has already expired
7907 from the POV of the current transaction. If it has, try going to the next
7908 key.
7909 */
7910 while (is_valid(m_scan_it) && kd.has_ttl() &&
7911 should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
7912 if (thd && thd->killed) {
7913 return HA_ERR_QUERY_INTERRUPTED;
7914 }
7915 rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
7916 }
7917
7918 return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
7919 }
7920
/*
  Position m_scan_it on the record requested by the SQL layer's navigation
  command (find_flag), starting from a seek on key_slice.

  @param kd             IN  definition of the index being scanned
  @param find_flag      IN  SQL-layer navigation command (HA_READ_*)
  @param full_key_match IN  true if the lookup tuple covers the full key
  @param key            IN  lookup tuple in table format (re-packed for the
                            HA_READ_PREFIX_LAST prefix comparison)
  @param keypart_map    IN  which key parts are present in 'key'
  @param key_slice      IN  lookup tuple in packed (mem-comparable) format
  @param move_forward   OUT true if a subsequent scan moves forward
  @param ttl_filter_ts  IN  timestamp used to filter TTL-expired records

  @return
    HA_EXIT_SUCCESS       OK, m_scan_it is positioned
    HA_ERR_KEY_NOT_FOUND  no record satisfies the condition
    other                 HA_ERR error code
*/
int ha_rocksdb::position_to_correct_key(
    const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
    const bool full_key_match, const uchar *const key,
    const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
    bool *const move_forward, const int64_t ttl_filter_ts) {
  int rc = 0;

  *move_forward = true;

  switch (find_flag) {
    case HA_READ_KEY_EXACT:
      rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice,
                          ttl_filter_ts);
      break;
    case HA_READ_BEFORE_KEY:
      *move_forward = false;
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_AFTER_KEY:
    case HA_READ_KEY_OR_NEXT:
      rc = read_after_key(kd, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_KEY_OR_PREV:
    case HA_READ_PREFIX:
      /* This flag is not used by the SQL layer, so we don't support it yet. */
      rc = HA_ERR_UNSUPPORTED;
      break;
    case HA_READ_PREFIX_LAST:
    case HA_READ_PREFIX_LAST_OR_PREV:
      *move_forward = false;
      /*
        Find the last record with the specified index prefix lookup.
        - HA_READ_PREFIX_LAST requires that the record has the
          prefix=lookup (if there are no such records,
          HA_ERR_KEY_NOT_FOUND should be returned).
        - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
          records with prefix=lookup, we should return the last record
          before that.
      */
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0) {
        const rocksdb::Slice &rkey = m_scan_it->key();
        if (!kd.covers_key(rkey)) {
          /* The record we've got is not from this index */
          rc = HA_ERR_KEY_NOT_FOUND;
        } else if (find_flag == HA_READ_PREFIX_LAST) {
          // Re-pack the original lookup prefix (key_slice had successor()
          // applied by the caller, so it cannot be used for this comparison).
          uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_sk_packed_tuple, key, keypart_map);
          rocksdb::Slice lookup_tuple(
              reinterpret_cast<char *>(m_sk_packed_tuple), size);

          // We need to compare the key we've got with the original search
          // prefix.
          if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
            rc = HA_ERR_KEY_NOT_FOUND;
          }
        }
      }
      break;
    default:
      assert(0);
      break;
  }

  return rc;
}
7995
7996 int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
7997 const enum ha_rkey_function &find_flag,
7998 const rocksdb::Slice &slice,
7999 const int bytes_changed_by_succ,
8000 const key_range *const end_key,
8001 uint *const end_key_packed_size) {
8002 if (find_flag == HA_READ_KEY_EXACT) return slice.size();
8003
8004 if (find_flag == HA_READ_PREFIX_LAST) {
8005 /*
8006 We have made the kd.successor(m_sk_packed_tuple) call above.
8007
8008 The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
8009 */
8010 return slice.size() - bytes_changed_by_succ;
8011 }
8012
8013 if (end_key) {
8014 *end_key_packed_size =
8015 kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
8016 end_key->key, end_key->keypart_map);
8017
8018 /*
8019 Calculating length of the equal conditions here. 4 byte index id is
8020 included.
8021 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8022 WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
8023 WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
8024 Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
8025 WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
8026 */
8027 rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
8028 *end_key_packed_size);
8029 return slice.difference_offset(end_slice);
8030 }
8031
8032 /*
8033 On range scan without any end key condition, there is no
8034 eq cond, and eq cond length is the same as index_id size (4 bytes).
8035 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8036 WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
8037 */
8038 return Rdb_key_def::INDEX_NUMBER_SIZE;
8039 }
8040
8041 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
8042 assert(buf != nullptr);
8043
8044 int rc;
8045 const rocksdb::Slice &rkey = m_scan_it->key();
8046 const uint pk_size = rkey.size();
8047 const char *pk_data = rkey.data();
8048
8049 memcpy(m_pk_packed_tuple, pk_data, pk_size);
8050 m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
8051
8052 if (m_lock_rows != RDB_LOCK_NONE) {
8053 /* We need to put a lock and re-read */
8054 rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
8055 } else {
8056 /* Unpack from the row we've read */
8057 const rocksdb::Slice &value = m_scan_it->value();
8058 rc = convert_record_from_storage_format(&rkey, &value, buf);
8059 }
8060
8061 return rc;
8062 }
8063
/*
  Read the row that m_scan_it is positioned on via a secondary key.

  If the lookup is covered by the SK entry and no row lock is needed, the
  record is unpacked directly from the iterator's key/value pair; otherwise
  the primary key is extracted from the SK entry (after skipping records
  rejected by Index Condition Pushdown) and the full row is fetched by rowid.

  @param buf          OUT record buffer (table->record[0] format)
  @param kd           IN  secondary key definition
  @param move_forward IN  scan direction as seen by the SQL layer

  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code
*/
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  assert(buf != nullptr);

  int rc = 0;
  // Only assigned on the success paths below; consumed only when rc == 0.
  uint pk_size;

  /* Get the key columns and primary key value */
  const rocksdb::Slice &rkey = m_scan_it->key();
  const rocksdb::Slice &value = m_scan_it->value();

// Save/restore is only needed in debug builds: the DBUG_EXECUTE_IF below,
// which may flip m_keyread_only, compiles away in release builds.
#if !defined(NDEBUG)
  bool save_keyread_only = m_keyread_only;
#endif  // !defined(NDEBUG)
  DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });

  // Covered either statically (keyread on an index that always covers) or
  // dynamically for this particular record/bitmap combination.
  bool covered_lookup =
      (m_keyread_only && kd.can_cover_lookup()) ||
      kd.covers_lookup(&value, m_converter->get_lookup_bitmap());

#if !defined(NDEBUG)
  m_keyread_only = save_keyread_only;
#endif  // !defined(NDEBUG)

  if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
    // Index-only path: decode the record straight from the SK entry.
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    if (pk_size == RDB_INVALID_KEY_LEN) {
      rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
    } else {
      rc = kd.unpack_record(table, buf, &rkey, &value,
                            m_converter->get_verify_row_debug_checksums());
    }
  } else {
    // In a reverse column family the physical iteration direction is
    // opposite to the logical scan direction.
    if (kd.m_is_reverse_cf) move_forward = !move_forward;

    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      // Re-read the key: ICP may have moved the iterator.
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
      } else {
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    // Remember the rowkey for position()/rnd_pos-style re-reads.
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
8121
8122 /**
8123 @note
8124 The problem with this function is that SQL layer calls it, when
8125 - the table has not been yet opened (no ::open() call done)
8126 - this->table_share already exists, but it is in the process of being
8127 filled, so some of fields are still NULL.
8128 - In particular, table_share->key_info[inx].key_part[] is filled only up
8129 to part #part. Subsequent key parts are not yet filled.
8130
8131 To complicate things further, SQL layer will call index_flags() with
8132 all_parts=true. Essentially, we're asked to provide flags for reading
8133 keyparts whose datatype is not yet known.
8134
8135 We walk around this problem by using check_keyread_allowed(), which uses
8136 table_share object and is careful not to step on unitialized data.
8137
8138 When we get a call with all_parts=true, we try to analyze all parts but
8139 ignore those that have key_part->field==nullptr (these are not initialized
8140 yet).
8141 */
8142
8143 ulong ha_rocksdb::index_flags(bool &pk_can_be_decoded,
8144 const TABLE_SHARE *table_share, uint inx,
8145 uint part, bool all_parts) {
8146 DBUG_ENTER_FUNC();
8147
8148 ulong base_flags = HA_READ_NEXT | // doesn't seem to be used
8149 HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
8150
8151 bool res = check_keyread_allowed(pk_can_be_decoded, table_share, inx, part,
8152 all_parts);
8153 if (res)
8154 base_flags |= HA_KEYREAD_ONLY;
8155
8156 if (inx == table_share->primary_key) {
8157 /*
8158 Index-only reads on primary key are the same as table scan for us. Still,
8159 we need to explicitly "allow" them, otherwise SQL layer will miss some
8160 plans.
8161 */
8162 base_flags |= HA_KEYREAD_ONLY;
8163 } else if (res) {
8164 /* We can do ICP only if we are able to decode the key (res == true) */
8165 /*
8166 We can Index Condition Pushdown any key except the primary. With primary
8167 key, we get (pk, record) pair immediately, there is no place to put the
8168 ICP check.
8169 */
8170 base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
8171 }
8172
8173 DBUG_RETURN(base_flags);
8174 }
8175
/*
  handler API entry point. Delegates to the static overload, which may
  update the cached m_pk_can_be_decoded flag as a side effect.
*/
ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
  return index_flags(m_pk_can_be_decoded, table_share, inx, part, all_parts);
}
8179
/*
  handler API entry point. Delegates to the static overload, passing the
  cached "PK can be decoded" flag.
*/
ha_rocksdb::Table_flags ha_rocksdb::table_flags() const {
  return table_flags(m_pk_can_be_decoded);
}
8183
8184 /**
8185 @brief
8186 Read next index tuple through the secondary index.
8187
8188 @details
8189 m_scan_it points at the index key-value pair that we should read the (pk,row)
8190 pair for.
8191 */
8192 int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
8193 assert(buf != nullptr);
8194 assert(table != nullptr);
8195
8196 /* Use STATUS_NOT_FOUND when record not found or some error occurred */
8197 table->status = STATUS_NOT_FOUND;
8198
8199 if (is_valid(m_scan_it)) {
8200 rocksdb::Slice key = m_scan_it->key();
8201
8202 /* Check if we've ran out of records of this index */
8203 if (m_key_descr_arr[keyno]->covers_key(key)) {
8204 int rc = 0;
8205
8206 // TODO: We could here check if we have ran out of range we're scanning
8207 const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
8208 table, *m_pk_descr, &key, m_pk_packed_tuple);
8209 if (size == RDB_INVALID_KEY_LEN) {
8210 return HA_ERR_ROCKSDB_CORRUPT_DATA;
8211 }
8212
8213 m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
8214 &my_charset_bin);
8215
8216 rocksdb::Slice value = m_scan_it->value();
8217 bool covered_lookup =
8218 (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
8219 m_key_descr_arr[keyno]->covers_lookup(
8220 &value, m_converter->get_lookup_bitmap());
8221 if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
8222 rc = m_key_descr_arr[keyno]->unpack_record(
8223 table, buf, &key, &value,
8224 m_converter->get_verify_row_debug_checksums());
8225 inc_covered_sk_lookup();
8226 } else {
8227 DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
8228 rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
8229 }
8230
8231 if (!rc) {
8232 table->status = 0;
8233 update_row_stats(ROWS_READ);
8234 }
8235 return rc;
8236 }
8237 }
8238 return HA_ERR_END_OF_FILE;
8239 }
8240
/*
  ha_rocksdb::read_range_first overrides handler::read_range_first.
  The only difference from handler::read_range_first is that
  ha_rocksdb::read_range_first passes end_key to
  ha_rocksdb::index_read_map_impl function.

  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  // 'sorted' is unused here; RocksDB iterators return rows in key order.
  check_build_decoder();

  int result;

  eq_range = eq_range_arg;
  set_end_range(end_key, RANGE_SCAN_ASC);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key)  // Read first record
    result = ha_index_first(table->record[0]);
  else {
    // Reject scans that would require gap locks we cannot take.
    if (is_using_prohibited_gap_locks(
            table,
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }

    result =
        index_read_map_impl(table->record[0], start_key->key,
                            start_key->keypart_map, start_key->flag, end_key);
  }
  if (result) {
    // Running off the end of the range is "no more rows", not an error.
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
  }

  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
8294
/**
  handler API entry point for index lookups; forwards to
  index_read_map_impl() with no explicit end key.

  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
                               key_part_map keypart_map,
                               enum ha_rkey_function find_flag) {
  DBUG_ENTER_FUNC();

  check_build_decoder();

  DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
}
8309
/*
  See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
  index navigation commands are converted into RocksDB lookup commands.

  This function takes end_key as an argument, and it is set on range scan.
  MyRocks needs to decide whether the prefix bloom filter can be used or
  not, which requires calculating the equal-condition length. On equal
  lookups (find_flag == HA_READ_KEY_EXACT), the equal-condition length is
  the same as rocksdb::Slice.size() of the start key. On range scan, it is
  MIN(start_key, end_key) of the rocksdb::Slice expression.

  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map,
                                    enum ha_rkey_function find_flag,
                                    const key_range *end_key) {
  DBUG_ENTER_FUNC();

  int rc = 0;

  THD *thd = ha_thd();
  DEBUG_SYNC(thd, "rocksdb.check_flags_rmi");
  if (thd && thd->killed) {
    rc = HA_ERR_QUERY_INTERRUPTED;
    DBUG_RETURN(rc);
  }

  ha_statistic_increment(&SSV::ha_read_key_count);
  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  const uint actual_key_parts = kd.get_key_parts();
  bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);

  // Fall back to the handler's end_range (set by read_range_first()) when
  // the caller did not pass an explicit end key.
  if (!end_key) end_key = end_range;

  /* By default, we don't need the retrieved records to match the prefix */
  m_sk_match_prefix = nullptr;

  if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
      using_full_key) {
    /*
      Equality lookup over primary key, using full tuple.
      This is a special case, use DB::Get.
    */
    const uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_pk_packed_tuple, key, keypart_map);
    bool skip_lookup = is_blind_delete_enabled();
    rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, false, skip_lookup);
    if (!rc && !skip_lookup) {
      update_row_stats(ROWS_READ);
    }
    DBUG_RETURN(rc);
  }

  /*
    Unique secondary index performs lookups without the extended key fields
  */
  uint packed_size;
  if (active_index != table->s->primary_key &&
      table->key_info[active_index].flags & HA_NOSAME &&
      find_flag == HA_READ_KEY_EXACT && using_full_key) {
    // Pack only the user-defined parts (not the PK extension).
    key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
                                                   .user_defined_key_parts) -
                           1;
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, tmp_map);
    if (table->key_info[active_index].user_defined_key_parts !=
        kd.get_key_parts()) {
      using_full_key = false;
    }

    if (m_insert_with_update && m_dup_key_found &&
        active_index == m_dupp_errkey) {
      /*
        We are in INSERT ... ON DUPLICATE KEY UPDATE, and this is a read
        that SQL layer does to read the duplicate key.
        Its rowid is saved in m_last_rowkey. Get the full record and return it.
      */

      assert(m_dup_key_retrieved_record.length() >= packed_size);
      assert(memcmp(m_dup_key_retrieved_record.ptr(), m_sk_packed_tuple,
                    packed_size) == 0);

      rc = get_row_by_rowid(buf, m_last_rowkey.ptr(), m_last_rowkey.length());
      DBUG_RETURN(rc);
    }

  } else {
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, keypart_map);
  }

  if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
      (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
    /*
      We are doing a point index lookup, and ICP is enabled. It is possible
      that this call will be followed by ha_rocksdb->index_next_same() call.

      Do what InnoDB does: save the lookup tuple now. We will need it in
      index_next_same/find_icp_matching_index_rec in order to stop scanning
      as soon as the index record doesn't match the lookup tuple.

      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
    m_sk_match_prefix = m_sk_match_prefix_buf;
    m_sk_match_length = packed_size;
    memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
  }

  // For "after"/"last" navigation, seek past all keys with this prefix by
  // seeking to the prefix's successor; remember how many bytes changed so
  // calc_eq_cond_len() can undo the effect.
  int bytes_changed_by_succ = 0;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
    bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
  }

  rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
                       packed_size);

  uint end_key_packed_size = 0;
  const uint eq_cond_len =
      calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, end_key,
                       &end_key_packed_size);

  bool use_all_keys = false;
  if (find_flag == HA_READ_KEY_EXACT &&
      my_count_bits(keypart_map) == kd.get_key_parts()) {
    use_all_keys = true;
  }

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan");
    if (thd && thd->killed) {
      rc = HA_ERR_QUERY_INTERRUPTED;
      break;
    }
    /*
      This will open the iterator and position it at a record that's equal or
      greater than the lookup tuple.
    */
    setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);

    /*
      Once we are positioned on from above, move to the position we really
      want: See storage/rocksdb/rocksdb-range-access.txt
    */
    bool move_forward;
    rc =
        position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
                                slice, &move_forward, tx->m_snapshot_timestamp);

    if (rc) {
      break;
    }

    m_skip_scan_it_next_call = false;

    /*
      Now get the data for the row into 'buf'. If we were using a primary key
      then we have all the rows we need. For a secondary key we now need to
      lookup the primary key.
    */
    if (active_index == table->s->primary_key) {
      rc = read_row_from_primary_key(buf);
    } else {
      rc = read_row_from_secondary_key(buf, kd, move_forward);
    }

    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* Exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (rc) {
    /*
      This status is returned on any error
      the only possible error condition is record-not-found
    */
    table->status = STATUS_NOT_FOUND;
  } else {
    table->status = 0;
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
8510
/*
  @brief
  Scan the secondary index until we find an index record that satisfies ICP

  @param move_forward true <=>  move m_scan_it forward
                      false <=> move m_scan_it backward
  @param buf          Record buffer (must be the same buffer that
                      pushed index condition points to, in practice
                      it is table->record[0])

  @detail
  Move the current iterator m_scan_it until we get an index tuple that
  satisfies the pushed Index Condition.
  (if there is no pushed index condition, return right away)

  @return
    0     - Index tuple satisfies ICP, can do index read.
    other - error code
*/

int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward,
                                            uchar *const buf) {
  assert(buf != nullptr);

  // Without a pushed condition on this index there is nothing to filter.
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];
    THD *thd = ha_thd();

    while (1) {
      // Skip TTL-expired records first; direction is inverted because the
      // helper takes the raw RocksDB direction.
      int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        return rc;
      }

      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }

      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      // Walked past the end of this index's key space.
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      // Stop once the record no longer matches the saved lookup prefix
      // (set up by index_read_map_impl for point lookups).
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      // Unpack the index columns into 'buf' so the pushed condition can
      // evaluate them.
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_converter->get_verify_row_debug_checksums());
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const enum icp_result icp_status = check_index_cond();
      if (icp_status == ICP_NO_MATCH) {
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      } else if (icp_status == ICP_OUT_OF_RANGE) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      } else /* icp_status == ICP_MATCH */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
8594
/**
  Read the last record matching the key prefix; implemented as an
  index_read_map() with the HA_READ_PREFIX_LAST navigation flag.

  @return
    HA_EXIT_SUCCESS  OK
    other            HA_ERR error code (can be SE-specific)
*/
int ha_rocksdb::index_read_last_map(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map) {
  DBUG_ENTER_FUNC();

  check_build_decoder();

  DBUG_RETURN(index_read_map(buf, key, keypart_map, HA_READ_PREFIX_LAST));
}
8608
8609 /**
8610 @return
8611 HA_ADMIN_OK OK
8612 other HA_ADMIN error code
8613 */
8614 int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
8615 DBUG_ENTER_FUNC();
8616
8617 assert(thd != nullptr);
8618 assert(check_opt != nullptr);
8619
8620 const uint pk = pk_index(table, m_tbl_def);
8621 String rowkey_copy;
8622 String sec_key_copy;
8623 const char *const table_name = table->s->table_name.str;
8624
8625 // Only when debugging: don't use snapshot when reading
8626 // Rdb_transaction *tx= get_or_create_tx(table->in_use);
8627 // tx->snapshot= nullptr;
8628
8629 bool save_verify_row_debug_checksums =
8630 m_converter->get_verify_row_debug_checksums();
8631 m_converter->set_verify_row_debug_checksums(true);
8632 /* For each secondary index, check that we can get a PK value from it */
8633 // NO_LINT_DEBUG
8634 sql_print_information("CHECKTABLE %s: Checking table %s", table_name,
8635 table_name);
8636 ha_rows row_checksums_at_start = 0; // set/used iff first_index==true
8637 ha_rows row_checksums = ha_rows(-1);
8638 bool first_index = true;
8639
8640 for (uint keyno = 0; keyno < table->s->keys; keyno++) {
8641 if (keyno != pk) {
8642 extra(HA_EXTRA_KEYREAD);
8643 ha_index_init(keyno, true);
8644 ha_rows rows = 0;
8645 ha_rows checksums = 0;
8646 if (first_index) {
8647 row_checksums_at_start = m_converter->get_row_checksums_checked();
8648 }
8649 int res;
8650 // NO_LINT_DEBUG
8651 sql_print_information("CHECKTABLE %s: Checking index %s", table_name,
8652 table->key_info[keyno].name);
8653 while (1) {
8654 if (!rows) {
8655 res = index_first(table->record[0]);
8656 } else {
8657 res = index_next(table->record[0]);
8658 }
8659
8660 if (res == HA_ERR_END_OF_FILE) break;
8661 if (res) {
8662 // error
8663 // NO_LINT_DEBUG
8664 sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
8665 table_name, rows, res);
8666 goto error;
8667 }
8668 rocksdb::Slice key = m_scan_it->key();
8669 sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
8670 rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
8671 &my_charset_bin);
8672
8673 if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
8674 m_scan_it->value())) {
8675 checksums++;
8676 }
8677
8678 if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
8679 rowkey_copy.length()))) {
8680 // NO_LINT_DEBUG
8681 sql_print_error(
8682 "CHECKTABLE %s: .. row %lld: "
8683 "failed to fetch row by rowid",
8684 table_name, rows);
8685 goto error;
8686 }
8687
8688 longlong hidden_pk_id = 0;
8689 if (has_hidden_pk(table) &&
8690 read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
8691 goto error;
8692 }
8693
8694 /* Check if we get the same PK value */
8695 uint packed_size = m_pk_descr->pack_record(
8696 table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
8697 false, hidden_pk_id);
8698 if (packed_size != rowkey_copy.length() ||
8699 memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
8700 // NO_LINT_DEBUG
8701 sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
8702 table_name, rows);
8703 goto print_and_error;
8704 }
8705
8706 /* Check if we get the same secondary key value */
8707 packed_size = m_key_descr_arr[keyno]->pack_record(
8708 table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
8709 &m_sk_tails, false, hidden_pk_id);
8710 if (packed_size != sec_key_copy.length() ||
8711 memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
8712 // NO_LINT_DEBUG
8713 sql_print_error(
8714 "CHECKTABLE %s: .. row %lld: "
8715 "secondary index value mismatch",
8716 table_name, rows);
8717 goto print_and_error;
8718 }
8719 rows++;
8720 continue;
8721
8722 print_and_error : {
8723 std::string buf;
8724 buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
8725 RDB_MAX_HEXDUMP_LEN);
8726 // NO_LINT_DEBUG
8727 sql_print_error("CHECKTABLE %s: rowkey: %s", table_name, buf.c_str());
8728
8729 buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
8730 RDB_MAX_HEXDUMP_LEN);
8731 // NO_LINT_DEBUG
8732 sql_print_error("CHECKTABLE %s: record: %s", table_name, buf.c_str());
8733
8734 buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
8735 RDB_MAX_HEXDUMP_LEN);
8736 // NO_LINT_DEBUG
8737 sql_print_error("CHECKTABLE %s: index: %s", table_name, buf.c_str());
8738
8739 goto error;
8740 }
8741 }
8742 // NO_LINT_DEBUG
8743 sql_print_information(
8744 "CHECKTABLE %s: ... %lld index entries checked "
8745 "(%lld had checksums)",
8746 table_name, rows, checksums);
8747
8748 if (first_index) {
8749 row_checksums =
8750 m_converter->get_row_checksums_checked() - row_checksums_at_start;
8751 first_index = false;
8752 }
8753 ha_index_end();
8754 }
8755 }
8756 if (row_checksums != ha_rows(-1)) {
8757 // NO_LINT_DEBUG
8758 sql_print_information("CHECKTABLE %s: %lld table records had checksums",
8759 table_name, row_checksums);
8760 }
8761 extra(HA_EXTRA_NO_KEYREAD);
8762
8763 m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
8764 /*
8765 TODO: we should check also for PK records that are missing in the secondary
8766 indexes.
8767 For that, need to walk through the PK and check that every PK record has a
8768 proper counterpart in each secondary index.
8769 */
8770 DBUG_RETURN(HA_ADMIN_OK);
8771 error:
8772 m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
8773 ha_index_or_rnd_end();
8774 extra(HA_EXTRA_NO_KEYREAD);
8775
8776 DBUG_RETURN(HA_ADMIN_CORRUPT);
8777 }
8778
8779 static void dbug_dump_str(FILE *const out, const char *const str, int len) {
8780 fprintf(out, "\"");
8781 for (int i = 0; i < len; i++) {
8782 if (str[i] > 32) {
8783 fprintf(out, "%c", str[i]);
8784 } else {
8785 fprintf(out, "\\%d", str[i]);
8786 }
8787 }
8788 fprintf(out, "\"");
8789 }
8790
8791 /*
8792 Debugging help: dump the whole database into a human-readable file.
8793 Usage:
8794 dbug_dump_database(rdb);
8795 */
8796
8797 void dbug_dump_database(rocksdb::DB *const db) {
8798 FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
8799 if (!out) return;
8800
8801 rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
8802 for (it->SeekToFirst(); it->Valid(); it->Next()) {
8803 rocksdb::Slice key = it->key();
8804 rocksdb::Slice val = it->value();
8805 dbug_dump_str(out, key.data(), key.size());
8806 fprintf(out, " -> ");
8807 dbug_dump_str(out, val.data(), val.size());
8808 fprintf(out, "\n");
8809 }
8810
8811 delete it;
8812 fclose(out);
8813 }
8814
8815 rocksdb::Status ha_rocksdb::get_for_update(
8816 Rdb_transaction *const tx, const Rdb_key_def &key_descr,
8817 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
8818 assert(m_lock_rows != RDB_LOCK_NONE);
8819
8820 bool exclusive = m_lock_rows != RDB_LOCK_READ;
8821 bool do_validate = my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED;
8822 rocksdb::Status s =
8823 tx->get_for_update(key_descr, key, value, exclusive, do_validate);
8824
8825 #ifndef NDEBUG
8826 ++rocksdb_num_get_for_update_calls;
8827 #endif
8828 return s;
8829 }
8830
8831 bool ha_rocksdb::is_blind_delete_enabled() {
8832 THD *thd = ha_thd();
8833 return (THDVAR(thd, blind_delete_primary_key) &&
8834 thd->lex->sql_command == SQLCOM_DELETE &&
8835 thd->lex->table_count == 1 && table->s->keys == 1 &&
8836 !has_hidden_pk(table) && !thd->rli_slave);
8837 }
8838
8839 /*
8840 Given a rowid (i.e. packed PK) as a parameter, get the record.
8841
8842 @return
8843 HA_EXIT_SUCCESS OK
8844 other HA_ERR error code (can be SE-specific)
8845 */
8846
8847 int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
8848 const uint rowid_size,
8849 const bool skip_ttl_check,
8850 const bool skip_lookup) {
8851 DBUG_ENTER_FUNC();
8852
8853 assert(buf != nullptr);
8854 assert(rowid != nullptr);
8855 assert(table != nullptr);
8856
8857 int rc;
8858
8859 rocksdb::Slice key_slice(rowid, rowid_size);
8860
8861 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
8862 assert(tx != nullptr);
8863
8864 DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
8865 DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
8866 THD *thd = ha_thd();
8867 static constexpr char act[] =
8868 "now signal Reached "
8869 "wait_for signal.rocksdb.get_row_by_rowid_let_running";
8870 assert(opt_debug_sync_timeout > 0);
8871 assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
8872 };);
8873
8874 bool found;
8875 rocksdb::Status s;
8876
8877 /* Pretend row found without looking up */
8878 if (skip_lookup) {
8879 update_row_stats(ROWS_DELETED_BLIND);
8880 m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
8881 table->status = 0;
8882 DBUG_RETURN(0);
8883 }
8884
8885 if (m_lock_rows == RDB_LOCK_NONE) {
8886 tx->acquire_snapshot(true);
8887 s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
8888 } else if (m_insert_with_update && m_dup_key_found &&
8889 m_pk_descr->get_keyno() == m_dupp_errkey) {
8890 assert(m_dup_key_retrieved_record.length() ==
8891 m_retrieved_record.size());
8892 assert(memcmp(m_dup_key_retrieved_record.ptr(),
8893 m_retrieved_record.data(),
8894 m_retrieved_record.size()) == 0);
8895
8896 // do nothing - we already have the result in m_retrieved_record and
8897 // already taken the lock
8898 s = rocksdb::Status::OK();
8899 } else {
8900 s = get_for_update(tx, *m_pk_descr, key_slice, &m_retrieved_record);
8901 }
8902
8903 DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
8904 dbug_change_status_to_corrupted(&s););
8905
8906 if (!s.IsNotFound() && !s.ok()) {
8907 DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
8908 }
8909 found = !s.IsNotFound();
8910
8911 table->status = STATUS_NOT_FOUND;
8912 if (found) {
8913 /* If we found the record, but it's expired, pretend we didn't find it. */
8914 if (!skip_ttl_check && m_pk_descr->has_ttl() &&
8915 should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
8916 tx->m_snapshot_timestamp)) {
8917 DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
8918 }
8919
8920 m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
8921 rc = convert_record_from_storage_format(&key_slice, buf);
8922
8923 if (!rc) {
8924 table->status = 0;
8925 }
8926 } else {
8927 /*
8928 Note: we don't need to unlock the row. It is intentional that we keep
8929 locks on rows that don't exist.
8930 */
8931 rc = HA_ERR_KEY_NOT_FOUND;
8932 }
8933
8934 DBUG_RETURN(rc);
8935 }
8936
8937 /**
8938 @return
8939 HA_EXIT_SUCCESS OK
8940 other HA_ERR error code (can be SE-specific)
8941 */
8942 int ha_rocksdb::index_next(uchar *const buf) {
8943 DBUG_ENTER_FUNC();
8944
8945 check_build_decoder();
8946
8947 bool moves_forward = true;
8948 ha_statistic_increment(&SSV::ha_read_next_count);
8949 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
8950 moves_forward = false;
8951 }
8952
8953 int rc = index_next_with_direction(buf, moves_forward);
8954 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
8955
8956 DBUG_RETURN(rc);
8957 }
8958
8959 /**
8960 @return
8961 HA_EXIT_SUCCESS OK
8962 other HA_ERR error code (can be SE-specific)
8963 */
8964 int ha_rocksdb::index_prev(uchar *const buf) {
8965 DBUG_ENTER_FUNC();
8966
8967 check_build_decoder();
8968
8969 bool moves_forward = false;
8970 ha_statistic_increment(&SSV::ha_read_prev_count);
8971 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
8972 moves_forward = true;
8973 }
8974
8975 int rc = index_next_with_direction(buf, moves_forward);
8976 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
8977
8978 DBUG_RETURN(rc);
8979 }
8980
// Advance the active index scan by one visible record in the given storage
// direction. PK scans are delegated to the table-scan path; secondary-index
// scans loop to skip TTL-expired and invalidated records.
int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  if (active_index == pk_index(table, m_tbl_def)) {
    // Scanning the PK is handled like a full-table scan.
    rc = rnd_next_with_direction(buf, move_forward);
  } else {
    THD *thd = ha_thd();
    // Loop until a usable record is read, an error occurs, or the scan is
    // interrupted; invalidated records cause another iteration.
    for (;;) {
      DEBUG_SYNC(thd, "rocksdb.check_flags_inwd");
      if (thd && thd->killed) {
        rc = HA_ERR_QUERY_INTERRUPTED;
        break;
      }
      // Right after a Seek, the iterator is already positioned on the row
      // to return, so the first call must not advance it.
      if (m_skip_scan_it_next_call) {
        m_skip_scan_it_next_call = false;
      } else if (!m_scan_it->Valid()) {
        DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
      } else {
        if (move_forward) {
          m_scan_it->Next(); /* this call cannot fail */
        } else {
          m_scan_it->Prev();
        }
      }
      // Skip over TTL-expired index entries.
      rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index],
                                        m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        break;
      }
      rc = find_icp_matching_index_rec(move_forward, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
      if (!should_skip_invalidated_record(rc)) {
        break;
      }
    }
  }

  DBUG_RETURN(rc);
}
9022
9023 /**
9024 @return
9025 HA_EXIT_SUCCESS OK
9026 other HA_ERR error code (can be SE-specific)
9027 */
9028 int ha_rocksdb::index_first(uchar *const buf) {
9029 DBUG_ENTER_FUNC();
9030
9031 check_build_decoder();
9032
9033 m_sk_match_prefix = nullptr;
9034 ha_statistic_increment(&SSV::ha_read_first_count);
9035 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9036 ? index_last_intern(buf)
9037 : index_first_intern(buf);
9038 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9039
9040 DBUG_RETURN(rc);
9041 }
9042
9043 /**
9044 @return
9045 HA_EXIT_SUCCESS OK
9046 other HA_ERR error code (can be SE-specific)
9047 */
9048 int ha_rocksdb::index_last(uchar *const buf) {
9049 DBUG_ENTER_FUNC();
9050
9051 check_build_decoder();
9052
9053 m_sk_match_prefix = nullptr;
9054 ha_statistic_increment(&SSV::ha_read_last_count);
9055 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9056 ? index_first_intern(buf)
9057 : index_last_intern(buf);
9058 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9059
9060 DBUG_RETURN(rc);
9061 }
9062
9063 /*
9064 Start scanning from the "first" value.
9065
9066 The 'first' here means "the first from start of the key space".
9067 For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
9068
9069 An picture of a forward-ordered keyspace (remember, the keys have form
9070 'indexnr-keyval'. Suppose the index we are at has number n)
9071
9072 (n-1) - ...
9073 ( n ) <--- 1. (n) doesn't exist in the db but it would be here.
9074 ( n ) - aaa <--- 2. Seek("n") will put us here on the first index
9075 ( n ) - bbb record.
9076 ( n ) - cc
9077
9078 So, need to do: Seek(n);
9079
9080 A backward-ordered keyspace:
9081
9082 (n+1) - bbb
9083 (n+1) - aaa
9084 (n+1) <--- (n+1) doesn't exist in the db but would be here.
9085 ( n ) - ccc <--- 1. We need to be here.
9086 ( n ) - bbb
9087 ( n ) - aaa
9088 ( n )
9089
9090 So, need to: Seek(n+1);
9091
9092 */
9093
9094 int ha_rocksdb::index_first_intern(uchar *const buf) {
9095 DBUG_ENTER_FUNC();
9096
9097 assert(buf != nullptr);
9098
9099 uchar *key;
9100 uint key_size;
9101 int rc;
9102
9103 if (is_pk(active_index, table, m_tbl_def)) {
9104 key = m_pk_packed_tuple;
9105 } else {
9106 key = m_sk_packed_tuple;
9107 }
9108
9109 assert(key != nullptr);
9110
9111 const Rdb_key_def &kd = *m_key_descr_arr[active_index];
9112 int key_start_matching_bytes = kd.get_first_key(key, &key_size);
9113
9114 rocksdb::Slice index_key((const char *)key, key_size);
9115
9116 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9117 assert(tx != nullptr);
9118
9119 const bool is_new_snapshot = !tx->has_snapshot();
9120 // Loop as long as we get a deadlock error AND we end up creating the
9121 // snapshot here (i.e. it did not exist prior to this)
9122 for (;;) {
9123 setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
9124 m_scan_it->Seek(index_key);
9125 m_skip_scan_it_next_call = true;
9126
9127 rc = index_next_with_direction(buf, true);
9128 if (!should_recreate_snapshot(rc, is_new_snapshot)) {
9129 break; /* exit the loop */
9130 }
9131
9132 // release the snapshot and iterator so they will be regenerated
9133 tx->release_snapshot();
9134 release_scan_iterator();
9135 }
9136
9137 DBUG_RETURN(rc);
9138 }
9139
9140 /**
9141 @details
9142 Start scanning from the "last" value
9143
9144 The 'last' here means "the last from start of the key space".
9145 For reverse-ordered key spaces, we will actually read the smallest value.
9146
9147 An picture of a forward-ordered keyspace (remember, the keys have form
9148 'indexnr-keyval'. Suppose the we are at a key that has number n)
9149
9150 (n-1)-something
9151 ( n )-aaa
9152 ( n )-bbb
9153 ( n )-ccc <----------- Need to seek to here.
9154 (n+1) <---- Doesn't exist, but would be here.
9155 (n+1)-smth, or no value at all
9156
9157 RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's
9158 smaller". We can't seek to "(n)-ccc" directly, because we don't know what
9159 is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
9160 to "(n+1)", which is the least possible value that's greater than any value
9161 in index #n.
9162
9163 So, need to: it->SeekForPrev(n+1)
9164
9165 A backward-ordered keyspace:
9166
9167 (n+1)-something
9168 ( n ) - ccc
9169 ( n ) - bbb
9170 ( n ) - aaa <---------------- (*) Need to seek here.
9171 ( n ) <--- Doesn't exist, but would be here.
9172 (n-1)-smth, or no value at all
9173
9174 So, need to: it->SeekForPrev(n)
9175 */
9176
9177 int ha_rocksdb::index_last_intern(uchar *const buf) {
9178 DBUG_ENTER_FUNC();
9179
9180 assert(buf != nullptr);
9181
9182 uchar *key;
9183 uint key_size;
9184 int rc;
9185
9186 if (is_pk(active_index, table, m_tbl_def)) {
9187 key = m_pk_packed_tuple;
9188 } else {
9189 key = m_sk_packed_tuple;
9190 }
9191
9192 assert(key != nullptr);
9193
9194 const Rdb_key_def &kd = *m_key_descr_arr[active_index];
9195 int key_end_matching_bytes = kd.get_last_key(key, &key_size);
9196
9197 rocksdb::Slice index_key((const char *)key, key_size);
9198
9199 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9200 assert(tx != nullptr);
9201
9202 const bool is_new_snapshot = !tx->has_snapshot();
9203 // Loop as long as we get a deadlock error AND we end up creating the
9204 // snapshot here (i.e. it did not exist prior to this)
9205 for (;;) {
9206 setup_scan_iterator(kd, &index_key, false, key_end_matching_bytes);
9207 m_scan_it->SeekForPrev(index_key);
9208 m_skip_scan_it_next_call = false;
9209
9210 if (is_pk(active_index, table, m_tbl_def)) {
9211 m_skip_scan_it_next_call = true;
9212 rc = rnd_next_with_direction(buf, false);
9213 } else {
9214 rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
9215 if (!rc) rc = secondary_index_read(active_index, buf);
9216 }
9217
9218 if (!should_recreate_snapshot(rc, is_new_snapshot)) {
9219 break; /* exit the loop */
9220 }
9221
9222 // release the snapshot and iterator so they will be regenerated
9223 tx->release_snapshot();
9224 release_scan_iterator();
9225 }
9226
9227 DBUG_RETURN(rc);
9228 }
9229
9230 void ha_rocksdb::unlock_row() {
9231 DBUG_ENTER_FUNC();
9232
9233 if (m_lock_rows != RDB_LOCK_NONE) {
9234 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9235 tx->release_lock(*m_pk_descr,
9236 std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
9237 }
9238
9239 DBUG_VOID_RETURN;
9240 }
9241
9242 /*
9243 Returning true if SingleDelete can be used.
9244 - Secondary Indexes can always use SingleDelete.
9245 - If the index is PRIMARY KEY, and if all of the columns of the table
9246 are covered by the PRIMARY KEY, SingleDelete can be used.
9247 */
9248 bool ha_rocksdb::can_use_single_delete(const uint index) const {
9249 return (index != pk_index(table, m_tbl_def) ||
9250 (!has_hidden_pk(table) &&
9251 table->key_info[index].actual_key_parts == table->s->fields));
9252 }
9253
9254 bool ha_rocksdb::skip_unique_check() const {
9255 /*
9256 We want to skip unique checks if:
9257 1) bulk_load is on
9258 2) the user set unique_checks option to 0, and the table does not have
9259 any indexes. If the table has secondary keys, then those might becomes
9260 inconsisted/corrupted
9261 3) We're using read-free replication
9262 */
9263 return THDVAR(table->in_use, bulk_load) ||
9264 (my_core::thd_test_options(table->in_use,
9265 OPTION_RELAXED_UNIQUE_CHECKS) &&
9266 m_tbl_def->m_key_count == 1) ||
9267 use_read_free_rpl();
9268 }
9269
9270 bool ha_rocksdb::commit_in_the_middle() {
9271 return THDVAR(table->in_use, bulk_load) ||
9272 THDVAR(table->in_use, commit_in_the_middle);
9273 }
9274
9275 /*
9276 Executing bulk commit if it should.
9277 @retval true if bulk commit failed
9278 @retval false if bulk commit was skipped or succeeded
9279 */
9280 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
9281 assert(tx != nullptr);
9282 return commit_in_the_middle() &&
9283 tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
9284 tx->flush_batch();
9285 }
9286
9287 /*
9288 If table was created without primary key, SQL layer represents the primary
9289 key number as MAX_INDEXES. Hence, this function returns true if the table
9290 does not contain a primary key. (In which case we generate a hidden
9291 'auto-incremented' pk.)
9292 */
9293 bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
9294 assert(table != nullptr);
9295 return Rdb_key_def::table_has_hidden_pk(table);
9296 }
9297
9298 /*
9299 Returns true if given index number is a hidden_pk.
9300 - This is used when a table is created with no primary key.
9301 */
9302 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
9303 const Rdb_tbl_def *const tbl_def_arg) {
9304 assert(table_arg != nullptr);
9305 assert(table_arg->s != nullptr);
9306 assert(tbl_def_arg != nullptr);
9307
9308 return (table_arg->s->primary_key == MAX_INDEXES &&
9309 index == tbl_def_arg->m_key_count - 1);
9310 }
9311
9312 /* Returns index of primary key */
9313 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
9314 const Rdb_tbl_def *const tbl_def_arg) {
9315 assert(table_arg != nullptr);
9316 assert(table_arg->s != nullptr);
9317 assert(tbl_def_arg != nullptr);
9318
9319 return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
9320 : table_arg->s->primary_key;
9321 }
9322
9323 /* Returns true if given index number is a primary key */
9324 bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
9325 const Rdb_tbl_def *const tbl_def_arg) {
9326 assert(table_arg != nullptr);
9327 assert(table_arg->s != nullptr);
9328 assert(tbl_def_arg != nullptr);
9329
9330 return index == table_arg->s->primary_key ||
9331 is_hidden_pk(index, table_arg, tbl_def_arg);
9332 }
9333
9334 uint ha_rocksdb::max_supported_key_part_length(HA_CREATE_INFO *) const {
9335 DBUG_ENTER_FUNC();
9336 DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE
9337 : MAX_INDEX_COL_LEN_SMALL);
9338 }
9339
9340 const char *ha_rocksdb::get_key_name(const uint index,
9341 const TABLE *const table_arg,
9342 const Rdb_tbl_def *const tbl_def_arg) {
9343 assert(table_arg != nullptr);
9344 assert(tbl_def_arg != nullptr);
9345
9346 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9347 return HIDDEN_PK_NAME;
9348 }
9349
9350 assert(table_arg->key_info != nullptr);
9351 assert(table_arg->key_info[index].name != nullptr);
9352
9353 return table_arg->key_info[index].name;
9354 }
9355
9356 const char *ha_rocksdb::get_key_comment(const uint index,
9357 const TABLE *const table_arg,
9358 const Rdb_tbl_def *const tbl_def_arg) {
9359 assert(table_arg != nullptr);
9360 assert(tbl_def_arg != nullptr);
9361
9362 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9363 return nullptr;
9364 }
9365
9366 assert(table_arg->key_info != nullptr);
9367
9368 return table_arg->key_info[index].comment.str;
9369 }
9370
9371 const std::string ha_rocksdb::generate_cf_name(
9372 const uint index, const TABLE *const table_arg,
9373 const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found) {
9374 assert(table_arg != nullptr);
9375 assert(tbl_def_arg != nullptr);
9376 assert(per_part_match_found != nullptr);
9377
9378 // When creating CF-s the caller needs to know if there was a custom CF name
9379 // specified for a given paritition.
9380 *per_part_match_found = false;
9381
9382 // Index comment is used to define the column family name specification(s).
9383 // If there was no comment, we get an emptry string, and it means "use the
9384 // default column family".
9385 const char *const comment = get_key_comment(index, table_arg, tbl_def_arg);
9386
9387 // `get_key_comment` can return `nullptr`, that's why this.
9388 std::string key_comment = comment ? comment : "";
9389
9390 std::string cf_name = Rdb_key_def::parse_comment_for_qualifier(
9391 key_comment, table_arg, tbl_def_arg, per_part_match_found,
9392 RDB_CF_NAME_QUALIFIER);
9393
9394 if (table_arg->part_info != nullptr && !*per_part_match_found) {
9395 // At this point we tried to search for a custom CF name for a partition,
9396 // but none was specified. Therefore default one will be used.
9397 return "";
9398 }
9399
9400 // If we didn't find any partitioned/non-partitioned qualifiers, return the
9401 // comment itself. NOTE: this currently handles returning the cf name
9402 // specified in the index comment in the case of no partitions, which doesn't
9403 // use any qualifiers at the moment. (aka its a special case)
9404 if (cf_name.empty() && !key_comment.empty()) {
9405 return key_comment;
9406 }
9407
9408 return cf_name;
9409 }
9410
9411 const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) {
9412 assert(table_arg != nullptr);
9413 assert(table_arg->s != nullptr);
9414
9415 return table_arg->s->comment.str;
9416 }
9417
9418 /**
9419 Write a new row
9420
9421 @param[in] buf new row data to write
9422 @return
9423 HA_EXIT_SUCCESS OK
9424 other HA_ERR error code (can be SE-specific)
9425 */
9426 int ha_rocksdb::write_row(uchar *const buf) {
9427 DBUG_ENTER_FUNC();
9428
9429 assert(buf != nullptr);
9430 assert(buf == table->record[0]);
9431 assert(m_lock_rows == RDB_LOCK_WRITE);
9432
9433 ha_statistic_increment(&SSV::ha_write_count);
9434 /*
9435 Note: "buf == table->record[0]" is copied from innodb. I am not aware of
9436 any use cases where this condition is not true.
9437 */
9438 if (table->next_number_field && buf == table->record[0]) {
9439 int err;
9440 if ((err = update_auto_increment())) {
9441 DBUG_RETURN(err);
9442 }
9443 }
9444
9445 // clear cache at beginning of write for INSERT ON DUPLICATE
9446 // we may get multiple write->fail->read->update if there are multiple
9447 // values from INSERT
9448 m_dup_key_found = false;
9449
9450 const int rv = update_write_row(nullptr, buf, skip_unique_check());
9451
9452 if (rv == 0) {
9453
9454 // Not protected by ddl_manger lock for performance
9455 // reasons. This is an estimate value anyway.
9456 inc_table_n_rows();
9457 update_table_stats_if_needed();
9458
9459 update_row_stats(ROWS_INSERTED);
9460 }
9461
9462 DBUG_RETURN(rv);
9463 }
9464
9465 // Increment the number of rows in the table by one.
9466 // This operation is not protected by ddl manager lock.
9467 // The number is estimated.
9468 void ha_rocksdb::inc_table_n_rows() {
9469 if (!rocksdb_table_stats_use_table_scan) {
9470 return;
9471 }
9472
9473 uint64 n_rows = m_tbl_def->m_tbl_stats.m_stat_n_rows;
9474 if (n_rows < std::numeric_limits<ulonglong>::max()) {
9475 m_tbl_def->m_tbl_stats.m_stat_n_rows = n_rows + 1;
9476 }
9477 }
9478
9479 // Decrement the number of rows in the table by one.
9480 // This operation is not protected by ddl manager lock.
9481 // The number is estimated.
9482 void ha_rocksdb::dec_table_n_rows() {
9483 if (!rocksdb_table_stats_use_table_scan) {
9484 return;
9485 }
9486
9487 uint64 n_rows = m_tbl_def->m_tbl_stats.m_stat_n_rows;
9488 if (n_rows > 0) {
9489 m_tbl_def->m_tbl_stats.m_stat_n_rows = n_rows - 1;
9490 }
9491 }
9492
9493 /**
9494 Constructing m_last_rowkey (MyRocks key expression) from
9495 before_update|delete image (MySQL row expression).
9496 m_last_rowkey is normally set during lookup phase, such as
9497 rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
9498 these read functions are skipped and update_rows(), delete_rows() are
9499 called without setting m_last_rowkey. This function sets m_last_rowkey
9500 for Read Free Replication.
9501 */
9502 void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
9503 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
9504 if (old_data && use_read_free_rpl()) {
9505 const int old_pk_size = m_pk_descr->pack_record(
9506 table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
9507 m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
9508 &my_charset_bin);
9509 }
9510 #endif // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
9511 }
9512
9513 /**
9514 Collect update data for primary key
9515
9516 @param[in, out] row_info hold all data for update row, such as
9517 new row data/old row data
9518 @return
9519 HA_EXIT_SUCCESS OK
9520 other HA_ERR error code (can be SE-specific)
9521 */
9522 int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
9523 int size;
9524
9525 /*
9526 Get new row key for any insert, and any update where the pk is not hidden.
9527 Row key for updates with hidden pk is handled below.
9528 */
9529 if (!has_hidden_pk(table)) {
9530 row_info->hidden_pk_id = 0;
9531
9532 row_info->new_pk_unpack_info = &m_pk_unpack_info;
9533
9534 size = m_pk_descr->pack_record(
9535 table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple,
9536 row_info->new_pk_unpack_info, false, 0, 0, nullptr);
9537 } else if (row_info->old_data == nullptr) {
9538 row_info->hidden_pk_id = update_hidden_pk_val();
9539 size =
9540 m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
9541 } else {
9542 /*
9543 If hidden primary key, rowkey for new record will always be the same as
9544 before
9545 */
9546 size = row_info->old_pk_slice.size();
9547 memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
9548 int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
9549 if (err) {
9550 return err;
9551 }
9552 }
9553
9554 row_info->new_pk_slice =
9555 rocksdb::Slice((const char *)m_pk_packed_tuple, size);
9556
9557 return HA_EXIT_SUCCESS;
9558 }
9559
9560 /**
9561 Check the specified primary key value is unique and also lock the row
9562
9563 @param[in] key_id key index
9564 @param[in] row_info hold all data for update row, such as old row
9565 data and new row data
9566 @param[out] found whether the primary key exists before.
9567 @param[out] pk_changed whether primary key is changed
9568 @return
9569 HA_EXIT_SUCCESS OK
9570 other HA_ERR error code (can be SE-specific)
9571 */
9572 int ha_rocksdb::check_and_lock_unique_pk(const uint key_id,
9573 const struct update_row_info &row_info,
9574 bool *const found,
9575 const bool skip_unique_check) {
9576 assert(found != nullptr);
9577
9578 assert(row_info.old_pk_slice.size() == 0 ||
9579 row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0);
9580
9581 const bool ignore_pk_unique_check = skip_unique_check;
9582
9583 /*
9584 Perform a read to determine if a duplicate entry exists. For primary
9585 keys, a point lookup will be sufficient.
9586
9587 note: we intentionally don't set options.snapshot here. We want to read
9588 the latest committed data.
9589 */
9590
9591 /*
9592 To prevent race conditions like below, it is necessary to
9593 take a lock for a target row. get_for_update() holds a gap lock if
9594 target key does not exist, so below conditions should never
9595 happen.
9596
9597 1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
9598 -> T2 Put(overwrite) -> T2 commit
9599 2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
9600 -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
9601 */
9602 const rocksdb::Status s =
9603 get_for_update(row_info.tx, *m_pk_descr, row_info.new_pk_slice,
9604 ignore_pk_unique_check ? nullptr : &m_retrieved_record);
9605 if (!s.ok() && !s.IsNotFound()) {
9606 return row_info.tx->set_status_error(table->in_use, s,
9607 *m_key_descr_arr[key_id], m_tbl_def);
9608 }
9609
9610 bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound();
9611
9612 /*
9613 If the pk key has ttl, we may need to pretend the row wasn't
9614 found if it is already expired.
9615 */
9616 if (key_found && m_pk_descr->has_ttl() &&
9617 should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
9618 (row_info.tx->m_snapshot_timestamp
9619 ? row_info.tx->m_snapshot_timestamp
9620 : static_cast<int64_t>(std::time(nullptr))))) {
9621 key_found = false;
9622 }
9623
9624 if (key_found && row_info.old_data == nullptr && m_insert_with_update) {
9625 // In INSERT ON DUPLICATE KEY UPDATE ... case, if the insert failed
9626 // due to a duplicate key, remember the last key and skip the check
9627 // next time
9628 m_dup_key_found = true;
9629
9630 #ifndef NDEBUG
9631 // save it for sanity checking later
9632 m_dup_key_retrieved_record.copy(m_retrieved_record.data(),
9633 m_retrieved_record.size(), &my_charset_bin);
9634 #endif
9635 }
9636
9637 *found = key_found;
9638
9639 return HA_EXIT_SUCCESS;
9640 }
9641
9642 /**
9643 Check the specified secondary key value is unique and also lock the row
9644
9645 @param[in] key_id key index
9646 @param[in] row_info hold all data for update row, such as old row
9647 data and new row data
9648 @param[out] found whether specified key value exists before.
9649 @return
9650 HA_EXIT_SUCCESS OK
9651 other HA_ERR error code (can be SE-specific)
9652 */
9653 int ha_rocksdb::check_and_lock_sk(const uint key_id,
9654 const struct update_row_info &row_info,
9655 bool *const found,
9656 const bool skip_unique_check) {
9657 assert(found != nullptr);
9658 *found = false;
9659
9660 /*
9661 Can skip checking this key if none of the key fields have changed.
9662 */
9663 if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
9664 return HA_EXIT_SUCCESS;
9665 }
9666
9667 KEY *key_info = nullptr;
9668 uint n_null_fields = 0;
9669 uint user_defined_key_parts = 1;
9670
9671 key_info = &table->key_info[key_id];
9672 user_defined_key_parts = key_info->user_defined_key_parts;
9673 /*
9674 If there are no uniqueness requirements, there's no need to obtain a
9675 lock for this key.
9676 */
9677 if (!(key_info->flags & HA_NOSAME)) {
9678 return HA_EXIT_SUCCESS;
9679 }
9680
9681 const Rdb_key_def &kd = *m_key_descr_arr[key_id];
9682
9683 /*
9684 Calculate the new key for obtaining the lock
9685
9686 For unique secondary indexes, the key used for locking does not
9687 include the extended fields.
9688 */
9689 int size =
9690 kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
9691 nullptr, false, 0, user_defined_key_parts, &n_null_fields);
9692 if (n_null_fields > 0) {
9693 /*
9694 If any fields are marked as NULL this will never match another row as
9695 to NULL never matches anything else including another NULL.
9696 */
9697 return HA_EXIT_SUCCESS;
9698 }
9699
9700 const rocksdb::Slice new_slice =
9701 rocksdb::Slice((const char *)m_sk_packed_tuple, size);
9702
9703 /*
9704 Acquire lock on the old key in case of UPDATE
9705 */
9706 if (row_info.old_data != nullptr) {
9707 size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
9708 m_sk_packed_tuple_old, nullptr, false, 0,
9709 user_defined_key_parts);
9710 const rocksdb::Slice old_slice =
9711 rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);
9712
9713 const rocksdb::Status s =
9714 get_for_update(row_info.tx, kd, old_slice, nullptr);
9715 if (!s.ok()) {
9716 return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
9717 }
9718
9719 /*
9720 If the old and new keys are the same we're done since we've already taken
9721 the lock on the old key
9722 */
9723 if (!new_slice.compare(old_slice)) {
9724 return HA_EXIT_SUCCESS;
9725 }
9726 }
9727
9728 /*
9729 Perform a read to determine if a duplicate entry exists - since this is
9730 a secondary indexes a range scan is needed.
9731
9732 note: we intentionally don't set options.snapshot here. We want to read
9733 the latest committed data.
9734 */
9735
9736 const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());
9737
9738 /*
9739 This iterator seems expensive since we need to allocate and free
9740 memory for each unique index.
9741
9742 If this needs to be optimized, for keys without NULL fields, the
9743 extended primary key fields can be migrated to the value portion of the
9744 key. This enables using Get() instead of Seek() as in the primary key
9745 case.
9746
9747 The bloom filter may need to be disabled for this lookup.
9748 */
9749 uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9750 uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
9751 rocksdb::Slice lower_bound_slice;
9752 rocksdb::Slice upper_bound_slice;
9753
9754 const bool total_order_seek = !check_bloom_and_set_bounds(
9755 ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE,
9756 lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
9757 const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
9758
9759 const rocksdb::Status s = get_for_update(row_info.tx, kd, new_slice, nullptr);
9760 if (!s.ok() && !s.IsNotFound()) {
9761 return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
9762 }
9763
9764 rocksdb::Iterator *const iter = row_info.tx->get_iterator(
9765 kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice,
9766 upper_bound_slice, true /* read current data */,
9767 false /* acquire snapshot */);
9768 /*
9769 Need to scan the transaction to see if there is a duplicate key.
9770 Also need to scan RocksDB and verify the key has not been deleted
9771 in the transaction.
9772 */
9773 *found = !read_key_exact(kd, iter, all_parts_used, new_slice,
9774 row_info.tx->m_snapshot_timestamp);
9775
9776 int rc = HA_EXIT_SUCCESS;
9777
9778 if (*found && m_insert_with_update) {
9779 const rocksdb::Slice &rkey = iter->key();
9780 uint pk_size =
9781 kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
9782 if (pk_size == RDB_INVALID_KEY_LEN) {
9783 rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
9784 } else {
9785 m_dup_key_found = true;
9786 m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
9787 &my_charset_bin);
9788 #ifndef NDEBUG
9789 // save it for sanity checking later
9790 m_dup_key_retrieved_record.copy(rkey.data(), rkey.size(),
9791 &my_charset_bin);
9792 #endif
9793 }
9794 }
9795
9796 delete iter;
9797 return rc;
9798 }
9799
9800 /**
  Enumerate all keys to check their uniqueness and also lock them
9802
9803 @param[in] row_info hold all data for update row, such as old row
9804 data and new row data
9805 @param[out] pk_changed whether primary key is changed
9806 @return
9807 HA_EXIT_SUCCESS OK
9808 other HA_ERR error code (can be SE-specific)
9809 */
9810 int ha_rocksdb::check_uniqueness_and_lock(
9811 const struct update_row_info &row_info, bool pk_changed,
9812 bool skip_unique_check) {
9813 /*
9814 Go through each index and determine if the index has uniqueness
9815 requirements. If it does, then try to obtain a row lock on the new values.
9816 Once all locks have been obtained, then perform the changes needed to
9817 update/insert the row.
9818 */
9819 for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
9820 bool found;
9821 int rc;
9822
9823 if (is_pk(key_id, table, m_tbl_def)) {
9824 if (row_info.old_pk_slice.size() > 0 && !pk_changed) {
9825 found = false;
9826 rc = HA_EXIT_SUCCESS;
9827 } else {
9828 rc = check_and_lock_unique_pk(key_id, row_info, &found,
9829 skip_unique_check);
9830 DEBUG_SYNC(ha_thd(), "rocksdb.after_unique_pk_check");
9831 }
9832 } else {
9833 rc = check_and_lock_sk(key_id, row_info, &found, skip_unique_check);
9834 DEBUG_SYNC(ha_thd(), "rocksdb.after_unique_sk_check");
9835 }
9836
9837 if (rc != HA_EXIT_SUCCESS) {
9838 return rc;
9839 }
9840
9841 if (found) {
9842 /* There is a row with this key already, so error out. */
9843 errkey = key_id;
9844 m_dupp_errkey = errkey;
9845
9846 return HA_ERR_FOUND_DUPP_KEY;
9847 }
9848 }
9849
9850 return HA_EXIT_SUCCESS;
9851 }
9852
9853 /**
9854 Check whether secondary key value is duplicate or not
9855
  @param[in]  table_arg  the table currently working on
  @param[in]  key_def    the key definition being checked
  @param[in]  key        secondary key storage data
  @param[out] sk_info    hold secondary key memcmp data (new/old)
9860 @return
9861 HA_EXIT_SUCCESS OK
9862 other HA_ERR error code (can be SE-specific)
9863 */
9864
9865 int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
9866 const Rdb_key_def &key_def,
9867 const rocksdb::Slice *key,
9868 struct unique_sk_buf_info *sk_info) {
9869 uint n_null_fields = 0;
9870 const rocksdb::Comparator *index_comp = key_def.get_cf()->GetComparator();
9871
9872 /* Get proper SK buffer. */
9873 uchar *sk_buf = sk_info->swap_and_get_sk_buf();
9874
9875 /* Get memcmp form of sk without extended pk tail */
9876 uint sk_memcmp_size =
9877 key_def.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
9878
9879 sk_info->sk_memcmp_key =
9880 rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
9881
9882 if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
9883 index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
9884 0) {
9885 return 1;
9886 }
9887
9888 sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
9889 return 0;
9890 }
9891
9892 int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
9893 const rocksdb::Slice &key,
9894 const rocksdb::Slice &value, bool sort) {
9895 DBUG_ENTER_FUNC();
9896 int res;
9897 THD *thd = ha_thd();
9898 if (thd && thd->killed) {
9899 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
9900 }
9901
9902 rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
9903
9904 // In the case of unsorted inserts, m_sst_info allocated here is not
9905 // used to store the keys. It is still used to indicate when tables
9906 // are switched.
9907 if (m_sst_info == nullptr || m_sst_info->is_done()) {
9908 m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
9909 kd.get_name(), cf, *rocksdb_db_options,
9910 THDVAR(ha_thd(), trace_sst_api)));
9911 res = tx->start_bulk_load(this, m_sst_info);
9912 if (res != HA_EXIT_SUCCESS) {
9913 DBUG_RETURN(res);
9914 }
9915 }
9916 assert(m_sst_info);
9917
9918 if (sort) {
9919 Rdb_index_merge *key_merge;
9920 assert(cf != nullptr);
9921
9922 res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
9923 if (res == HA_EXIT_SUCCESS) {
9924 res = key_merge->add(key, value);
9925 }
9926 } else {
9927 res = m_sst_info->put(key, value);
9928 }
9929
9930 DBUG_RETURN(res);
9931 }
9932
int ha_rocksdb::finalize_bulk_load(bool print_client_error) {
  DBUG_ENTER_FUNC();

  int res = HA_EXIT_SUCCESS;

  /* Skip if there are no possible ongoing bulk loads */
  if (m_sst_info) {
    if (m_sst_info->is_done()) {
      // This SST batch has already been wrapped up; just drop our handle.
      m_sst_info.reset();
      DBUG_RETURN(res);
    }

    Rdb_sst_info::Rdb_sst_commit_info commit_info;

    // Wrap up the current work in m_sst_info and get ready to commit.
    // This transfers the responsibility for the commit over to commit_info.
    res = m_sst_info->finish(&commit_info, print_client_error);
    if (res == 0) {
      // Make sure we have work to do - under race condition we could lose
      // to another thread and end up with no work
      if (commit_info.has_work()) {
        // Ingest the finished SST files directly into the column family.
        rocksdb::IngestExternalFileOptions opts;
        opts.move_files = true;
        opts.snapshot_consistency = false;
        opts.allow_global_seqno = false;
        opts.allow_blocking_flush = false;

        const rocksdb::Status s = rdb->IngestExternalFile(
            commit_info.get_cf(), commit_info.get_committed_files(), opts);
        if (!s.ok()) {
          if (print_client_error) {
            Rdb_sst_info::report_error_msg(s, nullptr);
          }
          res = HA_ERR_ROCKSDB_BULK_LOAD;
        } else {
          // Mark the list of SST files as committed, otherwise they'll get
          // cleaned up when commit_info destructs
          commit_info.commit();
        }
      }
    }
    m_sst_info.reset();
  }
  DBUG_RETURN(res);
}
9978
9979 /**
9980 Update an existing primary key record or write a new primary key record
9981
  @param[in] kd            the primary key being updated/written
  @param[in] row_info      holds all row data, such as old row data and
                           new row data
  @param[in] pk_changed    whether primary key is changed
9986 @return
9987 HA_EXIT_SUCCESS OK
9988 Other HA_ERR error code (can be SE-specific)
9989 */
int ha_rocksdb::update_write_pk(const Rdb_key_def &kd,
                                const struct update_row_info &row_info,
                                const bool pk_changed) {
  const uint key_id = kd.get_keyno();
  const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);

  /*
    If the PK has changed, or if this PK uses single deletes and this is an
    update, the old key needs to be deleted. In the single delete case, it
    might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
    resulting in the first PUT(X) showing up.
  */
  if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
                                    can_use_single_delete(key_id)))) {
    const rocksdb::Status s = delete_or_singledelete(
        key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
    }
  }

  // Refresh the cached auto-increment value from the written field, if the
  // table has an auto-increment column.
  if (table->found_next_number_field) {
    update_auto_incr_val_from_field();
  }

  int rc = HA_EXIT_SUCCESS;
  rocksdb::Slice value_slice;
  /* Prepare the new record to be written into RocksDB */
  if ((rc = m_converter->encode_value_slice(
           m_pk_descr, row_info.new_pk_slice, row_info.new_pk_unpack_info,
           !row_info.old_pk_slice.empty(), should_store_row_debug_checksums(),
           m_ttl_bytes, &m_ttl_bytes_updated, &value_slice))) {
    return rc;
  }

  const auto cf = m_pk_descr->get_cf();
  if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
      !hidden_pk) {
    /*
      Write the primary key directly to an SST file using an SstFileWriter
    */
    rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
                       THDVAR(table->in_use, bulk_load_allow_unsorted));
  } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
    /*
      It is responsibility of the user to make sure that the data being
      inserted doesn't violate any unique keys.
    */
    row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
                                                value_slice);
  } else {
    // Regular path: a transactional put.
    const bool assume_tracked = can_assume_tracked(ha_thd());
    const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice,
                                    assume_tracked);
    if (!s.ok()) {
      if (s.IsBusy()) {
        // A Busy status is reported to the SQL layer as a duplicate-key
        // error on the primary key.
        errkey = table->s->primary_key;
        m_dupp_errkey = errkey;
        rc = HA_ERR_FOUND_DUPP_KEY;
      } else {
        rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
                                           m_tbl_def);
      }
    }
  }

  return rc;
}
10058
10059 /**
10060 update an existing secondary key record or write a new secondary key record
10061
10062 @param[in] table_arg Table we're working on
  @param[in] kd            The secondary key being updated/written
  @param[in] row_info      data structure contains old row data and new row data
  @param[in] bulk_load_sk  whether bulk loading is allowed. Currently it is
                           only supported for inserts (no old row data)
10067 @return
10068 HA_EXIT_SUCCESS OK
10069 Other HA_ERR error code (can be SE-specific)
10070 */
int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
                                const Rdb_key_def &kd,
                                const struct update_row_info &row_info,
                                const bool bulk_load_sk) {
  int new_packed_size;
  int old_packed_size;
  int rc = HA_EXIT_SUCCESS;

  rocksdb::Slice new_key_slice;
  rocksdb::Slice new_value_slice;
  rocksdb::Slice old_key_slice;

  const uint key_id = kd.get_keyno();
  /*
    Can skip updating this key if none of the key fields have changed and, if
    this table has TTL, the TTL timestamp has not changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
      (!kd.has_ttl() || !m_ttl_bytes_updated)) {
    return HA_EXIT_SUCCESS;
  }

  // Pack the new key (and its unpack info / checksums into m_sk_tails).
  bool store_row_debug_checksums = should_store_row_debug_checksums();
  new_packed_size =
      kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
                     m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
                     row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);

  if (row_info.old_data != nullptr) {
    // The old value
    old_packed_size = kd.pack_record(
        table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
        &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
        nullptr, m_ttl_bytes);

    /*
      Check if we are going to write the same value. This can happen when
      one does
        UPDATE tbl SET col='foo'
      and we are looking at the row that already has col='foo'.

      We also need to compare the unpack info. Suppose, the collation is
      case-insensitive, and unpack info contains information about whether
      the letters were uppercase and lowercase. Then, both 'foo' and 'FOO'
      will have the same key value, but different data in unpack_info.

      (note: anyone changing bytewise_compare should take this code into
      account)
    */
    if (old_packed_size == new_packed_size &&
        m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
        !(kd.has_ttl() && m_ttl_bytes_updated) &&
        memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
            0 &&
        memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
               m_sk_tails.get_current_pos()) == 0) {
      return HA_EXIT_SUCCESS;
    }

    /*
      Deleting entries from secondary index should skip locking, but
      be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
      statement)
    */
    old_key_slice = rocksdb::Slice(
        reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);

    row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                         old_key_slice);
  }

  new_key_slice = rocksdb::Slice(
      reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
  new_value_slice =
      rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
                     m_sk_tails.get_current_pos());

  // Bulk loading is only used for pure inserts (no old row present).
  if (bulk_load_sk && row_info.old_data == nullptr) {
    rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
  } else {
    row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
                                                new_value_slice);
  }

  return rc;
}
10158
10159 /**
10160 Update existing indexes(PK/SKs) or write new indexes(PK/SKs)
10161
10162 @param[in] row_info hold all row data, such as old key/new key
10163 @param[in] pk_changed whether primary key is changed
10164 @return
10165 HA_EXIT_SUCCESS OK
10166 Other HA_ERR error code (can be SE-specific)
10167 */
10168 int ha_rocksdb::update_write_indexes(const struct update_row_info &row_info,
10169 const bool pk_changed) {
10170 int rc;
10171 bool bulk_load_sk;
10172
10173 // The PK must be updated first to pull out the TTL value.
10174 rc = update_write_pk(*m_pk_descr, row_info, pk_changed);
10175 if (rc != HA_EXIT_SUCCESS) {
10176 return rc;
10177 }
10178
10179 // Update the remaining indexes. Allow bulk loading only if
10180 // allow_sk is enabled
10181 bulk_load_sk = rocksdb_enable_bulk_load_api &&
10182 THDVAR(table->in_use, bulk_load) &&
10183 THDVAR(table->in_use, bulk_load_allow_sk);
10184 for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
10185 if (is_pk(key_id, table, m_tbl_def)) {
10186 continue;
10187 }
10188
10189 rc = update_write_sk(table, *m_key_descr_arr[key_id], row_info,
10190 bulk_load_sk);
10191 if (rc != HA_EXIT_SUCCESS) {
10192 return rc;
10193 }
10194 }
10195
10196 return HA_EXIT_SUCCESS;
10197 }
10198
10199 /**
10200 Update an existing row or write a new row
10201
10202 @param[in] old_data nullptr for write, non-null for update
10203 @param[in] new_data non-null for write/update
10204 @param[in] skip_unique_check whether to check uniqueness
10205 @return
10206 HA_EXIT_SUCCESS OK
10207 Other HA_ERR error code (can be SE-specific)
10208 */
int ha_rocksdb::update_write_row(const uchar *const old_data,
                                 const uchar *const new_data,
                                 const bool skip_unique_check) {
  DBUG_ENTER_FUNC();

  THD *thd = ha_thd();
  if (thd && thd->killed) {
    DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
  }

  bool pk_changed = false;
  struct update_row_info row_info;

  // Collect everything needed to write the row in one place.
  row_info.old_data = old_data;
  row_info.new_data = new_data;
  row_info.skip_unique_check = skip_unique_check;
  row_info.new_pk_unpack_info = nullptr;
  set_last_rowkey(old_data);

  row_info.tx = get_or_create_tx(table->in_use);

  if (old_data != nullptr) {
    // UPDATE path: the old PK is whatever row was last read.
    row_info.old_pk_slice =
        rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());

    /* Determine which indexes need updating. */
    calc_updated_indexes();
  }

  /*
    Get the new row key into row_info.new_pk_slice
  */
  int rc = get_pk_for_update(&row_info);
  if (rc != HA_EXIT_SUCCESS) {
    DBUG_RETURN(rc);
  }

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  if (row_info.old_pk_slice.size() > 0) {
    pk_changed = row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0;
  }

  // Case: We skip both unique checks and rows locks only when bulk load is
  // enabled or if rocksdb_skip_locks_if_skip_unique_check is ON
  if (!THDVAR(table->in_use, bulk_load) &&
      (!rocksdb_skip_locks_if_skip_unique_check || !skip_unique_check)) {
    /*
      Check to see if we are going to have failures because of unique
      keys.  Also lock the appropriate key values.
    */
    rc = check_uniqueness_and_lock(row_info, pk_changed, skip_unique_check);
    if (rc != HA_EXIT_SUCCESS) {
      DBUG_RETURN(rc);
    }
  }

  DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");

  /*
    At this point, all locks have been obtained, and all checks for duplicate
    keys have been performed. No further errors can be allowed to occur from
    here because updates to the transaction will be made and those updates
    cannot be easily removed without rolling back the entire transaction.
  */
  rc = update_write_indexes(row_info, pk_changed);
  if (rc != HA_EXIT_SUCCESS) {
    DBUG_RETURN(rc);
  }

  // Record the write so the binlog/DDL bookkeeping sees it.
  row_info.tx->log_table_write_op(m_tbl_def);

  // If commit-in-the-middle is active, this may commit the batch now.
  if (do_bulk_commit(row_info.tx)) {
    DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10289
10290 /*
10291 Setting iterator upper/lower bounds for Seek/SeekForPrev.
10292 This makes RocksDB to avoid scanning tombstones outside of
10293 the given key ranges, when prefix_same_as_start=true was not passed
10294 (when prefix bloom filter can not be used).
10295 Inversing upper/lower bound is necessary on reverse order CF.
10296 This covers HA_READ_PREFIX_LAST* case as well. For example,
10297 if given query eq condition was 12 bytes and condition was
10298 0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST,
10299 eq_cond_len was 11 (see calc_eq_cond_len() for details).
10300 If the index was reverse order, upper bound would be
10301 0x0000b3eb003f65c5e78857, and lower bound would be
10302 0x0000b3eb003f65c5e78859. These cover given eq condition range.
10303 */
10304 void ha_rocksdb::setup_iterator_bounds(
10305 const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len,
10306 uchar *const lower_bound, uchar *const upper_bound,
10307 rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
10308 // If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be
10309 // able to get better bounds just by using index id directly.
10310 if (eq_cond.size() <= Rdb_key_def::INDEX_NUMBER_SIZE) {
10311 assert(bound_len == Rdb_key_def::INDEX_NUMBER_SIZE);
10312 uint size;
10313 kd.get_infimum_key(lower_bound, &size);
10314 assert(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10315 kd.get_supremum_key(upper_bound, &size);
10316 assert(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10317 } else {
10318 assert(bound_len <= eq_cond.size());
10319 memcpy(upper_bound, eq_cond.data(), bound_len);
10320 kd.successor(upper_bound, bound_len);
10321 memcpy(lower_bound, eq_cond.data(), bound_len);
10322 kd.predecessor(lower_bound, bound_len);
10323 }
10324
10325 if (kd.m_is_reverse_cf) {
10326 *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10327 *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10328 } else {
10329 *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10330 *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10331 }
10332 }
10333
10334 /*
10335 Open a cursor
10336 */
10337
void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
                                     rocksdb::Slice *const slice,
                                     const bool use_all_keys,
                                     const uint eq_cond_len) {
  assert(slice != nullptr);
  assert(slice->size() >= eq_cond_len);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  bool skip_bloom = true;

  const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
  // The size of m_scan_it_lower_bound (and upper) is technically
  // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers. Rather
  // than recalculating that number, we pass in the max of eq_cond_len and
  // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
  // max_packed_sk_len, hence ensuring no buffer overrun.
  //
  // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
  // used.
  if (check_bloom_and_set_bounds(
          ha_thd(), kd, eq_cond, use_all_keys,
          std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_NUMBER_SIZE),
          m_scan_it_lower_bound, m_scan_it_upper_bound,
          &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
    skip_bloom = false;
  }

  /*
    In some cases, setup_scan_iterator() is called multiple times from
    the same query but the bloom filter can not always be used.
    Consider this example: id2 is VARCHAR(30) and PRIMARY KEY (id1, id2).
      select count(*) from t2 WHERE id1=100 and id2 IN
      ('00000000000000000000', '100');
    Here setup_scan_iterator() is called twice, first for
    (id1, id2)=(100, '00000000000000000000') and then for (100, '100').
    If the prefix bloom filter length is 24 bytes, the bloom filter can be
    used for the first condition but not for the second one.
    If the bloom filter condition changes, the iterator currently has to be
    destroyed and re-created.
  */
  if (m_scan_it_skips_bloom != skip_bloom) {
    release_scan_iterator();
  }

  /*
    SQL layer can call rnd_init() multiple times in a row.
    In that case, re-use the iterator, but re-position it at the table start.
  */
  if (!m_scan_it) {
    const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
    if (commit_in_the_middle()) {
      // Commit-in-the-middle mode: pin our own DB snapshot so the scan
      // stays consistent across intermediate commits.
      assert(m_scan_it_snapshot == nullptr);
      m_scan_it_snapshot = rdb->GetSnapshot();

      auto read_opts = rocksdb::ReadOptions();
      read_opts.total_order_seek = true;  // TODO: set based on WHERE conditions
      read_opts.snapshot = m_scan_it_snapshot;
      m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
    } else {
      m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
                                   m_scan_it_lower_bound_slice,
                                   m_scan_it_upper_bound_slice);
    }
    m_scan_it_skips_bloom = skip_bloom;
  }
}
10409
10410 void ha_rocksdb::release_scan_iterator() {
10411 delete m_scan_it;
10412 m_scan_it = nullptr;
10413
10414 if (m_scan_it_snapshot) {
10415 rdb->ReleaseSnapshot(m_scan_it_snapshot);
10416 m_scan_it_snapshot = nullptr;
10417 }
10418 }
10419
10420 void ha_rocksdb::setup_iterator_for_rnd_scan() {
10421 uint key_size;
10422
10423 int key_start_matching_bytes =
10424 m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
10425
10426 rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
10427
10428 setup_scan_iterator(*m_pk_descr, &table_key, false, key_start_matching_bytes);
10429 m_scan_it->Seek(table_key);
10430 m_skip_scan_it_next_call = true;
10431 }
10432
10433 /**
10434 @return
10435 HA_EXIT_SUCCESS OK
10436 other HA_ERR error code (can be SE-specific)
10437 */
10438 int ha_rocksdb::rnd_init(bool scan) {
10439 DBUG_ENTER_FUNC();
10440
10441 m_need_build_decoder = true;
10442 active_index = table->s->primary_key;
10443
10444 THD *thd = ha_thd();
10445 if (thd && thd->killed) {
10446 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10447 }
10448
10449 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10450
10451 if (scan) {
10452 m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
10453 setup_iterator_for_rnd_scan();
10454 } else {
10455 /* We don't need any preparations for rnd_pos() calls. */
10456 }
10457
10458 // If m_lock_rows is on then we will be doing a get_for_update when accessing
10459 // the index, so don't acquire the snapshot right away. Otherwise acquire
10460 // the snapshot immediately.
10461 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10462
10463 DBUG_RETURN(HA_EXIT_SUCCESS);
10464 }
10465
10466 /**
10467 @return
10468 HA_EXIT_SUCCESS OK
10469 other HA_ERR error code (can be SE-specific)
10470 */
10471 int ha_rocksdb::rnd_next(uchar *const buf) {
10472 DBUG_ENTER_FUNC();
10473
10474 check_build_decoder();
10475
10476 int rc;
10477 ha_statistic_increment(&SSV::ha_read_rnd_next_count);
10478 for (;;) {
10479 rc = rnd_next_with_direction(buf, true);
10480 if (!should_recreate_snapshot(rc, m_rnd_scan_is_new_snapshot)) {
10481 break; /* exit the loop */
10482 }
10483 // release the snapshot and iterator and then regenerate them
10484 Rdb_transaction *tx = get_or_create_tx(table->in_use);
10485 tx->release_snapshot();
10486 release_scan_iterator();
10487 setup_iterator_for_rnd_scan();
10488 }
10489
10490 m_rnd_scan_is_new_snapshot = false;
10491
10492 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
10493
10494 DBUG_RETURN(rc);
10495 }
10496
10497 /*
10498 See also secondary_index_read().
10499 */
int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;
  THD *thd = ha_thd();

  table->status = STATUS_NOT_FOUND;

  if (!m_scan_it || !is_valid(m_scan_it)) {
    /*
      We can get here when SQL layer has called

        h->index_init(PRIMARY);
        h->index_read_map(full index tuple, HA_READ_KEY_EXACT);

      In this case, we should return EOF.
    */
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  // Loop until a usable row is produced (TTL-expired and invalidated
  // records are skipped) or the scan terminates.
  for (;;) {
    DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd");
    if (thd && thd->killed) {
      rc = HA_ERR_QUERY_INTERRUPTED;
      break;
    }

    if (m_skip_scan_it_next_call) {
      // The iterator was just positioned (e.g. by rnd_init); use the
      // current entry without advancing.
      m_skip_scan_it_next_call = false;
    } else {
      if (move_forward) {
        m_scan_it->Next(); /* this call cannot fail */
      } else {
        m_scan_it->Prev(); /* this call cannot fail */
      }
    }

    if (!is_valid(m_scan_it)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    /* check if we're out of this table */
    const rocksdb::Slice key = m_scan_it->key();
    if (!m_pk_descr->covers_key(key)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    if (m_lock_rows != RDB_LOCK_NONE) {
      /*
        Lock the row we've just read.

        Now we call get_for_update which will 1) Take a lock and 2) Will fail
        if the row was deleted since the snapshot was taken.
      */
      Rdb_transaction *const tx = get_or_create_tx(table->in_use);
      DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");

      // Skip rows whose TTL has already expired.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
                              tx->m_snapshot_timestamp)) {
        continue;
      }

      const rocksdb::Status s =
          get_for_update(tx, *m_pk_descr, key, &m_retrieved_record);
      if (s.IsNotFound() &&
          should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) {
        // Row disappeared under us; move on to the next one.
        continue;
      }

      if (!s.ok()) {
        DBUG_RETURN(
            tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
      }

      // If we called get_for_update() use the value from that call not from
      // the iterator as it may be stale since we don't have a snapshot
      // when m_lock_rows is not RDB_LOCK_NONE.
      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, buf);
    } else {
      // Use the value from the iterator
      rocksdb::Slice value = m_scan_it->value();

      // Skip rows whose TTL has already expired.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(
              *m_pk_descr, value,
              get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
        continue;
      }

      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, &value, buf);
    }

    table->status = 0;
    break;
  }

  if (!rc) {
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
10607
10608 int ha_rocksdb::rnd_end() {
10609 DBUG_ENTER_FUNC();
10610
10611 m_need_build_decoder = false;
10612
10613 release_scan_iterator();
10614
10615 DBUG_RETURN(HA_EXIT_SUCCESS);
10616 }
10617
10618 void ha_rocksdb::build_decoder() {
10619 m_converter->setup_field_decoders(table->read_set, active_index,
10620 m_keyread_only,
10621 m_lock_rows == RDB_LOCK_WRITE);
10622 }
10623
10624 void ha_rocksdb::check_build_decoder() {
10625 if (m_need_build_decoder) {
10626 build_decoder();
10627 m_need_build_decoder = false;
10628 }
10629 }
10630
10631 /**
10632 @return
10633 HA_EXIT_SUCCESS OK
10634 other HA_ERR error code (can be SE-specific)
10635 */
10636 int ha_rocksdb::index_init(uint idx, bool sorted) {
10637 DBUG_ENTER_FUNC();
10638
10639 m_need_build_decoder = true;
10640 active_index = idx;
10641
10642 THD *thd = ha_thd();
10643 if (thd && thd->killed) {
10644 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10645 }
10646
10647 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10648 assert(tx != nullptr);
10649
10650 // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
10651 // when accessing the index, so don't acquire the snapshot right away.
10652 // Otherwise acquire the snapshot immediately.
10653 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10654
10655 DBUG_RETURN(HA_EXIT_SUCCESS);
10656 }
10657
10658 /**
10659 @return
10660 HA_EXIT_SUCCESS OK
10661 */
10662 int ha_rocksdb::index_end() {
10663 DBUG_ENTER_FUNC();
10664
10665 m_need_build_decoder = false;
10666
10667 release_scan_iterator();
10668
10669 active_index = MAX_KEY;
10670 in_range_check_pushed_down = false;
10671
10672 DBUG_RETURN(HA_EXIT_SUCCESS);
10673 }
10674
10675 /**
10676 Called by the partition manager for truncating tables.
10677
10678 @return
10679 HA_EXIT_SUCCESS OK
10680 other HA_ERR error code (can be SE-specific)
10681 */
10682 int ha_rocksdb::truncate() {
10683 DBUG_ENTER_FUNC();
10684
10685 assert(m_tbl_def != nullptr);
10686
10687 // Reset auto_increment_value to 1 if auto-increment feature is enabled
10688 // By default, the starting valid value for auto_increment_value is 1
10689 DBUG_RETURN(truncate_table(
10690 m_tbl_def, table,
10691 table->found_next_number_field ? 1 : 0 /* auto_increment_value */));
10692 }
10693
10694 /*
10695 Delete the row we've last read. The row is also passed as parameter.
10696
10697 @detail
10698 The caller guarantees table buf points to the row that was just read.
10699 The row is either table->record[0] or table->record[1].
  (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)".)
10701
10702 @return
10703 HA_EXIT_SUCCESS OK
10704 other HA_ERR error code (can be SE-specific)
10705 */
int ha_rocksdb::delete_row(const uchar *const buf) {
  DBUG_ENTER_FUNC();

  assert(buf != nullptr);

  ha_statistic_increment(&SSV::ha_delete_count);
  // Record the packed PK of the row being deleted into m_last_rowkey.
  set_last_rowkey(buf);

  rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  // Delete the primary-key record first; abort on failure before touching
  // any secondary index.
  const uint index = pk_index(table, m_tbl_def);
  rocksdb::Status s =
      delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
  if (!s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def));
  }

  // With a hidden PK, secondary-index keys embed the hidden PK id; extract
  // it from the row key before packing those keys below.
  longlong hidden_pk_id = 0;
  if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) {
    int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      DBUG_RETURN(err);
    }
  }

  // Delete the record for every secondary index
  for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
    if (!is_pk(i, table, m_tbl_def)) {
      int packed_size;
      const Rdb_key_def &kd = *m_key_descr_arr[i];

      // The unique key should be locked so that behavior is
      // similar to InnoDB and reduce conflicts. The key
      // used for locking does not include the extended fields.
      const KEY *key_info = &table->key_info[i];
      if (key_info->flags & HA_NOSAME) {
        uint user_defined_key_parts = key_info->user_defined_key_parts;
        uint n_null_fields = 0;

        // Pack only the user-defined prefix of the key (no extended/PK
        // suffix) for the locking lookup.
        packed_size = kd.pack_record(table, m_pack_buffer, buf,
                                     m_sk_packed_tuple, nullptr, false, 0,
                                     user_defined_key_parts, &n_null_fields);

        // NULL fields are considered unique, so no lock is needed
        if (n_null_fields == 0) {
          rocksdb::Slice sk_slice(
              reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
          const rocksdb::Status s = get_for_update(tx, kd, sk_slice, nullptr);
          if (!s.ok()) {
            DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def));
          }
        }
      }

      // Pack the full secondary key (including hidden PK id, if any) and
      // queue its deletion in the transaction's write batch.
      packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
                                   nullptr, false, hidden_pk_id);
      rocksdb::Slice secondary_key_slice(
          reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
      tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                  secondary_key_slice);
    }
  }

  tx->log_table_write_op(m_tbl_def);

  if (do_bulk_commit(tx)) {
    DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
  }

  // Not protected by ddl_manger lock for performance
  // reasons. This is an estimate value anyway.
  dec_table_n_rows();
  update_table_stats_if_needed();
  update_row_stats(ROWS_DELETED);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10784
10785 rocksdb::Status ha_rocksdb::delete_or_singledelete(
10786 uint index, Rdb_transaction *const tx,
10787 rocksdb::ColumnFamilyHandle *const column_family,
10788 const rocksdb::Slice &key) {
10789 const bool assume_tracked = can_assume_tracked(ha_thd());
10790 if (can_use_single_delete(index)) {
10791 return tx->single_delete(column_family, key, assume_tracked);
10792 }
10793 return tx->delete_key(column_family, key, assume_tracked);
10794 }
10795
10796 void ha_rocksdb::update_stats(void) {
10797 DBUG_ENTER_FUNC();
10798
10799 stats.records = 0;
10800 stats.index_file_length = 0ul;
10801 stats.data_file_length = 0ul;
10802 stats.mean_rec_length = 0;
10803
10804 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10805 if (is_pk(i, table, m_tbl_def)) {
10806 stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
10807 stats.records = m_pk_descr->m_stats.m_rows;
10808 } else {
10809 stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
10810 }
10811 }
10812
10813 DBUG_VOID_RETURN;
10814 }
10815
10816 int ha_rocksdb::adjust_handler_stats_table_scan() {
10817 DBUG_ENTER_FUNC();
10818
10819 bool should_recalc_stats = false;
10820 if (static_cast<longlong>(stats.data_file_length) < 0) {
10821 stats.data_file_length = 0;
10822 should_recalc_stats = true;
10823 }
10824
10825 if (static_cast<longlong>(stats.index_file_length) < 0) {
10826 stats.index_file_length = 0;
10827 should_recalc_stats = true;
10828 }
10829
10830 if (static_cast<longlong>(stats.records) < 0) {
10831 stats.records = 1;
10832 should_recalc_stats = true;
10833 }
10834
10835 if (should_recalc_stats) {
10836 // If any of the stats is corrupt, add the table to the index stats
10837 // recalc queue.
10838 rdb_is_thread.add_index_stats_request(m_tbl_def->full_tablename());
10839 }
10840 DBUG_RETURN(HA_EXIT_SUCCESS);
10841 }
10842
10843 /**
10844 @return
10845 HA_EXIT_SUCCESS OK
10846 HA_EXIT_FAILURE Error
10847 */
10848 int ha_rocksdb::info(uint flag) {
10849 DBUG_ENTER_FUNC();
10850
10851 if (!table) DBUG_RETURN(HA_EXIT_FAILURE);
10852
10853 if (flag & HA_STATUS_VARIABLE) {
10854 /*
10855 Test only to simulate corrupted stats
10856 */
10857 DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
10858 m_pk_descr->m_stats.m_actual_disk_size =
10859 -m_pk_descr->m_stats.m_actual_disk_size;);
10860
10861 update_stats();
10862 if (rocksdb_table_stats_use_table_scan) {
10863 int ret = adjust_handler_stats_table_scan();
10864 if (ret != HA_EXIT_SUCCESS) {
10865 return ret;
10866 }
10867 } else {
10868 int ret = adjust_handler_stats_sst_and_memtable();
10869 if (ret != HA_EXIT_SUCCESS) {
10870 return ret;
10871 }
10872 }
10873
10874 if (rocksdb_debug_optimizer_n_rows > 0) {
10875 stats.records = rocksdb_debug_optimizer_n_rows;
10876 }
10877
10878 if (stats.records != 0) {
10879 stats.mean_rec_length = stats.data_file_length / stats.records;
10880 }
10881 }
10882
10883 if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) {
10884 ref_length = m_pk_descr->max_storage_fmt_length();
10885
10886 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10887 if (is_hidden_pk(i, table, m_tbl_def)) {
10888 continue;
10889 }
10890 KEY *const k = &table->key_info[i];
10891 for (uint j = 0; j < k->actual_key_parts; j++) {
10892 const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
10893 uint x;
10894
10895 if (k_stats.m_distinct_keys_per_prefix.size() > j &&
10896 k_stats.m_distinct_keys_per_prefix[j] > 0) {
10897 x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j];
10898 /*
10899 If the number of rows is less than the number of prefixes (due to
10900 sampling), the average number of rows with the same prefix is 1.
10901 */
10902 if (x == 0) {
10903 x = 1;
10904 }
10905 } else {
10906 x = 0;
10907 }
10908 if (x > stats.records) x = stats.records;
10909 if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
10910 rocksdb_debug_optimizer_n_rows > 0) {
10911 // Fake cardinality implementation. For example, (idx1, idx2, idx3)
10912 // index
10913 // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1.
10914 // rec_per_key for the whole index is 1, and multiplied by 2^n if
10915 // n suffix columns of the index are not used.
10916 x = 1 << (k->actual_key_parts - j - 1);
10917 }
10918 k->rec_per_key[j] = x;
10919 }
10920 }
10921
10922 stats.create_time = m_tbl_def->get_create_time();
10923 }
10924
10925 if (flag & HA_STATUS_TIME) {
10926 stats.update_time = m_tbl_def->m_update_time;
10927 }
10928
10929 if (flag & HA_STATUS_ERRKEY) {
10930 /*
10931 Currently we support only primary keys so we know which key had a
10932 uniqueness violation.
10933 */
10934 errkey = m_dupp_errkey;
10935 dup_ref = m_pk_tuple; // TODO(?): this should store packed PK.
10936 }
10937
10938 if (flag & HA_STATUS_AUTO) {
10939 stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
10940 }
10941
10942 DBUG_RETURN(HA_EXIT_SUCCESS);
10943 }
10944
10945 void ha_rocksdb::position(const uchar *const record) {
10946 DBUG_ENTER_FUNC();
10947
10948 longlong hidden_pk_id = 0;
10949 if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
10950 assert(false); // should never reach here
10951 }
10952
10953 /*
10954 Get packed primary key value from the record.
10955
10956 (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
10957 handle most cases, but there is an exception to this: when slave applies
10958 RBR events, it fills the record and then calls position(); rnd_pos())
10959
10960 Also note that we *can* rely on PK column values being available. This is
10961 because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
10962 When that is true, table->prepare_for_position() adds PK columns into the
10963 read set (this may potentially disable index-only access when PK column
10964 cannot be restored from its mem-comparable form in the secondary indexes).
10965 */
10966 const uint packed_size = m_pk_descr->pack_record(
10967 table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);
10968
10969 /*
10970 It could be that mem-comparable form of PK occupies less than ref_length
10971 bytes. Fill the remainder with zeros.
10972 */
10973 if (ref_length > packed_size) {
10974 memset(ref + packed_size, 0, ref_length - packed_size);
10975 }
10976
10977 DBUG_VOID_RETURN;
10978 }
10979
10980 /**
10981 @return
10982 HA_EXIT_SUCCESS OK
10983 other HA_ERR error code (can be SE-specific)
10984 */
10985 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
10986 DBUG_ENTER_FUNC();
10987
10988 check_build_decoder();
10989
10990 int rc;
10991 size_t len;
10992
10993 ha_statistic_increment(&SSV::ha_read_rnd_count);
10994 len = m_pk_descr->key_length(table,
10995 rocksdb::Slice((const char *)pos, ref_length));
10996 if (len == size_t(-1)) {
10997 DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */
10998 }
10999
11000 rc = get_row_by_rowid(buf, pos, len);
11001
11002 if (!rc) {
11003 update_row_stats(ROWS_READ);
11004 }
11005
11006 DBUG_RETURN(rc);
11007 }
11008
11009 /*
11010 @brief
11011 Calculate (if needed) the bitmap of indexes that are modified by the
11012 current query
11013
11014 @detail
11015 The calculation done by checking index definitions against the
11016 table->write_set
11017 column bitmap.
11018 */
11019
11020 void ha_rocksdb::calc_updated_indexes() {
11021 m_update_scope.clear_all();
11022
11023 for (uint keynr = 0; keynr < table->s->keys; keynr++) {
11024 const Rdb_key_def &kd = *m_key_descr_arr[keynr];
11025 /* Walk over all key parts, including the "extended key" suffix */
11026 const uint key_parts = kd.get_key_parts();
11027 for (uint kp = 0; kp < key_parts; kp++) {
11028 if (has_hidden_pk(table) && kp + 1 == key_parts) break;
11029
11030 Field *const field = kd.get_table_field_for_part_no(table, kp);
11031 if (bitmap_is_set(table->write_set, field->field_index)) {
11032 m_update_scope.set_bit(keynr);
11033 break;
11034 }
11035 }
11036 }
11037 }
11038
11039 /**
11040 Update an existing row
11041 @param[in] old_data nullptr for write, non-null for update
11042 @param[in] new_data non-null for write/update
11043 @return
11044 HA_EXIT_SUCCESS OK
11045 other HA_ERR error code (can be SE-specific)
11046 */
11047 int ha_rocksdb::update_row(const uchar *const old_data, uchar *const new_data) {
11048 DBUG_ENTER_FUNC();
11049
11050 assert(old_data != nullptr);
11051 assert(new_data != nullptr);
11052 assert(m_lock_rows == RDB_LOCK_WRITE);
11053 /*
11054 old_data points to record we're updating. It is the same as the record
11055 we've just read (for multi-table UPDATE, too, because SQL layer will make
11056 an rnd_pos() call to re-read the record before calling update_row())
11057 */
11058 assert(new_data == table->record[0]);
11059
11060 ha_statistic_increment(&SSV::ha_update_count);
11061 const int rv = update_write_row(old_data, new_data, skip_unique_check());
11062
11063 if (rv == 0) {
11064 update_table_stats_if_needed();
11065 update_row_stats(ROWS_UPDATED);
11066 }
11067
11068 DBUG_RETURN(rv);
11069 }
11070
11071 void ha_rocksdb::update_table_stats_if_needed() {
11072 DBUG_ENTER_FUNC();
11073
11074 if (!rocksdb_table_stats_use_table_scan) {
11075 DBUG_VOID_RETURN;
11076 }
11077
11078 /*
11079 InnoDB performs a similar operation to update counters during query
11080 processing. Because the changes in MyRocks are made to a write batch,
11081 it is possible for the table scan cardinality calculation to trigger
11082 before the transaction performing the update commits. Hence the
11083 cardinality scan might miss the keys for these pending transactions.
11084 */
11085 uint64 counter = m_tbl_def->m_tbl_stats.m_stat_modified_counter++;
11086 uint64 n_rows = m_tbl_def->m_tbl_stats.m_stat_n_rows;
11087
11088 if (counter > std::max(rocksdb_table_stats_recalc_threshold_count,
11089 static_cast<uint64>(
11090 n_rows * rocksdb_table_stats_recalc_threshold_pct /
11091 100.0))) {
11092 // Add the table to the recalc queue
11093 rdb_is_thread.add_index_stats_request(m_tbl_def->full_tablename());
11094 m_tbl_def->m_tbl_stats.m_stat_modified_counter = 0;
11095 }
11096
11097 DBUG_VOID_RETURN;
11098 }
11099
11100 /* The following function was copied from ha_blackhole::store_lock: */
/* The following function was copied from ha_blackhole::store_lock: */
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  assert(thd != nullptr);
  assert(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    // Any write-capable lock type: rows read by this statement will be
    // locked exclusively.
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    m_lock_rows = RDB_LOCK_READ;
  } else if (lock_type != TL_IGNORE) {
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          lock_type == TL_READ_WITH_SHARED_LOCKS ||
          lock_type == TL_READ_NO_INSERT ||
          (lock_type != TL_IGNORE && sql_command != SQLCOM_SELECT)) {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        // NOTE(review): condition order mirrors the InnoDB original —
        // do not rearrange.
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
11179
11180 void ha_rocksdb::read_thd_vars(THD *const thd) {
11181 m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
11182 m_converter->set_verify_row_debug_checksums(
11183 THDVAR(thd, verify_row_debug_checksums));
11184 m_checksums_pct = THDVAR(thd, checksums_pct);
11185 }
11186
11187 /**
11188 @return
11189 HA_EXIT_SUCCESS OK
11190 other HA_ERR error code (cannot be SE-specific)
11191 */
int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
  DBUG_ENTER_FUNC();

  assert(thd != nullptr);

  int res = HA_EXIT_SUCCESS;
  // Reject writes under statement/mixed binlog format unless explicitly
  // allowed via unsafe_for_binlog (replication slave threads are exempt).
  int binlog_format = my_core::thd_binlog_format(thd);
  bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);
  if (lock_type == F_WRLCK && !thd->rli_slave && !unsafe_for_binlog &&
      binlog_format != BINLOG_FORMAT_ROW &&
      binlog_format != BINLOG_FORMAT_UNSPEC &&
      my_core::thd_binlog_filter_ok(thd)) {
    my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }

  if (lock_type == F_UNLCK) {
    // Statement end for this table: release our use of the transaction.
    Rdb_transaction *const tx = get_tx_from_thd(thd);

    if (tx) {
      tx->io_perf_end_and_record(&m_io_perf);
      tx->m_n_mysql_tables_in_use--;
      if (tx->m_n_mysql_tables_in_use == 0 &&
          !my_core::thd_test_options(thd,
                                     OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
        /*
          Do like InnoDB: when we get here, it's time to commit a
          single-statement transaction.

          If the statement involved multiple tables, this code will be executed
          for each of them, but that's ok because non-first tx->commit() calls
          will be no-ops.
        */
        if (tx->commit_or_rollback()) {
          res = HA_ERR_INTERNAL_ERROR;
        }
      }
    }
  } else {
    // Statement start: only READ COMMITTED and REPEATABLE READ are
    // supported isolation levels.
    if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
        my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
      my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0),
               tx_isolation_names[my_core::thd_tx_isolation(thd)]);
      DBUG_RETURN(HA_ERR_UNSUPPORTED);
    }
    /*
      It's nice to do the following on start of every statement. The problem
      is, handler->start_stmt() is not called for INSERTs.
      So, we put this code here.
    */
    Rdb_transaction *const tx = get_or_create_tx(thd);
    read_thd_vars(thd);

    if (skip_unique_check()) {
      // ON DUPLICATE KEY UPDATE / REPLACE semantics depend on unique-key
      // checks, so they cannot be combined with skipping those checks.
      if ((thd->lex->sql_command == SQLCOM_INSERT ||
           thd->lex->sql_command == SQLCOM_LOAD ||
           thd->lex->sql_command == SQLCOM_REPLACE) &&
          (thd->lex->duplicates == DUP_REPLACE ||
           thd->lex->duplicates == DUP_UPDATE)) {
        my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query().str);
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
    }

    if (lock_type == F_WRLCK) {
      if (tx->is_tx_read_only()) {
        my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }

      /*
        SQL layer signals us to take a write lock. It does so when starting DML
        statement. We should put locks on the rows we're reading.

        Note: sometimes, external_lock() can be called without a prior
        ::store_lock call. That's why we need to set lock_* members here, too.
      */
      m_lock_rows = RDB_LOCK_WRITE;

      if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
          thd->lex->sql_command == SQLCOM_DROP_INDEX ||
          thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
        tx->m_ddl_transaction = true;
      }
    }
    tx->m_n_mysql_tables_in_use++;
    rocksdb_register_tx(rocksdb_hton, thd, tx);
    tx->io_perf_start(&m_io_perf);
  }

  DBUG_RETURN(res);
}
11284
11285 /**
11286 @note
11287 A quote from ha_innobase::start_stmt():
11288 <quote>
11289 MySQL calls this function at the start of each SQL statement inside LOCK
11290 TABLES. Inside LOCK TABLES the ::external_lock method does not work to
11291 mark SQL statement borders.
11292 </quote>
11293
11294 @return
11295 HA_EXIT_SUCCESS OK
11296 */
11297
11298 int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
11299 DBUG_ENTER_FUNC();
11300
11301 assert(thd != nullptr);
11302
11303 Rdb_transaction *const tx = get_or_create_tx(thd);
11304 read_thd_vars(thd);
11305 rocksdb_register_tx(ht, thd, tx);
11306 tx->io_perf_start(&m_io_perf);
11307
11308 DBUG_RETURN(HA_EXIT_SUCCESS);
11309 }
11310
11311 rocksdb::Range get_range(uint32_t i,
11312 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11313 int offset1, int offset2) {
11314 uchar *buf_begin = buf;
11315 uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
11316 rdb_netbuf_store_index(buf_begin, i + offset1);
11317 rdb_netbuf_store_index(buf_end, i + offset2);
11318
11319 return rocksdb::Range(
11320 rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
11321 rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
11322 }
11323
// Convenience overload: forward to the index-number based get_range()
// using the key definition's index number.
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
11329
11330 rocksdb::Range get_range(const Rdb_key_def &kd,
11331 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
11332 if (kd.m_is_reverse_cf) {
11333 return myrocks::get_range(kd, buf, 1, 0);
11334 } else {
11335 return myrocks::get_range(kd, buf, 0, 1);
11336 }
11337 }
11338
// Full key range for this handler's i-th key definition.
rocksdb::Range ha_rocksdb::get_range(
    const int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
11343
11344 /*
11345 This function is called with total_order_seek=true, but
11346 upper/lower bound setting is not necessary.
11347 Boundary set is useful when there is no matching key,
11348 but in drop_index_thread's case, it means index is marked as removed,
11349 so no further seek will happen for the index id.
11350 */
11351 static bool is_myrocks_index_empty(rocksdb::ColumnFamilyHandle *cfh,
11352 const bool is_reverse_cf,
11353 const rocksdb::ReadOptions &read_opts,
11354 const uint index_id) {
11355 bool index_removed = false;
11356 uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
11357 rdb_netbuf_store_uint32(key_buf, index_id);
11358 const rocksdb::Slice key =
11359 rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf));
11360 std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
11361 rocksdb_smart_seek(is_reverse_cf, it.get(), key);
11362 if (!it->Valid()) {
11363 index_removed = true;
11364 } else {
11365 if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE)) {
11366 // Key does not have same prefix
11367 index_removed = true;
11368 }
11369 }
11370 return index_removed;
11371 }
11372
11373 /*
11374 Drop index thread's main logic
11375 */
11376
// Background loop: reclaims data for dropped indexes and removes column
// families marked as dropped. Holds m_signal_mutex while waiting, drops it
// for the expensive scan/compaction work, and re-acquires it at the bottom
// of each iteration.
void Rdb_drop_index_thread::run() {
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);

  for (;;) {
    // The stop flag might be set by shutdown command
    // after drop_index_thread releases signal_mutex
    // (i.e. while executing expensive Seek()). To prevent drop_index_thread
    // from entering long cond_timedwait, checking if stop flag
    // is true or not is needed, with drop_index_interrupt_mutex held.
    if (m_killed) {
      break;
    }

    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    // Sleep long (24h) when idle; poll every minute while indexes are
    // pending drop.
    ts.tv_sec += dict_manager.is_drop_index_empty()
                     ? 24 * 60 * 60  // no filtering
                     : 60;           // filtering

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_killed) {
      break;
    }
    // make sure, no program error is returned
    assert(ret == 0 || ret == ETIMEDOUT);
    // Release the signal mutex during the potentially long-running work
    // below; re-acquired before the next wait.
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    std::unordered_set<GL_INDEX_ID> indices;
    dict_manager.get_ongoing_drop_indexes(&indices);
    if (!indices.empty()) {
      std::unordered_set<GL_INDEX_ID> finished;
      rocksdb::ReadOptions read_opts;
      read_opts.total_order_seek = true;  // disable bloom filter

      for (const auto d : indices) {
        uint32 cf_flags = 0;
        if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
          sql_print_error(
              "RocksDB: Failed to get column family flags "
              "from cf id %u. MyRocks data dictionary may "
              "get corrupted.",
              d.cf_id);
          abort();
        }

        std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
            cf_manager.get_cf(d.cf_id);
        assert(cfh);

        // A column family already marked dropped is removed wholesale
        // further below; no per-index cleanup is needed here.
        if (dict_manager.get_dropped_cf(d.cf_id)) {
          finished.insert(d);
          continue;
        }

        const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;

        uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
        rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
                                         is_reverse_cf ? 0 : 1);

        // First drop whole SST files fully contained in the range (cheap),
        // then compact the remainder away.
        rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh.get(),
                                                    &range.start, &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }

        status = rdb->CompactRange(getCompactRangeOptions(), cfh.get(),
                                   &range.start, &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        // Only mark the index done once no keys with its prefix remain.
        if (is_myrocks_index_empty(cfh.get(), is_reverse_cf, read_opts,
                                   d.index_id)) {
          finished.insert(d);
        }
      }

      if (!finished.empty()) {
        dict_manager.finish_drop_indexes(finished);
      }
    }

    DBUG_EXECUTE_IF("rocksdb_drop_cf", {
      THD *thd = new THD();
      thd->thread_stack = reinterpret_cast<char *>(&(thd));
      thd->store_globals();

      static constexpr char act[] = "now wait_for ready_to_drop_cf";
      assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));

      thd->restore_globals();
      delete thd;
    });

    // Remove dropped column family
    // 1. Get all cf ids from ongoing_index_drop.
    // 2. Get all cf ids for cfs marked as dropped.
    // 3. If a cf id is in the list of ongoing_index_drop
    // , skip removing this cf. It will be removed later.
    // 4. If it is not, proceed to remove the cf.
    //
    // This should be under dict_manager lock

    {
      std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
      std::unordered_set<uint32> dropped_cf_ids;
      dict_manager.get_all_dropped_cfs(&dropped_cf_ids);

      if (!dropped_cf_ids.empty()) {
        std::unordered_set<GL_INDEX_ID> ongoing_drop_indices;
        dict_manager.get_ongoing_drop_indexes(&ongoing_drop_indices);

        std::unordered_set<uint32> ongoing_drop_cf_ids;
        for (const auto index : ongoing_drop_indices) {
          ongoing_drop_cf_ids.insert(index.cf_id);
        }

        for (const auto cf_id : dropped_cf_ids) {
          if (ongoing_drop_cf_ids.find(cf_id) == ongoing_drop_cf_ids.end()) {
            cf_manager.remove_dropped_cf(&dict_manager, rdb, cf_id);
          }
        }
      }
    }

    DBUG_EXECUTE_IF("rocksdb_drop_cf", {
      THD *thd = new THD();
      thd->thread_stack = reinterpret_cast<char *>(&(thd));
      thd->store_globals();

      static constexpr char act[] = "now signal drop_cf_done";
      assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));

      thd->restore_globals();
      delete thd;
    });
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
}
11525
11526 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
11527 assert(tablename != nullptr);
11528
11529 std::string str;
11530 if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) {
11531 // We were not passed table name?
11532 assert(0);
11533 return nullptr;
11534 }
11535
11536 return ddl_manager.find(str);
11537 }
11538
11539 /*
  Overloaded delete_table(): removes the table's metadata from the data
  dictionary immediately, while the real data is deleted asynchronously by a
  background thread.
11542
11543 @param tbl IN MyRocks table definition
11544
11545 @return
11546 HA_EXIT_SUCCESS OK
11547 other HA_ERR error code (can be SE-specific)
11548 */
int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) {
  DBUG_ENTER_FUNC();

  assert(tbl != nullptr);
  assert(m_tbl_def == nullptr || m_tbl_def == tbl);

  // Batch all dictionary modifications so they commit atomically.
  const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
  rocksdb::WriteBatch *const batch = wb.get();

  DBUG_EXECUTE_IF("rocksdb_before_delete_table", {
    static constexpr char act[] =
        "now signal ready_to_mark_cf_dropped_before_delete_table wait_for "
        "mark_cf_dropped_done_before_delete_table";
    assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
  });

  {
    std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
    // Mark all of the table's indexes as pending-drop; the key-range data
    // is reclaimed later by the drop-index background thread.
    dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);

    /*
      Remove the table entry in data dictionary (this will also remove it from
      the persistent data dictionary).
    */
    ddl_manager.remove(tbl, batch, true);

    int err = dict_manager.commit(batch);
    if (err) {
      DBUG_RETURN(err);
    }
  }

  DBUG_EXECUTE_IF("rocksdb_after_delete_table", {
    static constexpr char act[] =
        "now signal ready_to_mark_cf_dropped_after_delete_table "
        "wait_for mark_cf_dropped_done_after_delete_table";
    assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
  });

  // Wake the background thread so it starts reclaiming the dropped data.
  rdb_drop_idx_thread.signal();
  // avoid dangling pointer
  m_tbl_def = nullptr;
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11593
11594 /*
11595 Note: the following function is called when the table is not open. That is,
11596 this->table==nullptr, pk_key_descr==nullptr, etc.
11597
11598 tablename points to line in form "./dbname/tablename".
11599
11600 @return
11601 HA_EXIT_SUCCESS OK
11602 other HA_ERR error code (can be SE-specific)
11603 */
11604
11605 int ha_rocksdb::delete_non_partitioned_table(const char *const tablename) {
11606 DBUG_ENTER_FUNC();
11607
11608 assert(tablename != nullptr);
11609
11610 /* Find the table in the hash */
11611 Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
11612 if (!tbl) {
11613 DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
11614 }
11615
11616 DBUG_RETURN(delete_table(tbl));
11617 }
11618
11619 static int init_partition_handler(THD *thd, const std::string &partition_string,
11620 ha_rockspart &file) {
11621 assert(thd);
11622 MEM_ROOT *mem_root = thd->mem_root;
11623
11624 partition_info *part_info =
11625 native_part::parse_partition_info(thd, partition_string);
11626
11627 if (file.init_partitioning(mem_root))
11628 return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
11629
11630 file.set_part_info(part_info, false);
11631
11632 if (file.initialize_partition(mem_root))
11633 return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
11634
11635 return 0;
11636 }
11637
11638 int ha_rocksdb::delete_partitioned_table(
11639 const char *const tablename, const std::string &partition_info_str) {
11640 ha_rockspart file(rocksdb_hton, nullptr);
11641 int result = init_partition_handler(ha_thd(), partition_info_str, file);
11642 if (result) return result;
11643 return file.delete_table(tablename);
11644 }
11645
11646 int ha_rocksdb::delete_table(const char *const tablename) {
11647 assert(tablename);
11648 std::string partition_info_str;
11649 if (!native_part::get_part_str_for_table(tablename, partition_info_str))
11650 return HA_ERR_TABLE_CORRUPT;
11651 if (partition_info_str.empty())
11652 return delete_non_partitioned_table(tablename);
11653 return delete_partitioned_table(tablename, partition_info_str);
11654 }
11655
11656 int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
11657 const rocksdb::WriteOptions wo =
11658 rdb_get_rocksdb_write_options(handler::ha_thd());
11659
11660 rocksdb::ReadOptions opts;
11661 opts.total_order_seek = true;
11662 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
11663
11664 char key_buf[MAX_KEY_LENGTH];
11665 uint key_len;
11666
11667 uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
11668 uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
11669 rocksdb::Slice lower_bound_slice;
11670 rocksdb::Slice upper_bound_slice;
11671
11672 /*
11673 Remove all records in each index.
11674 (This is is not crash-safe, but it doesn't matter, because bulk row
11675 deletion will be handled on rocksdb side)
11676 */
11677 for (uint i = 0; i < tbl->m_key_count; i++) {
11678 const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
11679 kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
11680 rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
11681 const rocksdb::Slice table_key(key_buf, key_len);
11682 assert(key_len == Rdb_key_def::INDEX_NUMBER_SIZE);
11683 if (THDVAR(ha_thd(), enable_iterate_bounds)) {
11684 setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE,
11685 lower_bound_buf, upper_bound_buf,
11686 &lower_bound_slice, &upper_bound_slice);
11687 opts.iterate_lower_bound = &lower_bound_slice;
11688 opts.iterate_upper_bound = &upper_bound_slice;
11689 } else {
11690 opts.iterate_lower_bound = nullptr;
11691 opts.iterate_upper_bound = nullptr;
11692 }
11693 std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));
11694
11695 it->Seek(table_key);
11696 while (it->Valid()) {
11697 const rocksdb::Slice key = it->key();
11698 if (!kd.covers_key(key)) {
11699 break;
11700 }
11701
11702 rocksdb::Status s;
11703 if (can_use_single_delete(i)) {
11704 s = rdb->SingleDelete(wo, cf, key);
11705 } else {
11706 s = rdb->Delete(wo, cf, key);
11707 }
11708
11709 if (!s.ok()) {
11710 return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def);
11711 }
11712
11713 it->Next();
11714 }
11715 }
11716
11717 return HA_EXIT_SUCCESS;
11718 }
11719
11720 int ha_rocksdb::rename_partitioned_table(const char *const from,
11721 const char *const to,
11722 const std::string &partition_string) {
11723 ha_rockspart file(rocksdb_hton, nullptr);
11724 int result = init_partition_handler(ha_thd(), partition_string, file);
11725 if (result) return result;
11726 return file.rename_table(from, to);
11727 }
11728
11729 /**
11730 @return
11731 HA_EXIT_SUCCESS OK
11732 other HA_ERR error code (cannot be SE-specific)
11733 */
11734 int ha_rocksdb::rename_non_partitioned_table(const char *const from,
11735 const char *const to) {
11736 DBUG_ENTER_FUNC();
11737
11738 assert(from != nullptr);
11739 assert(to != nullptr);
11740
11741 std::string from_str;
11742 std::string to_str;
11743 std::string from_db;
11744 std::string to_db;
11745 int rc;
11746
11747 if (rdb_is_tablename_normalized(from)) {
11748 from_str = from;
11749 } else {
11750 rc = rdb_normalize_tablename(from, &from_str);
11751 if (rc != HA_EXIT_SUCCESS) {
11752 DBUG_RETURN(rc);
11753 }
11754 }
11755
11756 rc = rdb_split_normalized_tablename(from_str, &from_db);
11757 if (rc != HA_EXIT_SUCCESS) {
11758 DBUG_RETURN(rc);
11759 }
11760
11761 if (rdb_is_tablename_normalized(to)) {
11762 to_str = to;
11763 } else {
11764 rc = rdb_normalize_tablename(to, &to_str);
11765 if (rc != HA_EXIT_SUCCESS) {
11766 DBUG_RETURN(rc);
11767 }
11768 }
11769
11770 rc = rdb_split_normalized_tablename(to_str, &to_db);
11771 if (rc != HA_EXIT_SUCCESS) {
11772 DBUG_RETURN(rc);
11773 }
11774
11775 // If the user changed the database part of the name then validate that the
11776 // 'to' database exists.
11777 if (from_db != to_db && !rdb_database_exists(to_db)) {
11778 // If we return a RocksDB specific error code here we get
11779 // "error: 206 - Unknown error 206". InnoDB gets
11780 // "error -1 - Unknown error -1" so let's match them.
11781 DBUG_RETURN(-1);
11782 }
11783
11784 DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";);
11785
11786 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11787 rocksdb::WriteBatch *const batch = wb.get();
11788
11789 // rename table is under dict_manager lock, and the cfs used
11790 // by indices of this table cannot be dropped during the process.
11791 dict_manager.lock();
11792
11793 if (ddl_manager.rename(from_str, to_str, batch)) {
11794 rc = HA_ERR_ROCKSDB_INVALID_TABLE;
11795 } else {
11796 rc = dict_manager.commit(batch);
11797 }
11798 dict_manager.unlock();
11799
11800 DBUG_RETURN(rc);
11801 }
11802
11803 int ha_rocksdb::rename_table(const char *const from, const char *const to) {
11804 assert(from);
11805 assert(to);
11806
11807 const char* from2 = from;
11808 const char* to2 = to;
11809 std::string from_canon, to_canon;
11810 int rc;
11811
11812 if (rdb_is_tablename_normalized(from)) {
11813 rc = rdb_make_canonical_tablename(from, &from_canon);
11814 if (rc != HA_EXIT_SUCCESS) {
11815 return rc;
11816 }
11817
11818 from2 = from_canon.c_str();
11819 }
11820
11821 if (rdb_is_tablename_normalized(to)) {
11822 rc = rdb_make_canonical_tablename(to, &to_canon);
11823 if (rc != HA_EXIT_SUCCESS) {
11824 return rc;
11825 }
11826 to2 = to_canon.c_str();
11827 }
11828
11829 std::string partition_info_str;
11830 if (!native_part::get_part_str_for_table(from2, partition_info_str))
11831 return HA_ERR_TABLE_CORRUPT;
11832
11833 if (partition_info_str.empty()) return rename_non_partitioned_table(from2, to2);
11834 return rename_partitioned_table(from2, to2, partition_info_str);
11835 }
11836
11837 /**
11838 check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
11839 if new and old definition are compatible
11840
11841 @details If there are no other explicit signs like changed number of
11842 fields this function will be called by compare_tables()
11843 (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm
11844 file.
11845
11846 */
11847
bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
                                            uint table_changes) {
  DBUG_ENTER_FUNC();

  assert(info != nullptr);

  // this function is needed only for online alter-table
  // Always report the definitions as incompatible; per the function comment
  // above, this steers compare_tables() toward a full table rewrite rather
  // than an .frm-only change.
  DBUG_RETURN(COMPATIBLE_DATA_NO);
}
11857
11858 /**
11859 @return
11860 HA_EXIT_SUCCESS OK
11861 */
11862 int ha_rocksdb::extra(enum ha_extra_function operation) {
11863 DBUG_ENTER_FUNC();
11864
11865 switch (operation) {
11866 case HA_EXTRA_KEYREAD:
11867 m_keyread_only = true;
11868 break;
11869 case HA_EXTRA_NO_KEYREAD:
11870 m_keyread_only = false;
11871 break;
11872 case HA_EXTRA_FLUSH:
11873 /*
11874 If the table has blobs, then they are part of m_retrieved_record.
11875 This call invalidates them.
11876 */
11877 m_retrieved_record.Reset();
11878 break;
11879 case HA_EXTRA_INSERT_WITH_UPDATE:
11880 // INSERT ON DUPLICATE KEY UPDATE
11881 if (rocksdb_enable_insert_with_update_caching) {
11882 m_insert_with_update = true;
11883 }
11884 break;
11885 case HA_EXTRA_NO_IGNORE_DUP_KEY:
11886 // PAIRED with HA_EXTRA_INSERT_WITH_UPDATE or HA_EXTRA_WRITE_CAN_REPLACE
11887 // that indicates the end of REPLACE / INSERT ON DUPLICATE KEY
11888 m_insert_with_update = false;
11889 break;
11890
11891 default:
11892 break;
11893 }
11894
11895 DBUG_RETURN(HA_EXIT_SUCCESS);
11896 }
11897
11898 /*
11899 Given a starting key and an ending key, estimate the number of rows that
11900 will exist between the two keys.
11901 */
ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
                                     key_range *const max_key) {
  DBUG_ENTER_FUNC();

  // Session-variable override: a non-zero value short-circuits estimation.
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  // A separate override applies when the query uses FORCE INDEX.
  if (table->force_index) {
    const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  // Prefer the measured on-disk size, fall back to logical data size, and
  // finally to a fixed assumed per-row size when no stats exist yet.
  auto disk_size = kd.m_stats.m_actual_disk_size;
  if (disk_size == 0) disk_size = kd.m_stats.m_data_size;
  auto rows = kd.m_stats.m_rows;
  if (rows == 0 || disk_size == 0) {
    rows = 1;
    disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }
  ulonglong total_size = 0;
  ulonglong total_row = 0;
  records_in_range_internal(inx, min_key, max_key, disk_size, rows, &total_size,
                            &total_row);
  ret = total_row;
  /*
    GetApproximateSizes() gives estimates so ret might exceed stats.records.
    MySQL then decides to use full index scan rather than range scan, which
    is not efficient for most cases.
    To prevent this, changing estimated records slightly smaller than
    stats.records.
  */
  if (ret >= stats.records) {
    ret = stats.records * 0.99;
  }

  // Debug override; otherwise never report 0 rows, which could make the
  // optimizer discard a usable range.
  if (rocksdb_debug_optimizer_n_rows > 0) {
    ret = rocksdb_debug_optimizer_n_rows;
  } else if (ret == 0) {
    ret = 1;
  }

  DBUG_RETURN(ret);
}
11950
void ha_rocksdb::records_in_range_internal(uint inx, key_range *const min_key,
                                           key_range *const max_key,
                                           int64 disk_size, int64 rows,
                                           ulonglong *total_size,
                                           ulonglong *row_count) {
  DBUG_ENTER_FUNC();

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  // Pack the lower bound of the range into m_sk_packed_tuple (or use the
  // index infimum when the range is open on the left).
  uint size1 = 0;
  if (min_key) {
    size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                min_key->key, min_key->keypart_map);
    // For flags that exclude the bound itself, advance to the successor key.
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd.get_infimum_key(m_sk_packed_tuple, &size1);
  }

  // Pack the upper bound into m_sk_packed_tuple_old (or use the supremum
  // when the range is open on the right).
  uint size2 = 0;
  if (max_key) {
    size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple_old, size2);
    }
  } else {
    kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
  const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);

  // It's possible to get slice1 == slice2 for a non-inclusive range with the
  // right bound being successor() of the left one, e.g. "t.key>10 AND t.key<11"
  if (slice1.compare(slice2) >= 0) {
    // It's not possible to get slice2 > slice1
    assert(slice1.compare(slice2) == 0);
    DBUG_VOID_RETURN;
  }

  // In a reverse column family the physical key order is inverted, so the
  // bounds must be swapped when building the RocksDB range.
  rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
                   kd.m_is_reverse_cf ? slice1 : slice2);

  uint64_t sz = 0;

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
  // Getting statistics, including from Memtables
  uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
  rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
  // Scale the known row count by the fraction of the index's bytes that
  // fall inside the range.
  *row_count = rows * ((double)sz / (double)disk_size);
  *total_size = sz;
  uint64_t memTableCount;
  rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
  *row_count += memTableCount;
  *total_size += sz;
  DBUG_VOID_RETURN;
}
12015
12016 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
12017 DBUG_ENTER_FUNC();
12018
12019 assert(create_info != nullptr);
12020
12021 if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
12022 create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
12023 }
12024
12025 DBUG_VOID_RETURN;
12026 }
12027
12028 /**
12029 @brief
12030 Doing manual compaction on OPTIMIZE TABLE in RocksDB.
12031 Compaction itself is executed by background thread in RocksDB, but
12032 CompactRange() waits until compaction completes so this function
12033 may take a long time.
12034 Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE
12035 triggers manual compaction for all indexes of the table.
12036 @details
12037 Compaction range is from the beginning of the index id to
12038 the first row of the next index id. When using reverse order
12039 column family, the first row of the next index id should be
12040 the last row of the previous index id.
12041
12042 @return
12043 HA_ADMIN_OK OK
12044 other HA_ADMIN error code
12045 */
12046 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
12047 DBUG_ENTER_FUNC();
12048
12049 assert(thd != nullptr);
12050 assert(check_opt != nullptr);
12051
12052 for (uint i = 0; i < table->s->keys; i++) {
12053 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12054 auto range = get_range(i, buf);
12055 const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(),
12056 m_key_descr_arr[i]->get_cf(),
12057 &range.start, &range.limit);
12058 if (!s.ok()) {
12059 DBUG_RETURN(rdb_error_to_mysql(s));
12060 }
12061 }
12062
12063 DBUG_RETURN(HA_EXIT_SUCCESS);
12064 }
12065
12066 static void init_stats(
12067 const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12068 &to_recalc,
12069 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats) {
12070 for (const auto &it : to_recalc) {
12071 const GL_INDEX_ID index_id = it.first;
12072 auto &kd = it.second;
12073
12074 (*stats).emplace(index_id, Rdb_index_stats(index_id));
12075 assert(kd->get_key_parts() > 0);
12076 (*stats)[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts());
12077 }
12078 }
12079
12080 /**
12081 Calculate the following index stats for all indexes of a table:
12082 number of rows, file size, and cardinality. It adopts an index
12083 scan approach using rocksdb::Iterator. Sampling is used to
12084 accelerate the scan.
12085 **/
static int calculate_cardinality_table_scan(
    const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
        &to_recalc,
    std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats,
    table_cardinality_scan_type scan_type, uint64_t max_num_rows_scanned,
    THD::killed_state volatile *killed) {
  DBUG_ENTER_FUNC();

  assert(scan_type != SCAN_TYPE_NONE);
  init_stats(to_recalc, stats);

  auto read_opts = rocksdb::ReadOptions();
  read_opts.fill_cache = false;
  if (scan_type == SCAN_TYPE_MEMTABLE_ONLY) {
    // Cheap pass: restrict reads to the memtable tier only.
    read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
  } else {
    // Full scan must ignore any prefix extractor to walk the entire index.
    read_opts.total_order_seek = true;
  }

  Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);

  for (const auto &it_kd : to_recalc) {
    const GL_INDEX_ID index_id = it_kd.first;

    if (!ddl_manager.safe_find(index_id)) {
      // If index id is not in ddl manager, then it has been dropped.
      // Skip scanning index
      continue;
    }

    const std::shared_ptr<const Rdb_key_def> &kd = it_kd.second;
    assert(index_id == kd->get_gl_index_id());
    Rdb_index_stats &stat = (*stats)[kd->get_gl_index_id()];

    // Estimate how much of this index currently lives in the memtables.
    uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
    auto r = myrocks::get_range(*kd, r_buf);
    uint64_t memtableCount;
    uint64_t memtableSize;
    rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount,
                                     &memtableSize);

    if (scan_type == SCAN_TYPE_MEMTABLE_ONLY &&
        memtableCount < (uint64_t)stat.m_rows / 10) {
      // skip tables that already have enough stats from SST files to reduce
      // overhead and avoid degradation of big tables stats by sampling from
      // relatively tiny (less than 10% of full data set) memtable dataset
      continue;
    }

    // Set memtable count to row count
    stat.m_rows = memtableCount;

    if (scan_type == SCAN_TYPE_FULL_TABLE) {
      // Set memtable size to file size
      stat.m_actual_disk_size = memtableSize;
    }

    std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(
        rdb->NewIterator(read_opts, kd->get_cf()));
    rocksdb::Slice first_index_key((const char *)r_buf,
                                   Rdb_key_def::INDEX_NUMBER_SIZE);

    // Reset m_last_key for new index
    cardinality_collector.Reset();
    uint64_t rows_scanned = 0ul;
    for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
      // Honor a KILL issued against the requesting connection mid-scan.
      if (killed && *killed) {
        // NO_LINT_DEBUG
        sql_print_information(
            "Index stats calculation for index %s with id (%u,%u) is "
            "terminated",
            kd->get_name().c_str(), stat.m_gl_index_id.cf_id,
            stat.m_gl_index_id.index_id);
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      const rocksdb::Slice key = it->key();

      // Stop at the configured row budget (full scans only) or when the key
      // no longer belongs to this index.
      if ((scan_type == SCAN_TYPE_FULL_TABLE && max_num_rows_scanned > 0 &&
           rows_scanned >= max_num_rows_scanned) ||
          !kd->covers_key(key)) {
        break;  // end of this index
      }

      cardinality_collector.ProcessKey(key, kd.get(), &stat);
      rows_scanned++;
    }

    cardinality_collector.Reset(); /* reset m_last_key for each key definition */
    cardinality_collector.SetCardinality(&stat);
    cardinality_collector.AdjustStats(&stat);

    // Debug-only sync point used by tests to interleave stats calculation
    // with a concurrent index drop.
    DBUG_EXECUTE_IF("rocksdb_calculate_stats", {
      if (kd->get_name() == "secondary_key") {
        THD *thd = new THD();
        thd->thread_stack = reinterpret_cast<char *>(&thd);
        thd->store_globals();

        static constexpr char act[] =
            "now signal ready_to_drop_index wait_for ready_to_save_index_stats";
        assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));

        thd->restore_globals();
        delete thd;
      }
    });
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12196
12197 static void reset_cardinality(
12198 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats) {
12199 for (auto &src : *stats) {
12200 Rdb_index_stats &stat = src.second;
12201 stat.reset_cardinality();
12202 }
12203 }
12204
12205 static void merge_stats(
12206 const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12207 &to_recalc,
12208 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats,
12209 const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &card_stats) {
12210 assert(stats->size() == card_stats.size());
12211
12212 for (auto &src : *stats) {
12213 auto index_id = src.first;
12214 Rdb_index_stats &stat = src.second;
12215 auto it = card_stats.find(index_id);
12216 assert(it != card_stats.end());
12217
12218 auto it_index = to_recalc.find(index_id);
12219 assert(it_index != to_recalc.end());
12220 stat.merge(it->second, true, it_index->second->max_storage_fmt_length());
12221 }
12222 }
12223
12224 static void adjust_cardinality(
12225 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats,
12226 table_cardinality_scan_type scan_type, uint64_t max_num_rows_scanned) {
12227 assert(scan_type == SCAN_TYPE_FULL_TABLE);
12228 assert(max_num_rows_scanned > 0);
12229
12230 for (auto &src : *stats) {
12231 Rdb_index_stats &stat = src.second;
12232 if ((uint64_t)stat.m_rows > max_num_rows_scanned) {
12233 stat.adjust_cardinality(stat.m_rows / max_num_rows_scanned);
12234 }
12235 #ifndef NDEBUG
12236 for (size_t i = 0; i < stat.m_distinct_keys_per_prefix.size(); i++) {
12237 assert(stat.m_distinct_keys_per_prefix[i] <= stat.m_rows);
12238 }
12239 #endif
12240 }
12241 }
12242
12243 static int read_stats_from_ssts(
12244 const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12245 &to_recalc,
12246 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> *stats) {
12247 DBUG_ENTER_FUNC();
12248
12249 init_stats(to_recalc, stats);
12250
12251 // find per column family key ranges which need to be queried
12252 std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
12253 ranges;
12254 std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);
12255
12256 uchar *bufp = buf.data();
12257 for (const auto &it : to_recalc) {
12258 auto &kd = it.second;
12259 ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp));
12260 bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE;
12261 }
12262
12263 // get RocksDB table properties for these ranges
12264 rocksdb::TablePropertiesCollection props;
12265 for (const auto &it : ranges) {
12266 const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
12267 const auto status = rdb->GetPropertiesOfTablesInRange(
12268 it.first, &it.second[0], it.second.size(), &props);
12269 assert(props.size() >= old_size);
12270 if (!status.ok()) {
12271 DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql(
12272 status, "Could not access RocksDB properties"));
12273 }
12274 }
12275
12276 int num_sst = 0;
12277 for (const auto &it : props) {
12278 std::vector<Rdb_index_stats> sst_stats;
12279 Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
12280 /*
12281 sst_stats is a list of index statistics for indexes that have entries
12282 in the current SST file.
12283 */
12284 for (const auto &it1 : sst_stats) {
12285 /*
12286 Only update statistics for indexes that belong to this SQL table.
12287
12288 The reason is: We are walking through all SST files that have
12289 entries from this table (and so can compute good statistics). For
12290 other SQL tables, it can be that we're only seeing a small fraction
12291 of table's entries (and so we can't update statistics based on that).
12292 */
12293 if (stats->find(it1.m_gl_index_id) == stats->end()) {
12294 continue;
12295 }
12296
12297 auto it_index = to_recalc.find(it1.m_gl_index_id);
12298 assert(it_index != to_recalc.end());
12299 if (it_index == to_recalc.end()) {
12300 continue;
12301 }
12302
12303 (*stats)[it1.m_gl_index_id].merge(
12304 it1, true, it_index->second->max_storage_fmt_length());
12305 }
12306 num_sst++;
12307 }
12308
12309 DBUG_RETURN(HA_EXIT_SUCCESS);
12310 }
12311
12312 static int calculate_stats(
12313 const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12314 &to_recalc,
12315 table_cardinality_scan_type scan_type, THD::killed_state volatile *killed) {
12316 DBUG_ENTER_FUNC();
12317
12318 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
12319 int ret = read_stats_from_ssts(to_recalc, &stats);
12320 if (ret != HA_EXIT_SUCCESS) {
12321 DBUG_RETURN(ret);
12322 }
12323
12324 if (scan_type != SCAN_TYPE_NONE) {
12325 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> card_stats;
12326 uint64_t max_num_rows_scanned = rocksdb_table_stats_max_num_rows_scanned;
12327 ret = calculate_cardinality_table_scan(to_recalc, &card_stats, scan_type,
12328 max_num_rows_scanned, killed);
12329 if (ret != HA_EXIT_SUCCESS) {
12330 DBUG_RETURN(ret);
12331 }
12332
12333 if (scan_type == SCAN_TYPE_FULL_TABLE) {
12334 reset_cardinality(&stats);
12335 }
12336
12337 merge_stats(to_recalc, &stats, card_stats);
12338 if (scan_type == SCAN_TYPE_FULL_TABLE && max_num_rows_scanned > 0) {
12339 adjust_cardinality(&stats, scan_type, max_num_rows_scanned);
12340 }
12341 }
12342
12343 // set and persist new stats
12344 ddl_manager.set_stats(stats);
12345 ddl_manager.persist_stats(true);
12346
12347 DBUG_RETURN(HA_EXIT_SUCCESS);
12348 }
12349
static int calculate_stats_for_table(
    const std::string &tbl_name, table_cardinality_scan_type scan_type,
    THD::killed_state volatile *killed = nullptr) {
  DBUG_ENTER_FUNC();
  // Collect the key definitions of every index of the table that is still
  // known to the DDL manager (indexes dropped meanwhile are skipped).
  std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>> to_recalc;
  std::vector<GL_INDEX_ID> indexes;
  ddl_manager.find_indexes(tbl_name, &indexes);

  for (const auto &index : indexes) {
    std::shared_ptr<const Rdb_key_def> keydef = ddl_manager.safe_find(index);

    if (keydef) {
      to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef));
    }
  }

  if (to_recalc.empty()) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Debug-only sync point letting tests drop the table while the background
  // stats thread is about to process it.
  DBUG_EXECUTE_IF("rocksdb_is_bg_thread_drop_table", {
    if (tbl_name == "test.t") {
      THD *thd = new THD();
      thd->thread_stack = reinterpret_cast<char *>(&thd);
      thd->store_globals();

      static constexpr char act[] = "now signal ready_to_drop_table";
      assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));

      thd->restore_globals();
      delete thd;
    }
  });

  int err = calculate_stats(to_recalc, scan_type, killed);
  if (err != HA_EXIT_SUCCESS) {
    DBUG_RETURN(err);
  }

  // Second half of the same debug scenario: wait until the test is ready
  // for the stats to be saved.
  DBUG_EXECUTE_IF("rocksdb_is_bg_thread_drop_table", {
    if (tbl_name == "test.t") {
      THD *thd = new THD();
      thd->thread_stack = reinterpret_cast<char *>(&thd);
      thd->store_globals();

      static constexpr char act[] = "now wait_for ready_to_save_table_stats";
      assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));

      thd->restore_globals();
      delete thd;
    }
  });

  if (scan_type == SCAN_TYPE_FULL_TABLE) {
    // Save table stats including number of rows
    // and modified counter
    ddl_manager.set_table_stats(tbl_name);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12411
12412 /*
12413 @return
12414 HA_ADMIN_OK OK
12415 other HA_ADMIN error code
12416 */
12417 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
12418 DBUG_ENTER_FUNC();
12419
12420 if (table) {
12421 table_cardinality_scan_type scan_type = rocksdb_table_stats_use_table_scan
12422 ? SCAN_TYPE_FULL_TABLE
12423 : SCAN_TYPE_MEMTABLE_ONLY;
12424
12425 if (calculate_stats_for_table(m_tbl_def->full_tablename(), scan_type,
12426 &(thd->killed)) != HA_EXIT_SUCCESS) {
12427 DBUG_RETURN(HA_ADMIN_FAILED);
12428 }
12429 }
12430
12431 DBUG_RETURN(HA_ADMIN_OK);
12432 }
12433
int ha_rocksdb::adjust_handler_stats_sst_and_memtable() {
  DBUG_ENTER_FUNC();

  /*
    If any stats are negative due to bad cached stats, re-run analyze table
    and re-retrieve the stats.
  */
  if (static_cast<longlong>(stats.data_file_length) < 0 ||
      static_cast<longlong>(stats.index_file_length) < 0 ||
      static_cast<longlong>(stats.records) < 0) {
    if (calculate_stats_for_table(m_tbl_def->full_tablename(),
                                  SCAN_TYPE_NONE)) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    update_stats();
  }

  // if number of records is hardcoded, we do not want to force computation
  // of memtable cardinalities
  if (stats.records == 0 || (rocksdb_force_compute_memtable_stats &&
                             rocksdb_debug_optimizer_n_rows == 0)) {
    // First, compute SST files stats
    uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
    auto r = get_range(pk_index(table, m_tbl_def), buf);
    uint64_t sz = 0;

    uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;

    // recompute SST files stats only if records count is 0
    if (stats.records == 0) {
      // Derive a row estimate from the approximate byte size of the primary
      // key range, assuming a fixed per-row disk footprint.
      rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz, include_flags);
      stats.records += sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
      stats.data_file_length += sz;
    }

    // Second, compute memtable stats. This call is expensive, so cache
    // values computed for some time.
    uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime;
    uint64_t time = (cachetime == 0) ? 0 : my_micro_time();
    if (cachetime == 0 ||
        time > m_table_handler->m_mtcache_last_update + cachetime) {
      uint64_t memtableCount;
      uint64_t memtableSize;

      // The stats below are calculated from a skiplist, which is a
      // probabilistic data structure, so the results vary between test runs.
      // It can also return 0 for quite large tables, which means the
      // cardinality for memtable-only indexes will be reported as 0.

      rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r, &memtableCount,
                                       &memtableSize);

      // Atomically update all of these fields at the same time
      // The counter acts as a reader/writer flag: only the thread that
      // raises it from 0 writes the cache, so concurrent updaters cannot
      // interleave partial writes.
      if (cachetime > 0) {
        if (m_table_handler->m_mtcache_lock.fetch_add(
                1, std::memory_order_acquire) == 0) {
          m_table_handler->m_mtcache_count = memtableCount;
          m_table_handler->m_mtcache_size = memtableSize;
          m_table_handler->m_mtcache_last_update = time;
        }
        m_table_handler->m_mtcache_lock.fetch_sub(1, std::memory_order_release);
      }

      stats.records += memtableCount;
      stats.data_file_length += memtableSize;
    } else {
      // Cached data is still valid, so use it instead
      stats.records += m_table_handler->m_mtcache_count;
      stats.data_file_length += m_table_handler->m_mtcache_size;
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12509
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");

  // An offset larger than the increment cannot produce a valid sequence;
  // fall back to an offset of 1 (mirrors the SQL layer's handling of
  // auto_increment_offset > auto_increment_increment).
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  // The auto-increment column is the first key part of the "next number"
  // index; its type determines the maximum storable value.
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    assert(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use CAS operation in a loop to make sure atomically get the next auto
    // increment value while ensuring that we don't wrap around to a negative
    // number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      // compare_exchange_weak updates new_val with the current value on
      // failure, so the loop simply retries with the fresh value.
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    if (last_val > max_val) {
      // Already past the column's maximum; hand back the sentinel so the SQL
      // layer reports the failure.
      new_val = std::numeric_limits<ulonglong>::max();
    } else {
      // Loop until we can correctly update the atomic value
      do {
        assert(last_val > 0);
        // Calculate the next value in the auto increment series: offset
        // + N * increment where N is 0, 1, 2, ...
        //
        // For further information please visit:
        // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
        //
        // The following is confusing so here is an explanation:
        // To get the next number in the sequence above you subtract out the
        // offset, calculate the next sequence (N * increment) and then add the
        // offset back in.
        //
        // The additions are rearranged to avoid overflow.  The following is
        // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
        // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
        //
        // (a+b)/c
        // = (a - a%c + a%c + b - b%c + b%c) / c
        // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
        // = a/c + b/c + (a%c + b%c) / c
        //
        // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
        // following statement.
        ulonglong n =
            (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

        // Check if n * inc + off will overflow. This can only happen if we have
        // an UNSIGNED BIGINT field.
        if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
          assert(max_val == std::numeric_limits<ulonglong>::max());
          // The 'last_val' value is already equal to or larger than the largest
          // value in the sequence.  Continuing would wrap around (technically
          // the behavior would be undefined).  What should we do?
          // We could:
          //   1) set the new value to the last possible number in our sequence
          //      as described above.  The problem with this is that this
          //      number could be smaller than a value in an existing row.
          //   2) set the new value to the largest possible number.  This number
          //      may not be in our sequence, but it is guaranteed to be equal
          //      to or larger than any other value already inserted.
          //
          //  For now I'm going to take option 2.
          //
          //  Returning ULLONG_MAX from get_auto_increment will cause the SQL
          //  layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
          //  the SE API for get_auto_increment, inserts will fail with
          //  ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
          //  inserts will fail with ER_DUP_ENTRY for other types (or no failure
          //  if the column is in a non-unique SK).
          new_val = std::numeric_limits<ulonglong>::max();
          auto_incr = new_val;  // Store the largest value into auto_incr
          break;
        }

        new_val = n * inc + off;

        // Attempt to store the new value (plus 1 since m_auto_incr_val contains
        // the next available value) into the atomic value.  If the current
        // value no longer matches what we have in 'last_val' this will fail and
        // we will repeat the loop (`last_val` will automatically get updated
        // with the current value).
        //
        // See above explanation for inc == 1 for why we use std::min.
      } while (!auto_incr.compare_exchange_weak(
          last_val, std::min(new_val + 1, max_val)));
    }
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
12646
12647 #ifndef NDEBUG
12648
12649 /* Debugger help function */
12650 static char dbug_item_print_buf[512];
12651
12652 const char *dbug_print_item(Item *const item) {
12653 char *const buf = dbug_item_print_buf;
12654 String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
12655 str.length(0);
12656 if (!item) return "(Item*)nullptr";
12657 item->print(&str, QT_ORDINARY);
12658 if (str.c_ptr() == buf) {
12659 return buf;
12660 } else {
12661 return "Couldn't fit into buffer";
12662 }
12663 }
12664
12665 #endif /*NDEBUG*/
12666
12667 /**
12668 SQL layer calls this function to push an index condition.
12669
12670 @details
12671 The condition is for index keyno (only one condition can be pushed at a
12672 time).
12673 The caller guarantees that condition refers only to index fields; besides
12674 that, fields must have
12675
12676 $field->part_of_key.set_bit(keyno)
12677
12678 which means that
12679
12680 (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
12681
12682 which means that field value can be restored from the index tuple.
12683
12684 @return
12685 Part of condition we couldn't check (always nullptr).
12686 */
12687
12688 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
12689 DBUG_ENTER_FUNC();
12690
12691 assert(keyno != MAX_KEY);
12692 assert(idx_cond != nullptr);
12693
12694 pushed_idx_cond = idx_cond;
12695 pushed_idx_cond_keyno = keyno;
12696 in_range_check_pushed_down = true;
12697
12698 /* We will check the whole condition */
12699 DBUG_RETURN(nullptr);
12700 }
12701
12702 /*
12703 @brief
12704 Check the index condition.
12705
12706 @detail
12707 Check the index condition. (The caller has unpacked all needed index
12708 columns into table->record[0])
12709
12710 @return
12711 ICP_NO_MATCH - Condition not satisfied (caller should continue
12712 scanning)
12713 OUT_OF_RANGE - We've left the range we're scanning (caller should
12714 stop scanning and return HA_ERR_END_OF_FILE)
12715
12716 ICP_MATCH - Condition is satisfied (caller should fetch the record
12717 and return it)
12718 */
12719
12720 enum icp_result ha_rocksdb::check_index_cond() const {
12721 assert(pushed_idx_cond);
12722 assert(pushed_idx_cond_keyno != MAX_KEY);
12723
12724 if (end_range && compare_key_icp(end_range) > 0) {
12725 /* caller should return HA_ERR_END_OF_FILE already */
12726 return ICP_OUT_OF_RANGE;
12727 }
12728
12729 return pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
12730 }
12731
12732 /*
12733 Checks if inplace alter is supported for a given operation.
12734 */
12735
12736 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
12737 TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
12738 DBUG_ENTER_FUNC();
12739
12740 assert(ha_alter_info != nullptr);
12741
12742 if (ha_alter_info->handler_flags &
12743 ~(my_core::Alter_inplace_info::DROP_INDEX |
12744 my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
12745 my_core::Alter_inplace_info::ADD_INDEX |
12746 my_core::Alter_inplace_info::ADD_UNIQUE_INDEX |
12747 my_core::Alter_inplace_info::CHANGE_CREATE_OPTION |
12748 (rocksdb_alter_column_default_inplace
12749 ? my_core::Alter_inplace_info::ALTER_COLUMN_DEFAULT
12750 : 0))) {
12751 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12752 }
12753
12754 /* We don't support unique keys on table w/ no primary keys */
12755 if ((ha_alter_info->handler_flags &
12756 my_core::Alter_inplace_info::ADD_UNIQUE_INDEX) &&
12757 has_hidden_pk(altered_table)) {
12758 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12759 }
12760
12761 /* We only support changing auto_increment for table options. */
12762 if ((ha_alter_info->handler_flags &
12763 my_core::Alter_inplace_info::CHANGE_CREATE_OPTION) &&
12764 !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) {
12765 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12766 }
12767
12768 DBUG_RETURN(my_core::HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
12769 }
12770
12771 /**
12772 Allows the storage engine to update internal structures with concurrent
12773 writes blocked. If check_if_supported_inplace_alter() returns
12774 HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE or
12775 HA_ALTER_INPLACE_SHARED_AFTER_PREPARE, this function is called with
12776 exclusive lock otherwise the same level of locking as for
12777 inplace_alter_table() will be used.
12778
12779 @note Storage engines are responsible for reporting any errors by
12780 calling my_error()/print_error()
12781
12782 @note If this function reports error, commit_inplace_alter_table()
12783 will be called with commit= false.
12784
12785 @note For partitioning, failing to prepare one partition, means that
12786 commit_inplace_alter_table() will be called to roll back changes for
12787 all partitions. This means that commit_inplace_alter_table() might be
12788 called without prepare_inplace_alter_table() having been called first
12789 for a given partition.
12790
12791 @param altered_table TABLE object for new version of table.
12792 @param ha_alter_info Structure describing changes to be done
12793 by ALTER TABLE and holding data used
12794 during in-place alter.
12795
12796 @retval true Error
12797 @retval false Success
12798 */
bool ha_rocksdb::prepare_inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  assert(altered_table != nullptr);
  assert(ha_alter_info != nullptr);

  // Ownership note: new_tdef/new_key_descr are built here and handed to the
  // Rdb_inplace_alter_ctx at the end; commit_inplace_alter_table() is
  // responsible for installing or deleting them.
  Rdb_tbl_def *new_tdef = nullptr;
  std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr;
  std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
  uint old_n_keys = m_tbl_def->m_key_count;
  uint new_n_keys = altered_table->s->keys;
  std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
  std::unordered_set<GL_INDEX_ID> dropped_index_ids;
  uint n_dropped_keys = 0;
  uint n_added_keys = 0;
  ulonglong max_auto_incr = 0;

  if (ha_alter_info->handler_flags &
      (my_core::Alter_inplace_info::DROP_INDEX |
       my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
       my_core::Alter_inplace_info::ADD_INDEX |
       my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
    // The hidden PK is not counted in altered_table->s->keys, so account
    // for it explicitly.
    if (has_hidden_pk(altered_table)) {
      new_n_keys += 1;
    }

    const TABLE *const old_table = table;
    old_key_descr = m_tbl_def->m_key_descr_arr;
    new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];

    new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
    new_tdef->m_key_descr_arr = new_key_descr;
    new_tdef->m_key_count = new_n_keys;
    // Carry the current auto-increment / hidden-pk counters over to the new
    // table definition.
    new_tdef->m_auto_incr_val =
        m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
    new_tdef->m_hidden_pk_val =
        m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);

    if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
      /* Delete the new key descriptors */
      delete[] new_key_descr;

      /*
        Explicitly mark as nullptr so we don't accidentally remove entries
        from data dictionary on cleanup (or cause double delete[]).
      */
      new_tdef->m_key_descr_arr = nullptr;
      delete new_tdef;

      my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    uint i;
    uint j;

    /* Determine which(if any) key definition(s) need to be dropped */
    for (i = 0; i < ha_alter_info->index_drop_count; i++) {
      const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i];
      for (j = 0; j < old_n_keys; j++) {
        const KEY *const old_key =
            &old_table->key_info[old_key_descr[j]->get_keyno()];

        // compare_keys() returns 0 when the two definitions match.
        if (!compare_keys(old_key, dropped_key)) {
          dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
          break;
        }
      }
    }

    /* Determine which(if any) key definitions(s) need to be added */
    int identical_indexes_found = 0;
    for (i = 0; i < ha_alter_info->index_add_count; i++) {
      const KEY *const added_key =
          &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
      for (j = 0; j < new_n_keys; j++) {
        const KEY *const new_key =
            &altered_table->key_info[new_key_descr[j]->get_keyno()];
        if (!compare_keys(new_key, added_key)) {
          /*
            Check for cases where an 'identical' index is being dropped and
            re-added in a single ALTER statement.  Turn this into a no-op as the
            index has not changed.

            E.G. Unique index -> non-unique index requires no change

            Note that cases where the index name remains the same but the
            key-parts are changed is already handled in create_inplace_key_defs.
            In these cases the index needs to be rebuilt.
          */
          if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) {
            dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id());
            identical_indexes_found++;
          } else {
            added_indexes.insert(new_key_descr[j]);
          }

          break;
        }
      }
    }

    // Drop/add counts net out the indexes that turned out to be identical.
    n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found;
    n_added_keys = ha_alter_info->index_add_count - identical_indexes_found;
    assert(dropped_index_ids.size() == n_dropped_keys);
    assert(added_indexes.size() == n_added_keys);
    assert(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys));
  }
  if (ha_alter_info->handler_flags &
      my_core::Alter_inplace_info::CHANGE_CREATE_OPTION) {
    // No index changes requested: reuse the existing table definition.
    if (!new_tdef) {
      new_tdef = m_tbl_def;
    }
    if (table->found_next_number_field) {
      max_auto_incr = load_auto_incr_value_from_index();
    }
  }

  // The ctx takes over everything built above; it is consumed by
  // inplace_alter_table() / commit_inplace_alter_table().
  ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
      new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
      added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys,
      max_auto_incr);
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12925
12926 /**
12927 Alter the table structure in-place with operations specified using
12928 HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
12929 during this operation depends on the return value from
12930 check_if_supported_inplace_alter().
12931
12932 @note Storage engines are responsible for reporting any errors by
12933 calling my_error()/print_error()
12934
12935 @note If this function reports error, commit_inplace_alter_table()
12936 will be called with commit= false.
12937
12938 @param altered_table TABLE object for new version of table.
12939 @param ha_alter_info Structure describing changes to be done
12940 by ALTER TABLE and holding data used
12941 during in-place alter.
12942
12943 @retval true Error
12944 @retval false Success
12945 */
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  assert(altered_table != nullptr);
  assert(ha_alter_info != nullptr);
  assert(ha_alter_info->handler_ctx != nullptr);

  // ctx was created by prepare_inplace_alter_table().
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  if (ha_alter_info->handler_flags &
      (my_core::Alter_inplace_info::ADD_INDEX |
       my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    assert(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 my_core::Alter_inplace_info::ADD_UNIQUE_INDEX))) {
      my_error(ER_OUT_OF_RESOURCES, MYF(0));
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  // Test hook: simulate a failure after SK population to exercise rollback.
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12995
12996 /**
12997 Scan the Primary Key index entries and populate the new secondary keys.
12998 */
12999 int ha_rocksdb::inplace_populate_sk(
13000 TABLE *const new_table_arg,
13001 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
13002 DBUG_ENTER_FUNC();
13003 int res = HA_EXIT_SUCCESS;
13004 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13005 rocksdb::WriteBatch *const batch = wb.get();
13006
13007 DBUG_EXECUTE_IF("rocksdb_inplace_populate_sk", {
13008 static constexpr char act[] =
13009 "now signal ready_to_mark_cf_dropped_in_populate_sk "
13010 "wait_for mark_cf_dropped_done_in_populate_sk";
13011 assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
13012 });
13013
13014 {
13015 std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
13016 for (const auto &kd : indexes) {
13017 const std::string cf_name = kd->get_cf()->GetName();
13018 std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
13019 cf_manager.get_cf(cf_name);
13020
13021 if (!cfh || cfh != kd->get_shared_cf()) {
13022 // The CF has been dropped, i.e., cf_manager.remove_dropped_cf() has
13023 // been called.
13024 DBUG_RETURN(HA_EXIT_FAILURE);
13025 }
13026
13027 uint32 cf_id = cfh->GetID();
13028 if (dict_manager.get_dropped_cf(cf_id)) {
13029 DBUG_RETURN(HA_EXIT_FAILURE);
13030 }
13031 }
13032
13033 /* Update the data dictionary */
13034 std::unordered_set<GL_INDEX_ID> create_index_ids;
13035 for (const auto &index : indexes) {
13036 create_index_ids.insert(index->get_gl_index_id());
13037 }
13038 dict_manager.add_create_index(create_index_ids, batch);
13039 res = dict_manager.commit(batch);
13040 if (res != HA_EXIT_SUCCESS) {
13041 return res;
13042 }
13043
13044 /*
13045 Add uncommitted key definitons to ddl_manager. We need to do this
13046 so that the property collector can find this keydef when it needs to
13047 update stats. The property collector looks for the keydef in the
13048 data dictionary, but it won't be there yet since this key definition
13049 is still in the creation process.
13050 */
13051 ddl_manager.add_uncommitted_keydefs(indexes);
13052 }
13053
13054 const bool hidden_pk_exists = has_hidden_pk(table);
13055
13056 Rdb_transaction *tx = get_or_create_tx(table->in_use);
13057
13058 /*
13059 There is one specific scenario where m_sst_info may not be nullptr. This
13060 happens if the handler we're using happens to be the handler where the PK
13061 bulk load was done on. The sequence of events that lead to this is as
13062 follows (T1 is PK bulk load, T2 is SK alter table):
13063
13064 T1: Execute last INSERT statement
13065 T1: Return TABLE and handler object back to Table_cache_manager
13066 T1: Close connection
13067 T2: Execute ALTER statement
13068 T2: Take same TABLE/handler from Table_cache_manager
13069 T2: Call closefrm which will call finalize_bulk_load on every other open
13070 table/handler *except* the one it's on.
13071 T2: Acquire stale snapshot of PK
13072 T1: Call finalize_bulk_load
13073
13074 This is rare because usually, closefrm will call the destructor (and thus
13075 finalize_bulk_load) on the handler where PK bulk load is done. However, if
13076 the thread ids of the bulk load thread and the alter thread differ by a
13077 multiple of table_cache_instances (8 by default), then they hash to the
13078 same bucket in Table_cache_manager and the alter thread will not not call
13079 the destructor on the handler it is holding. Thus, its m_sst_info will not
13080 be nullptr.
13081
13082 At this point, it is safe to refresh the snapshot because we know all other
13083 open handlers have been closed at this point, and the one we're on is the
13084 only one left.
13085 */
13086 if (m_sst_info) {
13087 if ((res = finalize_bulk_load())) {
13088 DBUG_RETURN(res);
13089 }
13090 tx->commit();
13091 }
13092
13093 const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
13094 const ulonglong rdb_merge_combine_read_size =
13095 THDVAR(ha_thd(), merge_combine_read_size);
13096 const ulonglong rdb_merge_tmp_file_removal_delay =
13097 THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
13098
13099 for (const auto &index : indexes) {
13100 bool is_unique_index =
13101 new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
13102
13103 Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size,
13104 rdb_merge_combine_read_size,
13105 rdb_merge_tmp_file_removal_delay,
13106 index->get_cf());
13107
13108 if ((res = rdb_merge.init())) {
13109 DBUG_RETURN(res);
13110 }
13111
13112 /*
13113 Note: We pass in the currently existing table + tbl_def object here,
13114 as the pk index position may have changed in the case of hidden primary
13115 keys.
13116 */
13117 const uint pk = pk_index(table, m_tbl_def);
13118 res = ha_index_init(pk, true);
13119 if (res) DBUG_RETURN(res);
13120
13121 /* Scan each record in the primary key in order */
13122 for (res = index_first(table->record[0]); res == 0;
13123 res = index_next(table->record[0])) {
13124 longlong hidden_pk_id = 0;
13125 if (hidden_pk_exists &&
13126 (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) {
13127 // NO_LINT_DEBUG
13128 sql_print_error("Error retrieving hidden pk id.");
13129 ha_index_end();
13130 DBUG_RETURN(res);
13131 }
13132
13133 /* Create new secondary index entry */
13134 const int new_packed_size = index->pack_record(
13135 new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
13136 &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
13137 nullptr, m_ttl_bytes);
13138
13139 const rocksdb::Slice key = rocksdb::Slice(
13140 reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
13141 const rocksdb::Slice val =
13142 rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
13143 m_sk_tails.get_current_pos());
13144
13145 /*
13146 Add record to offset tree in preparation for writing out to
13147 disk in sorted chunks.
13148 */
13149 if ((res = rdb_merge.add(key, val))) {
13150 ha_index_end();
13151 DBUG_RETURN(res);
13152 }
13153 }
13154
13155 if (res != HA_ERR_END_OF_FILE) {
13156 // NO_LINT_DEBUG
13157 sql_print_error("Error retrieving index entry from primary key.");
13158 ha_index_end();
13159 DBUG_RETURN(res);
13160 }
13161
13162 ha_index_end();
13163
13164 /*
13165 Perform an n-way merge of n sorted buffers on disk, then writes all
13166 results to RocksDB via SSTFileWriter API.
13167 */
13168 rocksdb::Slice merge_key;
13169 rocksdb::Slice merge_val;
13170
13171 struct unique_sk_buf_info sk_info;
13172 sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
13173 sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
13174
13175 while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
13176 /* Perform uniqueness check if needed */
13177 if (is_unique_index) {
13178 if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
13179 /*
13180 Duplicate entry found when trying to create unique secondary key.
13181 We need to unpack the record into new_table_arg->record[0] as it
13182 is used inside print_keydup_error so that the error message shows
13183 the duplicate record.
13184 */
13185 if (index->unpack_record(
13186 new_table_arg, new_table_arg->record[0], &merge_key,
13187 &merge_val, m_converter->get_verify_row_debug_checksums())) {
13188 /* Should never reach here */
13189 assert(0);
13190 }
13191
13192 print_keydup_error(new_table_arg,
13193 &new_table_arg->key_info[index->get_keyno()],
13194 MYF(0));
13195 DBUG_RETURN(ER_DUP_ENTRY);
13196 }
13197 }
13198
13199 /*
13200 Insert key and slice to SST via SSTFileWriter API.
13201 */
13202 if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) {
13203 break;
13204 }
13205 }
13206
13207 /*
13208 Here, res == -1 means that we are finished, while > 0 means an error
13209 occurred.
13210 */
13211 if (res > 0) {
13212 // NO_LINT_DEBUG
13213 sql_print_error("Error while bulk loading keys in external merge sort.");
13214 DBUG_RETURN(res);
13215 }
13216
13217 bool is_critical_error;
13218 res = tx->finish_bulk_load(&is_critical_error);
13219 if (res && is_critical_error) {
13220 // NO_LINT_DEBUG
13221 sql_print_error("Error finishing bulk load.");
13222 DBUG_RETURN(res);
13223 }
13224 }
13225
13226 /*
13227 Explicitly tell jemalloc to clean up any unused dirty pages at this point.
13228 See https://reviews.facebook.net/D63723 for more details.
13229 */
13230 purge_all_jemalloc_arenas();
13231
13232 DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
13233 DBUG_RETURN(res);
13234 }
13235
13236 /**
13237 Commit or rollback the changes made during prepare_inplace_alter_table()
13238 and inplace_alter_table() inside the storage engine.
13239 Note that in case of rollback the allowed level of concurrency during
13240 this operation will be the same as for inplace_alter_table() and thus
13241 might be higher than during prepare_inplace_alter_table(). (For example,
13242 concurrent writes were blocked during prepare, but might not be during
13243 rollback).
13244
13245 @note Storage engines are responsible for reporting any errors by
13246 calling my_error()/print_error()
13247
13248 @note If this function with commit= true reports error, it will be called
13249 again with commit= false.
13250
13251 @note In case of partitioning, this function might be called for rollback
13252 without prepare_inplace_alter_table() having been called first.
13253 Also partitioned tables sets ha_alter_info->group_commit_ctx to a NULL
13254 terminated array of the partitions handlers and if all of them are
13255 committed as one, then group_commit_ctx should be set to NULL to indicate
13256 to the partitioning handler that all partitions handlers are committed.
13257 @see prepare_inplace_alter_table().
13258
13259 @param altered_table TABLE object for new version of table.
13260 @param ha_alter_info Structure describing changes to be done
13261 by ALTER TABLE and holding data used
13262 during in-place alter.
13263 @param commit True => Commit, False => Rollback.
13264
13265 @retval true Error
13266 @retval false Success
13267 */
13268 bool ha_rocksdb::commit_inplace_alter_table(
13269 my_core::TABLE *const altered_table,
13270 my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
13271 DBUG_ENTER_FUNC();
13272
13273 assert(altered_table != nullptr);
13274 assert(ha_alter_info != nullptr);
13275
13276 Rdb_inplace_alter_ctx *const ctx0 =
13277 static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);
13278
13279 DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table");
13280
13281 /*
13282 IMPORTANT: When rollback is requested, mysql will abort with
13283 an assertion failure. That means every failed commit during inplace alter
13284 table will result in a fatal error on the server. Indexes ongoing creation
13285 will be detected when the server restarts, and dropped.
13286
13287 For partitioned tables, a rollback call to this function (commit == false)
13288 is done for each partition. A successful commit call only executes once
13289 for all partitions.
13290 */
13291 if (!commit) {
13292 /* If ctx has not been created yet, nothing to do here */
13293 if (!ctx0) {
13294 DBUG_RETURN(HA_EXIT_SUCCESS);
13295 }
13296
13297 /*
13298 Cannot call destructor for Rdb_tbl_def directly because we don't want to
13299 erase the mappings inside the ddl_manager, as the old_key_descr is still
13300 using them.
13301 */
13302 if (ctx0->m_new_key_descr) {
13303 /* Delete the new key descriptors */
13304 for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
13305 ctx0->m_new_key_descr[i] = nullptr;
13306 }
13307
13308 delete[] ctx0->m_new_key_descr;
13309 ctx0->m_new_key_descr = nullptr;
13310 ctx0->m_new_tdef->m_key_descr_arr = nullptr;
13311
13312 delete ctx0->m_new_tdef;
13313 }
13314
13315 {
13316 std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
13317 /* Remove uncommitted key definitons from ddl_manager */
13318 ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);
13319
13320 std::unordered_set<GL_INDEX_ID> all_gl_index_ids;
13321 dict_manager.get_ongoing_create_indexes(&all_gl_index_ids);
13322
13323 std::unordered_set<GL_INDEX_ID> gl_index_ids;
13324 for (auto index : ctx0->m_added_indexes) {
13325 auto gl_index_id = index->get_gl_index_id();
13326 if (all_gl_index_ids.find(gl_index_id) != all_gl_index_ids.end()) {
13327 gl_index_ids.insert(gl_index_id);
13328 }
13329 }
13330
13331 if (!gl_index_ids.empty()) {
13332 /* Rollback any partially created indexes of this table */
13333 dict_manager.rollback_ongoing_index_creation(gl_index_ids);
13334 }
13335 }
13336
13337 DBUG_RETURN(HA_EXIT_SUCCESS);
13338 }
13339
13340 assert(ctx0);
13341
13342 /*
13343 For partitioned tables, we need to commit all changes to all tables at
13344 once, unlike in the other inplace alter API methods.
13345 */
13346 inplace_alter_handler_ctx **ctx_array;
13347 inplace_alter_handler_ctx *ctx_single[2];
13348
13349 if (ha_alter_info->group_commit_ctx) {
13350 DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
13351 ctx_array = ha_alter_info->group_commit_ctx;
13352 } else {
13353 ctx_single[0] = ctx0;
13354 ctx_single[1] = nullptr;
13355 ctx_array = ctx_single;
13356 }
13357
13358 assert(ctx0 == ctx_array[0]);
13359 ha_alter_info->group_commit_ctx = nullptr;
13360
13361 if (ha_alter_info->handler_flags &
13362 (my_core::Alter_inplace_info::DROP_INDEX |
13363 my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
13364 my_core::Alter_inplace_info::ADD_INDEX |
13365 my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) {
13366 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13367 rocksdb::WriteBatch *const batch = wb.get();
13368 std::unordered_set<GL_INDEX_ID> create_index_ids;
13369
13370 m_tbl_def = ctx0->m_new_tdef;
13371 m_key_descr_arr = m_tbl_def->m_key_descr_arr;
13372 m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];
13373
13374 DBUG_EXECUTE_IF("rocksdb_commit_alter_table", {
13375 static constexpr char act[] =
13376 "now signal ready_to_mark_cf_dropped_before_commit_alter_table "
13377 "wait_for mark_cf_dropped_done_before_commit_alter_table";
13378 assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
13379 });
13380
13381 {
13382 std::lock_guard<Rdb_dict_manager> dm_lock(dict_manager);
13383 for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
13384 Rdb_inplace_alter_ctx *const ctx =
13385 static_cast<Rdb_inplace_alter_ctx *>(*pctx);
13386
13387 /* Mark indexes to be dropped */
13388 dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);
13389
13390 for (const auto &index : ctx->m_added_indexes) {
13391 create_index_ids.insert(index->get_gl_index_id());
13392 }
13393
13394 if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
13395 /*
13396 Failed to write new entry into data dictionary, this should never
13397 happen.
13398 */
13399 assert(0);
13400 }
13401
13402 /*
13403 Remove uncommitted key definitons from ddl_manager, as they are now
13404 committed into the data dictionary.
13405 */
13406 ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
13407 }
13408
13409 if (dict_manager.commit(batch)) {
13410 /*
13411 Should never reach here. We assume MyRocks will abort if commit
13412 fails.
13413 */
13414 assert(0);
13415 }
13416
13417 /* Mark ongoing create indexes as finished/remove from data dictionary */
13418 dict_manager.finish_indexes_operation(
13419 create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
13420 }
13421
13422 DBUG_EXECUTE_IF("rocksdb_delete_index", {
13423 static constexpr char act[] =
13424 "now signal ready_to_mark_cf_dropped_after_commit_alter_table "
13425 "wait_for mark_cf_dropped_done_after_commit_alter_table";
13426 assert(!debug_sync_set_action(ha_thd(), STRING_WITH_LEN(act)));
13427 });
13428
13429 rdb_drop_idx_thread.signal();
13430
13431 if (rocksdb_table_stats_use_table_scan && !ctx0->m_added_indexes.empty()) {
13432 // If new indexes are created, add the table to the recalc queue
13433 // to calculate stats for new indexes
13434 rdb_is_thread.add_index_stats_request(m_tbl_def->full_tablename());
13435 }
13436 }
13437
13438 if (ha_alter_info->handler_flags &
13439 (my_core::Alter_inplace_info::CHANGE_CREATE_OPTION)) {
13440 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13441 rocksdb::WriteBatch *const batch = wb.get();
13442 std::unordered_set<GL_INDEX_ID> create_index_ids;
13443
13444 ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;
13445
13446 for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
13447 Rdb_inplace_alter_ctx *const ctx =
13448 static_cast<Rdb_inplace_alter_ctx *>(*pctx);
13449 auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
13450 dict_manager.put_auto_incr_val(
13451 batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
13452 true /* overwrite */);
13453 ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
13454 }
13455
13456 if (dict_manager.commit(batch)) {
13457 assert(0);
13458 }
13459 }
13460
13461 DBUG_RETURN(HA_EXIT_SUCCESS);
13462 }
13463
/* Name of the generated SHOW-status callback for a given RocksDB ticker. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW-status callback function: it refreshes the cached ticker
  value in rocksdb_status_counters.<name> from the shared rocksdb_stats
  statistics object and exposes it as a SHOW_LONGLONG value.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = (char *)&rocksdb_status_counters.name;                    \
    return HA_EXIT_SUCCESS;                                                \
  }

/*
  SHOW_VAR entry backed by a generated DEF_SHOW_FUNC callback; the
  variable is published as "rocksdb_<name>".
*/
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC, SHOW_SCOPE_GLOBAL }

/*
  SHOW_VAR entry that points directly at an existing counter variable;
  "rocksdb_" is prepended to the (string literal) name.
*/
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option, SHOW_SCOPE_GLOBAL }

/*
  SHOW_VAR entry that points directly at a variable; unlike the macros
  above, the name is used verbatim (no "rocksdb_" prefix added here).
*/
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option, SHOW_SCOPE_GLOBAL }
13483
/*
  Cache of RocksDB ticker values exposed through SHOW STATUS. Each field
  is refreshed on demand by its DEF_SHOW_FUNC-generated callback, which
  copies the corresponding ticker from the shared statistics object.
*/
struct rocksdb_status_counters_t {
  /* Block cache hit/miss/insert counters */
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  /* Bloom filter effectiveness counters */
  uint64_t bloom_filter_useful;
  uint64_t bloom_filter_full_positive;
  uint64_t bloom_filter_full_true_positive;
  /* Read-path counters (memtable and LSM level hits) */
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  /* Compaction key-drop counters */
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  /* Key/byte throughput counters */
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  /* Iterator operation counters */
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  /* File and stall counters */
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t get_updates_since_calls;
  uint64_t block_cache_compressed_miss;
  uint64_t block_cache_compressed_hit;
  /* WAL and write-path counters */
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  /* Flush/compaction byte counters */
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};

/* Single global instance backing all the SHOW-status callbacks. */
static rocksdb_status_counters_t rocksdb_status_counters;
13560
/*
  Instantiate one SHOW-status callback per exported RocksDB ticker.
  First macro argument: field name in rocksdb_status_counters_t (and the
  suffix of the published "rocksdb_*" status variable); second argument:
  the rocksdb::Tickers enumerator it is read from.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE)
DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(get_updates_since_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cache_compressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cache_compressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
13633
13634 static void myrocks_update_status() {
13635 export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
13636 export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
13637 export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
13638 export_stats.rows_read = global_stats.rows[ROWS_READ];
13639 export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
13640 export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
13641 export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];
13642
13643 export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
13644 export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
13645 export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
13646 export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
13647
13648 export_stats.queries_point = global_stats.queries[QUERIES_POINT];
13649 export_stats.queries_range = global_stats.queries[QUERIES_RANGE];
13650
13651 export_stats.table_index_stats_success =
13652 global_stats.table_index_stats_result[TABLE_INDEX_STATS_SUCCESS];
13653 export_stats.table_index_stats_failure =
13654 global_stats.table_index_stats_result[TABLE_INDEX_STATS_FAILURE];
13655 export_stats.table_index_stats_req_queue_length =
13656 rdb_is_thread.get_request_queue_size();
13657
13658 export_stats.covered_secondary_key_lookups =
13659 global_stats.covered_secondary_key_lookups;
13660 }
13661
13662 static void myrocks_update_memory_status() {
13663 std::vector<rocksdb::DB *> dbs;
13664 std::unordered_set<const rocksdb::Cache *> cache_set;
13665 dbs.push_back(rdb);
13666 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
13667 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
13668 &temp_usage_by_type);
13669 memory_stats.memtable_total =
13670 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal];
13671 memory_stats.memtable_unflushed =
13672 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed];
13673 }
13674
/*
  MyRocks-level status variables (row/query counters and memtable sizes)
  published as a nested SHOW_ARRAY under the "rocksdb" prefix by
  show_myrocks_vars(). All entries point into export_stats/memory_stats,
  which are refreshed right before this array is read.
*/
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind", &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired", &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered", &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted",
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted",
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated",
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total", &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed", &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point", &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("table_index_stats_success",
                        &export_stats.table_index_stats_success, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("table_index_stats_failure",
                        &export_stats.table_index_stats_failure, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("table_index_stats_req_queue_length",
                        &export_stats.table_index_stats_req_queue_length,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups",
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    // end of the array marker
    {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
13717
13718 static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
13719 myrocks_update_status();
13720 myrocks_update_memory_status();
13721 var->type = SHOW_ARRAY;
13722 var->value = reinterpret_cast<char *>(&myrocks_status_variables);
13723 }
13724
13725 static ulonglong io_stall_prop_value(
13726 const std::map<std::string, std::string> &props, const std::string &key) {
13727 std::map<std::string, std::string>::const_iterator iter =
13728 props.find("io_stalls." + key);
13729 if (iter != props.end()) {
13730 return std::stoull(iter->second);
13731 } else {
13732 DBUG_PRINT("warning",
13733 ("RocksDB GetMapProperty hasn't returned key=%s", key.c_str()));
13734 assert(0);
13735 return 0;
13736 }
13737 }
13738
13739 static void update_rocksdb_stall_status() {
13740 st_io_stall_stats local_io_stall_stats;
13741 for (const auto &cf_name : cf_manager.get_cf_names()) {
13742 std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
13743 cf_manager.get_cf(cf_name);
13744 if (!cfh) {
13745 continue;
13746 }
13747
13748 // Retrieve information from valid CF handle object. It is safe
13749 // even if the CF is removed from cf_manager at this point.
13750 std::map<std::string, std::string> props;
13751 if (!rdb->GetMapProperty(cfh.get(), "rocksdb.cfstats", &props)) {
13752 continue;
13753 }
13754
13755 local_io_stall_stats.level0_slowdown +=
13756 io_stall_prop_value(props, "level0_slowdown");
13757 local_io_stall_stats.level0_slowdown_with_compaction +=
13758 io_stall_prop_value(props, "level0_slowdown_with_compaction");
13759 local_io_stall_stats.level0_numfiles +=
13760 io_stall_prop_value(props, "level0_numfiles");
13761 local_io_stall_stats.level0_numfiles_with_compaction +=
13762 io_stall_prop_value(props, "level0_numfiles_with_compaction");
13763 local_io_stall_stats.stop_for_pending_compaction_bytes +=
13764 io_stall_prop_value(props, "stop_for_pending_compaction_bytes");
13765 local_io_stall_stats.slowdown_for_pending_compaction_bytes +=
13766 io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes");
13767 local_io_stall_stats.memtable_compaction +=
13768 io_stall_prop_value(props, "memtable_compaction");
13769 local_io_stall_stats.memtable_slowdown +=
13770 io_stall_prop_value(props, "memtable_slowdown");
13771 local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop");
13772 local_io_stall_stats.total_slowdown +=
13773 io_stall_prop_value(props, "total_slowdown");
13774 }
13775 io_stall_stats = local_io_stall_stats;
13776 }
13777
/*
  Write-stall status variables published as a nested SHOW_ARRAY under the
  "rocksdb_stall" prefix by show_rocksdb_stall_vars(). All entries point
  into io_stall_stats, which is refreshed right before this array is read.
*/
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops",
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns",
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops",
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns",
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
13805
13806 static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) {
13807 update_rocksdb_stall_status();
13808 var->type = SHOW_ARRAY;
13809 var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables);
13810 }
13811
/*
  Top-level status variable table registered with the server. It combines
  the DEF_SHOW_FUNC-backed RocksDB ticker callbacks, pointer-backed
  MyRocks counters, and the two nested SHOW_ARRAY groups ("rocksdb" and
  "rocksdb_stall") defined above.
*/
static SHOW_VAR rocksdb_status_vars[] = {
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_add_failures),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_index_add),
    DEF_STATUS_VAR(block_cache_index_bytes_insert),
    DEF_STATUS_VAR(block_cache_index_bytes_evict),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_filter_add),
    DEF_STATUS_VAR(block_cache_filter_bytes_insert),
    DEF_STATUS_VAR(block_cache_filter_bytes_evict),
    DEF_STATUS_VAR(block_cache_bytes_read),
    DEF_STATUS_VAR(block_cache_bytes_write),
    DEF_STATUS_VAR(block_cache_data_bytes_insert),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(block_cache_data_add),
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(bloom_filter_full_positive),
    DEF_STATUS_VAR(bloom_filter_full_true_positive),
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(get_hit_l0),
    DEF_STATUS_VAR(get_hit_l1),
    DEF_STATUS_VAR(get_hit_l2_and_up),
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    DEF_STATUS_VAR(number_db_seek),
    DEF_STATUS_VAR(number_db_seek_found),
    DEF_STATUS_VAR(number_db_next),
    DEF_STATUS_VAR(number_db_next_found),
    DEF_STATUS_VAR(number_db_prev),
    DEF_STATUS_VAR(number_db_prev_found),
    DEF_STATUS_VAR(iter_bytes_read),
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(stall_micros),
    DEF_STATUS_VAR(num_iterators),
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(get_updates_since_calls),
    DEF_STATUS_VAR(block_cache_compressed_miss),
    DEF_STATUS_VAR(block_cache_compressed_hit),
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    // Pointer-backed MyRocks counters (not RocksDB tickers).
    DEF_STATUS_VAR_PTR("row_lock_deadlocks", &rocksdb_row_lock_deadlocks,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("row_lock_wait_timeouts",
                       &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_processed",
                       &rocksdb_manual_compactions_processed, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_running",
                       &rocksdb_manual_compactions_running, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("additional_compaction_triggers",
                       &rocksdb_additional_compaction_triggers, SHOW_LONGLONG),
#ifndef NDEBUG
    DEF_STATUS_VAR_PTR("num_get_for_update_calls",
                       &rocksdb_num_get_for_update_calls, SHOW_LONGLONG),
#endif
    // the variables generated by SHOW_FUNC are sorted only by prefix (first
    // arg in the tuple below), so make sure it is unique to make sorting
    // deterministic as quick sort is not stable
    {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC,
     SHOW_SCOPE_GLOBAL},
    {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
     SHOW_FUNC, SHOW_SCOPE_GLOBAL},
    // end of the array marker
    {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
13921
13922 /*
13923 Background thread's main logic
13924 */
13925
13926 void Rdb_background_thread::run() {
13927 // How many seconds to wait till flushing the WAL next time.
13928 const int WAKE_UP_INTERVAL = 1;
13929
13930 timespec ts_next_sync;
13931 clock_gettime(CLOCK_REALTIME, &ts_next_sync);
13932 ts_next_sync.tv_sec += WAKE_UP_INTERVAL;
13933
13934 for (;;) {
13935 // Wait until the next timeout or until we receive a signal to stop the
13936 // thread. Request to stop the thread should only be triggered when the
13937 // storage engine is being unloaded.
13938 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
13939 const auto ret MY_ATTRIBUTE((__unused__)) =
13940 mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);
13941
13942 // Check that we receive only the expected error codes.
13943 assert(ret == 0 || ret == ETIMEDOUT);
13944 const THD::killed_state local_killed = m_killed;
13945 const bool local_save_stats = m_save_stats;
13946 reset();
13947 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
13948
13949 if (local_killed) {
13950 // If we're here then that's because condition variable was signaled by
13951 // another thread and we're shutting down. Break out the loop to make
13952 // sure that shutdown thread can proceed.
13953 break;
13954 }
13955
13956 // This path should be taken only when the timer expired.
13957 assert(ret == ETIMEDOUT);
13958
13959 if (local_save_stats) {
13960 ddl_manager.persist_stats();
13961 }
13962
13963 timespec ts;
13964 clock_gettime(CLOCK_REALTIME, &ts);
13965
13966 // Flush the WAL. Sync it for both background and never modes to copy
13967 // InnoDB's behavior. For mode never, the wal file isn't even written,
13968 // whereas background writes to the wal file, but issues the syncs in a
13969 // background thread.
13970 if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
13971 !rocksdb_db_options->allow_mmap_writes) {
13972 const rocksdb::Status s = rdb->FlushWAL(true);
13973 if (!s.ok()) {
13974 rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
13975 }
13976 }
13977
13978 // Recalculate statistics for indexes only if
13979 // rocksdb_table_stats_use_table_scan is disabled.
13980 // Otherwise, Rdb_index_stats_thread will do the work
13981 if (!rocksdb_table_stats_use_table_scan && rocksdb_stats_recalc_rate) {
13982 std::vector<std::string> to_recalc;
13983 if (rdb_tables_to_recalc.empty()) {
13984 struct Rdb_index_collector : public Rdb_tables_scanner {
13985 int add_table(Rdb_tbl_def *tdef) override {
13986 rdb_tables_to_recalc.push_back(tdef->full_tablename());
13987 return HA_EXIT_SUCCESS;
13988 }
13989 } collector;
13990 ddl_manager.scan_for_tables(&collector);
13991 }
13992
13993 while (to_recalc.size() < rocksdb_stats_recalc_rate &&
13994 !rdb_tables_to_recalc.empty()) {
13995 to_recalc.push_back(rdb_tables_to_recalc.back());
13996 rdb_tables_to_recalc.pop_back();
13997 }
13998
13999 for (const auto &tbl_name : to_recalc) {
14000 calculate_stats_for_table(tbl_name, SCAN_TYPE_NONE);
14001 }
14002 }
14003
14004 // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
14005 // pthread_cond_timedwait()) to wait on.
14006 ts_next_sync.tv_sec = ts.tv_sec + WAKE_UP_INTERVAL;
14007 }
14008
14009 // save remaining stats which might've left unsaved
14010 ddl_manager.persist_stats();
14011 }
14012
/*
  Index-statistics thread's main logic: drains the queue of tables whose
  index statistics should be recalculated via a full table scan. Active
  only while rocksdb_table_stats_use_table_scan is enabled; otherwise it
  sleeps for long intervals and clears any queued requests. Exits when
  m_killed is set.
*/
void Rdb_index_stats_thread::run() {
  const int WAKE_UP_INTERVAL = 1;
#ifdef TARGET_OS_LINUX
  // Record our kernel thread id so renice() can later adjust this
  // thread's scheduling priority via setpriority().
  RDB_MUTEX_LOCK_CHECK(m_is_mutex);
  m_tid_set = true;
  m_tid = syscall(SYS_gettid);
  RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
#endif

  renice(rocksdb_table_stats_background_thread_nice_value);
  for (;;) {
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
    if (m_killed) {
      RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
      break;
    }

    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);

    // Wait for 24 hours if the table scan based index calculation
    // is off. When the switch is turned on and any request is added
    // to the recalc queue, this thread will be signaled.
    ts.tv_sec +=
        (rocksdb_table_stats_use_table_scan) ? WAKE_UP_INTERVAL : 24 * 60 * 60;

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);

    // Re-check the kill flag after waking up; shutdown signals this
    // condition variable.
    if (m_killed) {
      RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
      break;
    }

    // Make sure, no program error is returned
    assert(ret == 0 || ret == ETIMEDOUT);
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    // Drain the recalc queue, one table per iteration.
    for (;;) {
      if (!rocksdb_table_stats_use_table_scan) {
        // Clear the recalc queue
        clear_all_index_stats_requests();
        break;
      }

      std::string tbl_name;
      if (!get_index_stats_request(&tbl_name)) {
        // No request in the recalc queue
        break;
      }

      Rdb_table_stats tbl_stats;
      if (ddl_manager.find_table_stats(tbl_name, &tbl_stats) !=
          HA_EXIT_SUCCESS) {
        // The table has been dropped. Skip this table.
        continue;
      }

      clock_gettime(CLOCK_REALTIME, &ts);
      if (difftime(ts.tv_sec, tbl_stats.m_last_recalc) <
          RDB_MIN_RECALC_INTERVAL) {
        /* Stats were (re)calculated not long ago. To avoid
        too frequent stats updates we put back the table on
        the recalc queue and do nothing. */

        add_index_stats_request(tbl_name);
        break;
      }

      // Debug-build sync point: allows tests to coordinate with this
      // thread around the stats calculation for table "test.t".
      DBUG_EXECUTE_IF("rocksdb_is_bg_thread", {
        if (tbl_name == "test.t") {
          THD *thd = new THD();
          thd->thread_stack = reinterpret_cast<char *>(&thd);
          thd->store_globals();

          static constexpr char act[] = "now wait_for ready_to_calculate_index_stats";
          assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));

          thd->restore_globals();
          delete thd;
        }
      });

      int err =
          calculate_stats_for_table(tbl_name, SCAN_TYPE_FULL_TABLE, &m_killed);

      if (err != HA_EXIT_SUCCESS) {
        global_stats.table_index_stats_result[TABLE_INDEX_STATS_FAILURE].inc();
        break;
      }

      global_stats.table_index_stats_result[TABLE_INDEX_STATS_SUCCESS].inc();

      DBUG_EXECUTE_IF("rocksdb_is_bg_thread", {
        if (tbl_name == "test.t") {
          THD *thd = new THD();
          thd->thread_stack = reinterpret_cast<char *>(&thd);
          thd->store_globals();

          static constexpr char act[] = "now signal index_stats_calculation_done";
          assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));

          thd->restore_globals();
          delete thd;
        }
      });
    }
  }

  // Thread is exiting: invalidate the recorded thread id.
  RDB_MUTEX_LOCK_CHECK(m_is_mutex);
  m_tid_set = false;
  m_tid = 0;
  RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
}
14127
14128 bool Rdb_index_stats_thread::get_index_stats_request(std::string *tbl_name) {
14129 RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14130 if (m_requests.empty()) {
14131 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14132 return false;
14133 }
14134
14135 *tbl_name = m_requests[0];
14136 m_requests.pop_front();
14137
14138 auto count = m_tbl_names.erase(*tbl_name);
14139 if (count != 1) {
14140 assert(0);
14141 }
14142
14143 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14144 return true;
14145 }
14146
14147 void Rdb_index_stats_thread::add_index_stats_request(
14148 const std::string &tbl_name) {
14149 RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14150
14151 /* Quit if already in the queue */
14152 auto ret = m_tbl_names.insert(tbl_name);
14153 if (!ret.second) {
14154 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14155 return;
14156 }
14157
14158 m_requests.push_back(*ret.first);
14159 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14160 signal();
14161 }
14162
14163 void Rdb_index_stats_thread::clear_all_index_stats_requests() {
14164 RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14165 m_requests.clear();
14166 m_tbl_names.clear();
14167 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14168 }
14169
14170 int Rdb_index_stats_thread::renice(int nice_val) {
14171 RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14172 if (!m_tid_set) {
14173 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14174 return HA_EXIT_FAILURE;
14175 }
14176
14177 #ifdef TARGET_OS_LINUX
14178 int ret = setpriority(PRIO_PROCESS, m_tid, nice_val);
14179 if (ret != 0) {
14180 // NO_LINT_DEBUG
14181 sql_print_error("Set index stats thread priority failed due to %s",
14182 strerror(errno));
14183 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14184 return HA_EXIT_FAILURE;
14185 }
14186 #endif
14187
14188 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14189 return HA_EXIT_SUCCESS;
14190 }
14191
14192 size_t Rdb_index_stats_thread::get_request_queue_size() {
14193 size_t len = 0;
14194 RDB_MUTEX_LOCK_CHECK(m_is_mutex);
14195 len = m_requests.size();
14196 RDB_MUTEX_UNLOCK_CHECK(m_is_mutex);
14197
14198 return len;
14199 }
14200
14201 /*
14202 A background thread to handle manual compactions,
14203 except for dropping indexes/tables. Every second, it checks
14204 pending manual compactions, and it calls CompactRange if there is.
14205 */
14206 void Rdb_manual_compaction_thread::run() {
14207 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14208 for (;;) {
14209 if (m_killed) {
14210 break;
14211 }
14212 timespec ts;
14213 clock_gettime(CLOCK_REALTIME, &ts);
14214 ts.tv_sec += 1;
14215
14216 const auto ret MY_ATTRIBUTE((__unused__)) =
14217 mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
14218 if (m_killed) {
14219 break;
14220 }
14221 // make sure, no program error is returned
14222 assert(ret == 0 || ret == ETIMEDOUT);
14223 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14224
14225 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14226 // Grab the first item and proceed, if not empty.
14227 if (m_requests.empty()) {
14228 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14229 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14230 continue;
14231 }
14232 Manual_compaction_request &mcr = m_requests.begin()->second;
14233 assert(mcr.cf);
14234 assert(mcr.state == Manual_compaction_request::INITED);
14235 mcr.state = Manual_compaction_request::RUNNING;
14236 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14237
14238 assert(mcr.state == Manual_compaction_request::RUNNING);
14239 // NO_LINT_DEBUG
14240 sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id,
14241 mcr.cf->GetName().c_str());
14242 rocksdb_manual_compactions_running++;
14243 if (rocksdb_debug_manual_compaction_delay > 0) {
14244 // In Facebook MySQL 5.6.35, my_sleep breaks the sleep when the server
14245 // gets a shutdown signal and this code depended on that behavior.
14246 // In 5.7, for whatever reason, this is not the case. my_sleep will
14247 // continue to sleep until the sleep time has elapsed. For the purpose
14248 // of this variable and the accompanying test case, we need to break this
14249 // down into a loop that sleeps and checks to see if the thread was
14250 // signalled with the stop flag. It is ugly, but without having DBUG_SYNC
14251 // available in background threads, it is good enough for the test.
14252 for (uint32_t sleeps = 0; sleeps < rocksdb_debug_manual_compaction_delay;
14253 sleeps++) {
14254 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14255 const bool local_stop = m_killed;
14256 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14257 if (local_stop) break;
14258 my_sleep(1000000);
14259 }
14260 }
14261
14262 DBUG_EXECUTE_IF("rocksdb_manual_compaction", {
14263 THD *thd = new THD();
14264 thd->thread_stack = reinterpret_cast<char *>(&(thd));
14265 thd->store_globals();
14266 static constexpr char act[] =
14267 "now signal ready_to_mark_cf_dropped_in_manual_compaction wait_for "
14268 "mark_cf_dropped_done_in_manual_compaction";
14269 assert(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
14270 thd->restore_globals();
14271 delete thd;
14272 });
14273
14274 // CompactRange may take a very long time. On clean shutdown,
14275 // it is cancelled by CancelAllBackgroundWork, then status is
14276 // set to shutdownInProgress.
14277 const rocksdb::Status s =
14278 rdb->CompactRange(getCompactRangeOptions(
14279 mcr.concurrency, mcr.bottommost_level_compaction),
14280 mcr.cf.get(), mcr.start, mcr.limit);
14281
14282 rocksdb_manual_compactions_running--;
14283 if (s.ok()) {
14284 // NO_LINT_DEBUG
14285 sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id,
14286 mcr.cf->GetName().c_str());
14287 } else {
14288 // NO_LINT_DEBUG
14289 sql_print_information("Manual Compaction id %d cf %s aborted. %s",
14290 mcr.mc_id, mcr.cf->GetName().c_str(), s.getState());
14291 if (!cf_manager.get_cf(mcr.cf->GetID())) {
14292 // NO_LINT_DEBUG
14293 sql_print_information("cf %s has been dropped",
14294 mcr.cf->GetName().c_str());
14295 } else if (!s.IsShutdownInProgress()) {
14296 rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
14297 } else {
14298 assert(m_requests.size() == 1);
14299 }
14300 }
14301 rocksdb_manual_compactions_processed++;
14302 clear_manual_compaction_request(mcr.mc_id, false);
14303 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
14304 }
14305 clear_all_manual_compaction_requests();
14306 assert(m_requests.empty());
14307 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
14308 }
14309
/// Drop every queued manual-compaction request (used on thread shutdown).
void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() {
  RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
  m_requests.clear();
  RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
}
14315
14316 void Rdb_manual_compaction_thread::clear_manual_compaction_request(
14317 int mc_id, bool init_only) {
14318 bool erase = true;
14319 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14320 auto it = m_requests.find(mc_id);
14321 if (it != m_requests.end()) {
14322 if (init_only) {
14323 Manual_compaction_request mcr = it->second;
14324 if (mcr.state != Manual_compaction_request::INITED) {
14325 erase = false;
14326 }
14327 }
14328 if (erase) {
14329 m_requests.erase(it);
14330 }
14331 } else {
14332 // Current code path guarantees that erasing by the same mc_id happens
14333 // at most once. INITED state may be erased by a thread that requested
14334 // the compaction. RUNNING state is erased by mc thread only.
14335 assert(0);
14336 }
14337 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14338 }
14339
14340 int Rdb_manual_compaction_thread::request_manual_compaction(
14341 std::shared_ptr<rocksdb::ColumnFamilyHandle> cf, rocksdb::Slice *start,
14342 rocksdb::Slice *limit, int concurrency,
14343 rocksdb::BottommostLevelCompaction bottommost_level_compaction) {
14344 int mc_id = -1;
14345 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14346 if (m_requests.size() >= rocksdb_max_manual_compactions) {
14347 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14348 return mc_id;
14349 }
14350 Manual_compaction_request mcr;
14351 mc_id = mcr.mc_id = ++m_latest_mc_id;
14352 mcr.state = Manual_compaction_request::INITED;
14353 mcr.cf = cf;
14354 mcr.start = start;
14355 mcr.limit = limit;
14356 mcr.concurrency = concurrency;
14357 mcr.bottommost_level_compaction = bottommost_level_compaction;
14358 m_requests.insert(std::make_pair(mcr.mc_id, mcr));
14359 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14360 return mc_id;
14361 }
14362
14363 bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) {
14364 bool finished = false;
14365 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
14366 if (m_requests.count(mc_id) == 0) {
14367 finished = true;
14368 }
14369 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
14370 return finished;
14371 }
14372
14373 /**
14374 * Locking read + Not Found + Read Committed occurs if we accessed
14375 * a row by Seek, tried to lock it, failed, released and reacquired the
14376 * snapshot (because of READ COMMITTED mode) and the row was deleted by
14377 * someone else in the meantime.
14378 * If so, we either just skipping the row, or re-creating a snapshot
14379 * and seek again. In both cases, Read Committed constraint is not broken.
14380 */
14381 bool ha_rocksdb::should_skip_invalidated_record(const int rc) const {
14382 if ((m_lock_rows != RDB_LOCK_NONE && rc == HA_ERR_KEY_NOT_FOUND &&
14383 my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED)) {
14384 return true;
14385 }
14386 return false;
14387 }
14388 /**
14389 * Indicating snapshot needs to be re-created and retrying seek again,
14390 * instead of returning errors or empty set. This is normally applicable
14391 * when hitting kBusy when locking the first row of the transaction,
14392 * with Repeatable Read isolation level.
14393 */
14394 bool ha_rocksdb::should_recreate_snapshot(const int rc,
14395 const bool is_new_snapshot) const {
14396 if (should_skip_invalidated_record(rc) ||
14397 (rc == HA_ERR_ROCKSDB_STATUS_BUSY && is_new_snapshot)) {
14398 return true;
14399 }
14400 return false;
14401 }
14402
14403 /**
14404 * If calling put/delete/singledelete without locking the row,
14405 * it is necessary to pass assume_tracked=false to RocksDB TX API.
14406 * Read Free Replication and Blind Deletes are the cases when
14407 * using TX API and skipping row locking.
14408 */
14409 bool ha_rocksdb::can_assume_tracked(THD *thd) {
14410 if (use_read_free_rpl() || (THDVAR(thd, blind_delete_primary_key))) {
14411 return false;
14412 }
14413 return true;
14414 }
14415
14416 bool ha_rocksdb::check_bloom_and_set_bounds(
14417 THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
14418 const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
14419 uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
14420 rocksdb::Slice *upper_bound_slice) {
14421 bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
14422 if (!can_use_bloom && (THDVAR(thd, enable_iterate_bounds))) {
14423 setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound,
14424 lower_bound_slice, upper_bound_slice);
14425 }
14426 return can_use_bloom;
14427 }
14428
14429 /**
14430 Deciding if it is possible to use bloom filter or not.
14431
14432 @detail
14433 Even if bloom filter exists, it is not always possible
14434 to use bloom filter. If using bloom filter when you shouldn't,
14435 false negative may happen -- fewer rows than expected may be returned.
14436 It is users' responsibility to use bloom filter correctly.
14437
14438 If bloom filter does not exist, return value does not matter because
14439 RocksDB does not use bloom filter internally.
14440
14441 @param kd
14442 @param eq_cond Equal condition part of the key. This always includes
14443 system index id (4 bytes).
14444 @param use_all_keys True if all key parts are set with equal conditions.
14445 This is aware of extended keys.
14446 */
14447 bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
14448 const rocksdb::Slice &eq_cond,
14449 const bool use_all_keys) {
14450 bool can_use = false;
14451
14452 if (THDVAR(thd, skip_bloom_filter_on_read)) {
14453 return can_use;
14454 }
14455
14456 const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
14457 if (prefix_extractor) {
14458 /*
14459 This is an optimized use case for CappedPrefixTransform.
14460 If eq_cond length >= prefix extractor length and if
14461 all keys are used for equal lookup, it is
14462 always possible to use bloom filter.
14463
14464 Prefix bloom filter can't be used on descending scan with
14465 prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
14466 RocksDB's limitation. On ascending (or not sorting) scan,
14467 keys longer than the capped prefix length will be truncated down
14468 to the capped length and the resulting key is added to the bloom filter.
14469
14470 Keys shorter than the capped prefix length will be added to
14471 the bloom filter. When keys are looked up, key conditionals
14472 longer than the capped length can be used; key conditionals
14473 shorter require all parts of the key to be available
14474 for the short key match.
14475 */
14476 if ((use_all_keys && prefix_extractor->InRange(eq_cond)) ||
14477 prefix_extractor->SameResultWhenAppended(eq_cond))
14478 can_use = true;
14479 else
14480 can_use = false;
14481 } else {
14482 /*
14483 if prefix extractor is not defined, all key parts have to be
14484 used by eq_cond.
14485 */
14486 if (use_all_keys) {
14487 can_use = true;
14488 } else {
14489 can_use = false;
14490 }
14491 }
14492
14493 return can_use;
14494 }
14495
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return *rocksdb_tbl_options;
}

// Accessors exposing the current values of the corresponding sysvars.
bool rdb_is_table_scan_index_stats_calculation_enabled() {
  return rocksdb_table_stats_use_table_scan;
}
bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
#if !defined(NDEBUG)
// Debug-build-only hooks used by TTL tests to override timestamps.
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif  // !defined(NDEBUG)
14520
/// Accumulate per-operation row counters into the global statistics,
/// keeping system-table traffic separate from user-table traffic.
void rdb_update_global_stats(const operation_type &type, uint count,
                             bool is_system_table) {
  assert(type < ROWS_MAX);

  // Nothing to record for a zero-row operation.
  if (count == 0) {
    return;
  }

  if (is_system_table) {
    global_stats.system_rows[type].add(count);
  } else {
    global_stats.rows[type].add(count);
  }
}
14535
14536 int rdb_get_table_perf_counters(const char *const tablename,
14537 Rdb_perf_counters *const counters) {
14538 assert(counters != nullptr);
14539 assert(tablename != nullptr);
14540
14541 Rdb_table_handler *table_handler;
14542 table_handler = rdb_open_tables.get_table_handler(tablename);
14543 if (table_handler == nullptr) {
14544 return HA_ERR_ROCKSDB_INVALID_TABLE;
14545 }
14546
14547 counters->load(table_handler->m_table_perf_context);
14548
14549 rdb_open_tables.release_table_handler(table_handler);
14550 return HA_EXIT_SUCCESS;
14551 }
14552
14553 const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) {
14554 // If this assertion fails then this means that a member has been either added
14555 // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be
14556 // changed to return the appropriate value.
14557 static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types.");
14558
14559 switch (err_type) {
14560 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT:
14561 return "RDB_IO_ERROR_TX_COMMIT";
14562 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT:
14563 return "RDB_IO_ERROR_DICT_COMMIT";
14564 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD:
14565 return "RDB_IO_ERROR_BG_THREAD";
14566 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL:
14567 return "RDB_IO_ERROR_GENERAL";
14568 default:
14569 assert(false);
14570 return "(unknown)";
14571 }
14572 }
14573
// Centralized fatal-error policy for RocksDB status codes: WAL/dictionary
// commit failures, background-thread write failures, general I/O errors and
// data corruption all abort the server; any other non-OK status is logged.
//
// In case of core dump generation we want this function NOT to be optimized
// so that we can capture as much data as possible to debug the root cause
// more efficiently.
#if defined(NDEBUG)
#ifdef __clang__
MY_ATTRIBUTE((optnone))
#else
MY_ATTRIBUTE((optimize("O0")))
#endif
#endif
void rdb_handle_io_error(const rocksdb::Status status,
                         const RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    switch (err_type) {
      case RDB_IO_ERROR_TX_COMMIT:
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "failed to write to WAL");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_BG_THREAD: {
        rdb_log_status_error(status, "BG thread failed to write to RocksDB");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on BG write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_GENERAL: {
        rdb_log_status_error(status, "failed on I/O");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on I/O error.");
        abort();
        break;
      }
      default:
        assert(0);
        break;
    }
  } else if (status.IsCorruption()) {
    // Persist a marker so the corruption is detected again on restart.
    rdb_log_status_error(status, "data corruption detected!");
    rdb_persist_corruption_marker();
    /* NO_LINT_DEBUG */
    sql_print_error("MyRocks: aborting because of data corruption.");
    abort();
  } else if (!status.ok()) {
    switch (err_type) {
      case RDB_IO_ERROR_TX_COMMIT:
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "Failed to write to WAL (non kIOError)");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      default:
        // Non-commit failures with a non-IO status are logged but survivable.
        rdb_log_status_error(status, "Failed to read/write in RocksDB");
        break;
    }
  }
}
14636
// Accessors for module-level singletons used by other MyRocks translation
// units.
Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }

Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }

Rdb_hton_init_state *rdb_get_hton_init_state(void) { return &hton_init_state; }
14642
14643 void rocksdb_set_compaction_options(
14644 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14645 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14646 void *const var_ptr, const void *const save) {
14647 if (var_ptr && save) {
14648 *(uint64_t *)var_ptr = *(const uint64_t *)save;
14649 }
14650 const Rdb_compact_params params = {
14651 (uint64_t)rocksdb_compaction_sequential_deletes,
14652 (uint64_t)rocksdb_compaction_sequential_deletes_window,
14653 (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
14654 if (properties_collector_factory) {
14655 properties_collector_factory->SetCompactionParams(params);
14656 }
14657 }
14658
14659 void rocksdb_set_table_stats_sampling_pct(
14660 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14661 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14662 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14663 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14664
14665 const uint32_t new_val = *static_cast<const uint32_t *>(save);
14666
14667 if (new_val != rocksdb_table_stats_sampling_pct) {
14668 rocksdb_table_stats_sampling_pct = new_val;
14669
14670 if (properties_collector_factory) {
14671 properties_collector_factory->SetTableStatsSamplingPct(
14672 rocksdb_table_stats_sampling_pct);
14673 }
14674 }
14675
14676 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14677 }
14678
// Sysvar update handler for rocksdb_table_stats_use_table_scan.
// Turning the feature ON seeds every table's stats from its primary key's
// row estimate; turning it OFF discards any queued index-stats requests.
void rocksdb_update_table_stats_use_table_scan(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const var_ptr, const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  bool old_val = *static_cast<const my_bool *>(var_ptr);
  bool new_val = *static_cast<const my_bool *>(save);

  if (old_val == new_val) {
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  if (new_val) {
    struct Rdb_table_collector : public Rdb_tables_scanner {
      int add_table(Rdb_tbl_def *tdef) override {
        assert(tdef->m_key_count > 0);
        // The ternary re-checks m_key_count so release builds (where the
        // assert compiles out) cannot index an empty key-descriptor array.
        tdef->m_tbl_stats.set(tdef->m_key_count > 0
                                  ? tdef->m_key_descr_arr[0]->m_stats.m_rows
                                  : 0,
                              0, 0);
        return HA_EXIT_SUCCESS;
      }
    } collector;
    ddl_manager.scan_for_tables(&collector);

    // We do not add all tables to the index stats recalculation queue
    // to avoid index stats calculation workload spike.
  } else {
    rdb_is_thread.clear_all_index_stats_requests();
  }

  *static_cast<my_bool *>(var_ptr) = *static_cast<const my_bool *>(save);
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14713
14714 int rocksdb_index_stats_thread_renice(THD *const /* thd */,
14715 struct st_mysql_sys_var *const /* var */,
14716 void *const save,
14717 struct st_mysql_value *const value) {
14718 long long nice_val;
14719 /* value is NULL */
14720 if (value->val_int(value, &nice_val)) {
14721 return HA_EXIT_FAILURE;
14722 }
14723
14724 if (rdb_is_thread.renice(nice_val) != HA_EXIT_SUCCESS) {
14725 return HA_EXIT_FAILURE;
14726 }
14727
14728 *static_cast<int32_t *>(save) = static_cast<int32_t>(nice_val);
14729 return HA_EXIT_SUCCESS;
14730 }
14731
14732 /*
14733 This function allows setting the rate limiter's bytes per second value
14734 but only if the rate limiter is turned on which has to be done at startup.
14735 If the rate is already 0 (turned off) or we are changing it to 0 (trying
14736 to turn it off) this function will push a warning to the client and do
14737 nothing.
14738 This is similar to the code in innodb_doublewrite_update (found in
14739 storage/innobase/handler/ha_innodb.cc).
14740 */
14741 void rocksdb_set_rate_limiter_bytes_per_sec(
14742 my_core::THD *const thd,
14743 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14744 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14745 const uint64_t new_val = *static_cast<const uint64_t *>(save);
14746 if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
14747 /*
14748 If a rate_limiter was not enabled at startup we can't change it nor
14749 can we disable it if one was created at startup
14750 */
14751 push_warning_printf(thd, Sql_condition::SL_WARNING, ER_WRONG_ARGUMENTS,
14752 "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
14753 "be dynamically changed to or from 0. Do a clean "
14754 "shutdown if you want to change it from or to 0.");
14755 } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
14756 /* Apply the new value to the rate limiter and store it locally */
14757 assert(rocksdb_rate_limiter != nullptr);
14758 rocksdb_rate_limiter_bytes_per_sec = new_val;
14759 rocksdb_rate_limiter->SetBytesPerSecond(new_val);
14760 }
14761 }
14762
14763 void rocksdb_set_sst_mgr_rate_bytes_per_sec(
14764 my_core::THD *const thd,
14765 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14766 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14767 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14768
14769 const uint64_t new_val = *static_cast<const uint64_t *>(save);
14770
14771 if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
14772 rocksdb_sst_mgr_rate_bytes_per_sec = new_val;
14773
14774 rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
14775 rocksdb_sst_mgr_rate_bytes_per_sec);
14776 }
14777
14778 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14779 }
14780
14781 void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
14782 void *var_ptr, const void *save) {
14783 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14784 const uint64_t new_val = *static_cast<const uint64_t *>(save);
14785 if (rocksdb_delayed_write_rate != new_val) {
14786 rocksdb_delayed_write_rate = new_val;
14787 rocksdb::Status s =
14788 rdb->SetDBOptions({{"delayed_write_rate", std::to_string(new_val)}});
14789
14790 if (!s.ok()) {
14791 /* NO_LINT_DEBUG */
14792 sql_print_warning(
14793 "MyRocks: failed to update delayed_write_rate. "
14794 "status code = %d, status = %s",
14795 s.code(), s.ToString().c_str());
14796 }
14797 }
14798 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14799 }
14800
14801 void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
14802 void *var_ptr, const void *save) {
14803 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14804 const uint32_t new_val = *static_cast<const uint32_t *>(save);
14805 if (rocksdb_max_latest_deadlocks != new_val) {
14806 rocksdb_max_latest_deadlocks = new_val;
14807 rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
14808 }
14809 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14810 }
14811
14812 void rdb_set_collation_exception_list(const char *const exception_list) {
14813 assert(rdb_collation_exceptions != nullptr);
14814
14815 int flags = MY_REG_EXTENDED | MY_REG_NOSUB;
14816 if (lower_case_table_names) flags |= MY_REG_ICASE;
14817 if (!rdb_collation_exceptions->compile(exception_list, flags,
14818 table_alias_charset)) {
14819 warn_about_bad_patterns(*rdb_collation_exceptions,
14820 "strict_collation_exceptions");
14821 }
14822 }
14823
14824 void rocksdb_set_collation_exception_list(THD *const thd,
14825 struct st_mysql_sys_var *const var,
14826 void *const var_ptr,
14827 const void *const save) {
14828 const char *const val = *static_cast<const char *const *>(save);
14829
14830 rdb_set_collation_exception_list(val == nullptr ? "" : val);
14831
14832 *static_cast<const char **>(var_ptr) = val;
14833 }
14834
// Convert a sysvar value supplied by the client into a boolean.
// Accepts the strings "true"/"on"/"false"/"off" (case-insensitive) and
// integer values. Returns 0 on success (with *return_value set) and 1 on
// unrecognized input.
int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) {
  int new_value_type = value->value_type(value);
  if (new_value_type == MYSQL_VALUE_TYPE_STRING) {
    char buf[16];
    int len = sizeof(buf);
    // val_str may return nullptr; both branches below guard on str.
    const char *str = value->val_str(value, buf, &len);
    if (str && (my_strcasecmp(system_charset_info, "true", str) == 0 ||
                my_strcasecmp(system_charset_info, "on", str) == 0)) {
      *return_value = true;
    } else if (str && (my_strcasecmp(system_charset_info, "false", str) == 0 ||
                       my_strcasecmp(system_charset_info, "off", str) == 0)) {
      *return_value = false;
    } else {
      return 1;
    }
  } else if (new_value_type == MYSQL_VALUE_TYPE_INT) {
    long long intbuf;
    value->val_int(value, &intbuf);
    // NOTE(review): values > 1 are rejected, but negative values fall
    // through and map to false — presumably intentional; confirm.
    if (intbuf > 1) return 1;
    *return_value = intbuf > 0;
  } else {
    return 1;
  }

  return 0;
}
14861
14862 int rocksdb_check_bulk_load(
14863 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
14864 void *save, struct st_mysql_value *value) {
14865 my_bool new_value;
14866 if (mysql_value_to_bool(value, &new_value) != 0) {
14867 return 1;
14868 }
14869
14870 Rdb_transaction *&tx = get_tx_from_thd(thd);
14871 if (tx != nullptr) {
14872 bool is_critical_error;
14873 const int rc = tx->finish_bulk_load(&is_critical_error);
14874 if (rc != 0 && is_critical_error) {
14875 // NO_LINT_DEBUG
14876 sql_print_error(
14877 "RocksDB: Error %d finalizing last SST file while "
14878 "setting bulk loading variable",
14879 rc);
14880 THDVAR(thd, bulk_load) = 0;
14881 return 1;
14882 }
14883 }
14884
14885 *static_cast<bool *>(save) = new_value;
14886 return 0;
14887 }
14888
14889 int rocksdb_check_bulk_load_allow_unsorted(
14890 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
14891 void *save, struct st_mysql_value *value) {
14892 my_bool new_value;
14893 if (mysql_value_to_bool(value, &new_value) != 0) {
14894 return 1;
14895 }
14896
14897 if (THDVAR(thd, bulk_load)) {
14898 sql_print_error(
14899 "RocksDB: Cannot change this setting while bulk load is "
14900 "enabled");
14901
14902 return 1;
14903 }
14904
14905 *static_cast<bool *>(save) = new_value;
14906 return 0;
14907 }
14908
14909 static void rocksdb_set_max_background_jobs(THD *thd,
14910 struct st_mysql_sys_var *const var,
14911 void *const var_ptr,
14912 const void *const save) {
14913 assert(save != nullptr);
14914 assert(rocksdb_db_options != nullptr);
14915 assert(rocksdb_db_options->env != nullptr);
14916
14917 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14918
14919 const int new_val = *static_cast<const int *>(save);
14920
14921 if (rocksdb_db_options->max_background_jobs != new_val) {
14922 rocksdb_db_options->max_background_jobs = new_val;
14923 rocksdb::Status s =
14924 rdb->SetDBOptions({{"max_background_jobs", std::to_string(new_val)}});
14925
14926 if (!s.ok()) {
14927 /* NO_LINT_DEBUG */
14928 sql_print_warning(
14929 "MyRocks: failed to update max_background_jobs. "
14930 "Status code = %d, status = %s.",
14931 s.code(), s.ToString().c_str());
14932 }
14933 }
14934
14935 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14936 }
14937
14938 static void rocksdb_set_max_background_compactions(THD *thd,
14939 struct st_mysql_sys_var *const var,
14940 void *const var_ptr,
14941 const void *const save) {
14942 assert(save != nullptr);
14943 assert(rocksdb_db_options != nullptr);
14944 assert(rocksdb_db_options->env != nullptr);
14945
14946 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14947
14948 const int new_val = *static_cast<const int *>(save);
14949
14950 if (rocksdb_db_options->max_background_compactions != new_val) {
14951 rocksdb_db_options->max_background_compactions = new_val;
14952 rocksdb::Status s =
14953 rdb->SetDBOptions({{"max_background_compactions", std::to_string(new_val)}});
14954
14955 if (!s.ok()) {
14956 /* NO_LINT_DEBUG */
14957 sql_print_warning(
14958 "MyRocks: failed to update max_background_compactions. "
14959 "Status code = %d, status = %s.",
14960 s.code(), s.ToString().c_str());
14961 }
14962 }
14963
14964 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14965 }
14966
14967 /**
14968 rocksdb_set_max_bottom_pri_background_compactions_internal() changes
14969 the number of rocksdb background threads.
14970 Creating new threads may take up to a few seconds, so instead of
14971 calling the function at sys_var::update path where global mutex is held,
14972 doing at sys_var::check path so that other queries are not blocked.
14973 Same optimization is done for rocksdb_block_cache_size too.
14974 */
14975 static int rocksdb_validate_max_bottom_pri_background_compactions(
14976 THD *thd MY_ATTRIBUTE((__unused__)),
14977 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14978 void *var_ptr, struct st_mysql_value *value) {
14979 assert(value != nullptr);
14980
14981 long long new_value;
14982
14983 /* value is NULL */
14984 if (value->val_int(value, &new_value)) {
14985 return HA_EXIT_FAILURE;
14986 }
14987 if (new_value < 0 ||
14988 new_value > ROCKSDB_MAX_BOTTOM_PRI_BACKGROUND_COMPACTIONS) {
14989 return HA_EXIT_FAILURE;
14990 }
14991 RDB_MUTEX_LOCK_CHECK(rdb_bottom_pri_background_compactions_resize_mutex);
14992 if (rocksdb_max_bottom_pri_background_compactions != new_value) {
14993 if (new_value == 0) {
14994 my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET",
14995 "max_bottom_pri_background_compactions can't be changed to 0 "
14996 "online.");
14997 RDB_MUTEX_UNLOCK_CHECK(
14998 rdb_bottom_pri_background_compactions_resize_mutex);
14999 return HA_EXIT_FAILURE;
15000 }
15001 rocksdb_set_max_bottom_pri_background_compactions_internal(new_value);
15002 }
15003 *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
15004 RDB_MUTEX_UNLOCK_CHECK(rdb_bottom_pri_background_compactions_resize_mutex);
15005 return HA_EXIT_SUCCESS;
15006 }
15007
15008 static void rocksdb_set_bytes_per_sync(
15009 THD *thd MY_ATTRIBUTE((__unused__)),
15010 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
15011 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
15012 assert(save != nullptr);
15013 assert(rocksdb_db_options != nullptr);
15014 assert(rocksdb_db_options->env != nullptr);
15015
15016 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
15017
15018 const ulonglong new_val = *static_cast<const ulonglong *>(save);
15019
15020 if (rocksdb_db_options->bytes_per_sync != new_val) {
15021 rocksdb_db_options->bytes_per_sync = new_val;
15022 rocksdb::Status s =
15023 rdb->SetDBOptions({{"bytes_per_sync", std::to_string(new_val)}});
15024
15025 if (!s.ok()) {
15026 /* NO_LINT_DEBUG */
15027 sql_print_warning(
15028 "MyRocks: failed to update max_background_jobs. "
15029 "Status code = %d, status = %s.",
15030 s.code(), s.ToString().c_str());
15031 }
15032 }
15033
15034 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
15035 }
15036
15037 static void rocksdb_set_wal_bytes_per_sync(
15038 THD *thd MY_ATTRIBUTE((__unused__)),
15039 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
15040 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
15041 assert(save != nullptr);
15042 assert(rocksdb_db_options != nullptr);
15043 assert(rocksdb_db_options->env != nullptr);
15044
15045 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
15046
15047 const ulonglong new_val = *static_cast<const ulonglong *>(save);
15048
15049 if (rocksdb_db_options->wal_bytes_per_sync != new_val) {
15050 rocksdb_db_options->wal_bytes_per_sync = new_val;
15051 rocksdb::Status s =
15052 rdb->SetDBOptions({{"wal_bytes_per_sync", std::to_string(new_val)}});
15053
15054 if (!s.ok()) {
15055 /* NO_LINT_DEBUG */
15056 sql_print_warning(
15057 "MyRocks: failed to update max_background_jobs. "
15058 "Status code = %d, status = %s.",
15059 s.code(), s.ToString().c_str());
15060 }
15061 }
15062
15063 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
15064 }
15065
15066 /*
15067 Validating and updating block cache size via sys_var::check path.
15068 SetCapacity may take seconds when reducing block cache, and
15069 sys_var::update holds LOCK_global_system_variables mutex, so
15070 updating block cache size is done at check path instead.
15071 */
15072 static int rocksdb_validate_set_block_cache_size(
15073 THD *thd MY_ATTRIBUTE((__unused__)),
15074 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
15075 void *var_ptr, struct st_mysql_value *value) {
15076 assert(value != nullptr);
15077
15078 long long new_value;
15079
15080 /* value is NULL */
15081 if (value->val_int(value, &new_value)) {
15082 return HA_EXIT_FAILURE;
15083 }
15084
15085 if (new_value < RDB_MIN_BLOCK_CACHE_SIZE ||
15086 (uint64_t)new_value > (uint64_t)LLONG_MAX) {
15087 return HA_EXIT_FAILURE;
15088 }
15089
15090 RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex);
15091 const rocksdb::BlockBasedTableOptions &table_options =
15092 rdb_get_table_options();
15093
15094 if (rocksdb_block_cache_size != new_value && table_options.block_cache) {
15095 table_options.block_cache->SetCapacity(new_value);
15096 }
15097 *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
15098 RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex);
15099 return HA_EXIT_SUCCESS;
15100 }
15101
/*
  sys_var::check handler for rocksdb_update_cf_options.
  Parses the supplied per-column-family option string and verifies that it
  is well-formed and that every referenced column family exists. The actual
  application of the options happens in rocksdb_set_update_cf_options().
*/
static int rocksdb_validate_update_cf_options(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
    struct st_mysql_value *value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  const char *str;
  int length;
  length = sizeof(buff);
  str = value->val_str(value, buff, &length);
  // NOTE(review): val_str() may return a pointer into the stack buffer
  // `buff`, which goes out of scope when this function returns while
  // `*save` still points at it. Confirm the sysvar machinery copies the
  // string before this stack frame is reused.
  *static_cast<const char **>(save) = str;

  // A NULL value is accepted as-is; the update handler treats it as
  // "clear the variable".
  if (str == nullptr) {
    return HA_EXIT_SUCCESS;
  }

  Rdb_cf_options::Name_to_config_t option_map;

  // Basic sanity checking and parsing the options into a map. If this fails
  // then there's no point to proceed.
  if (!Rdb_cf_options::parse_cf_options(str, &option_map)) {
    my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str);
    return HA_EXIT_FAILURE;
  }
  // Loop through option_map and check if all specified CFs exist.
  std::vector<const std::string *> unknown_cfs;
  for (const auto &option : option_map) {
    if (!cf_manager.get_cf(option.first)) {
      unknown_cfs.push_back(&(option.first));
    }
  }

  // Reject the whole assignment if any CF is unknown, listing all of the
  // offending names in one error message.
  if (!unknown_cfs.empty()) {
    std::string err(str);
    err.append(" Unknown CF: ");
    bool first = true;
    for (const auto cf : unknown_cfs) {
      if (first)
        first = false;
      else
        err.append(", ");
      err.append(*cf);
    }
    my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options",
             err.c_str());
    return HA_EXIT_FAILURE;
  }
  return HA_EXIT_SUCCESS;
}
15150
/*
  sys_var::update handler for rocksdb_update_cf_options.
  Applies the (already validated) per-column-family option string to the
  running RocksDB instance, then refreshes the cached CF options so that
  INFORMATION_SCHEMA.ROCKSDB_CF_OPTIONS stays consistent. Failures for
  individual CFs are logged but do not abort processing of the others.
*/
static void rocksdb_set_update_cf_options(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // A NULL value clears the variable; there is nothing to apply.
  if (!val) {
    *reinterpret_cast<char **>(var_ptr) = nullptr;
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  assert(val != nullptr);

  // Reset the pointers regardless of how much success we had with updating
  // the CF options. This will results in consistent behavior and avoids
  // dealing with cases when only a subset of CF-s was successfully updated.
  *static_cast<const char **>(var_ptr) =
      *static_cast<const char *const *>(save);

  // Do the real work of applying the changes.
  Rdb_cf_options::Name_to_config_t option_map;

  // This should never fail, because of rocksdb_validate_update_cf_options
  if (!Rdb_cf_options::parse_cf_options(val, &option_map)) {
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  // For each CF we have, see if we need to update any settings.
  for (const auto &cf_name : cf_manager.get_cf_names()) {
    assert(!cf_name.empty());

    std::shared_ptr<rocksdb::ColumnFamilyHandle> cfh =
        cf_manager.get_cf(cf_name);

    // A CF may have been dropped between validation and here.
    if (!cfh) {
      // NO_LINT_DEBUG
      sql_print_information(
          "Skip updating options for cf %s because the cf has been dropped.",
          cf_name.c_str());
      continue;
    }

    // Options for CFs not mentioned in the variable are left untouched.
    const auto it = option_map.find(cf_name);
    std::string per_cf_options = (it != option_map.end()) ? it->second : "";

    if (!per_cf_options.empty()) {
      Rdb_cf_options::Name_to_config_t opt_map;
      rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map);

      if (s != rocksdb::Status::OK()) {
        // NO_LINT_DEBUG
        sql_print_warning(
            "MyRocks: failed to convert the options for column "
            "family '%s' to a map. %s",
            cf_name.c_str(), s.ToString().c_str());
      } else {
        assert(rdb != nullptr);

        // Finally we can apply the options.
        // If cf_manager.drop_cf() has been called at this point, SetOptions()
        // will still succeed. The options data will only be cleared when
        // the CF handle object is destroyed.
        s = rdb->SetOptions(cfh.get(), opt_map);

        if (s != rocksdb::Status::OK()) {
          // NO_LINT_DEBUG
          sql_print_warning(
              "MyRocks: failed to apply the options for column "
              "family '%s'. %s",
              cf_name.c_str(), s.ToString().c_str());
        } else {
          // NO_LINT_DEBUG
          sql_print_information(
              "MyRocks: options for column family '%s' "
              "have been successfully updated.",
              cf_name.c_str());

          // Make sure that data is internally consistent as well and update
          // the CF options. This is necessary also to make sure that the CF
          // options will be correctly reflected in the relevant table:
          // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA.
          rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh.get());
          std::string updated_options;

          // Round-trip through RocksDB so the cached string reflects what
          // was actually applied, not just what the user typed.
          s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options,
                                                        cf_options);

          assert(s == rocksdb::Status::OK());
          assert(!updated_options.empty());

          cf_manager.update_options_map(cf_name, updated_options);
        }
      }
    }
  }

  // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to
  // free up resources used before.

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
15256
15257 void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
15258
15259 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
// Marks the start of applying a replicated delete-rows event; the flag is
// cleared again in rpl_after_delete_rows().
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
15267
// Marks the end of applying a replicated delete-rows event (counterpart of
// rpl_before_delete_rows()).
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
15275
// Marks the start of applying a replicated update-rows event; the flag is
// cleared again in rpl_after_update_rows().
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
15283
// Marks the end of applying a replicated update-rows event (counterpart of
// rpl_before_update_rows()).
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
15291
15292 bool ha_rocksdb::rpl_lookup_rows() { return !use_read_free_rpl(); }
15293
// Whether this table is eligible for read-free replication. In this build
// the per-table whitelist (rocksdb_read_free_rpl_tables) is compiled out,
// so every table is considered eligible.
bool ha_rocksdb::is_read_free_rpl_table() const {
#if 1 // Percona Server disabled rocksdb_read_free_rpl_tables as it's dangerous to use
  return true;
#else
  return table->s && m_tbl_def->m_is_read_free_rpl_table;
#endif
}
15301
15302 /**
15303 @brief
15304 Read Free Replication can be used or not. Returning true means
15305 Read Free Replication can be used.
15306 */
15307 bool ha_rocksdb::use_read_free_rpl() const {
15308 DBUG_ENTER_FUNC();
15309
15310 if (!ha_thd()->rli_slave || table->triggers || !is_read_free_rpl_table()) {
15311 DBUG_RETURN(false);
15312 }
15313
15314 switch (rocksdb_read_free_rpl) {
15315 case read_free_rpl_type::OFF:
15316 DBUG_RETURN(false);
15317 case read_free_rpl_type::PK_ONLY:
15318 DBUG_RETURN(!has_hidden_pk(table) && table->s->keys == 1);
15319 case read_free_rpl_type::PK_SK:
15320 DBUG_RETURN(!has_hidden_pk(table));
15321 }
15322
15323 assert(false);
15324 DBUG_RETURN(false);
15325 }
15326 #endif // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
15327
15328 double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
15329 DBUG_ENTER_FUNC();
15330
15331 if (index != table->s->primary_key) {
15332 /* Non covering index range scan */
15333 DBUG_RETURN(handler::read_time(index, ranges, rows));
15334 }
15335
15336 DBUG_RETURN((rows / 20.0) + 1);
15337 }
15338
15339 void ha_rocksdb::print_error(int error, myf errflag) {
15340 if (error == HA_ERR_ROCKSDB_STATUS_BUSY) {
15341 error = HA_ERR_LOCK_DEADLOCK;
15342 }
15343 handler::print_error(error, errflag);
15344 }
15345
15346 std::string rdb_corruption_marker_file_name() {
15347 std::string ret(rocksdb_datadir);
15348 ret.append("/ROCKSDB_CORRUPTED");
15349 return ret;
15350 }
15351
15352 } // namespace myrocks
15353
15354 /*
15355 Register the storage engine plugin outside of myrocks namespace
15356 so that mysql_declare_plugin does not get confused when it does
15357 its name generation.
15358 */
15359
/* Storage engine descriptor handed to the plugin declaration below. */
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};
15362
mysql_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN, /* Plugin Type */
    &rocksdb_storage_engine, /* Plugin Descriptor */
    "ROCKSDB", /* Plugin Name */
    "Monty Program Ab", /* Plugin Author */
    "RocksDB storage engine", /* Plugin Description */
    PLUGIN_LICENSE_GPL, /* Plugin Licence */
    myrocks::rocksdb_init_func, /* Plugin Entry Point */
    myrocks::rocksdb_done_func, /* Plugin Deinitializer */
    0x0001, /* version number (0.1) */
    myrocks::rocksdb_status_vars, /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    nullptr, /* config options */
    0, /* flags */
    },
    /* INFORMATION_SCHEMA table plugins bundled with the engine */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map,
    myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info mysql_declare_plugin_end;
15385