1 /*
2 Copyright (c) 2012, Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
16
17 #ifdef USE_PRAGMA_IMPLEMENTATION
18 #pragma implementation // gcc: Class implementation
19 #endif
20
21 #define MYSQL_SERVER 1
22
23 /* For use of 'PRIu64': */
24 #define __STDC_FORMAT_MACROS
25
26 #include <my_global.h>
27
28 #include <inttypes.h>
29
30 /* The C++ file's header */
31 #include "./ha_rocksdb.h"
32
33 /* C++ standard header files */
34 #include <inttypes.h>
35 #include <algorithm>
36 #include <limits>
37 #include <map>
38 #include <queue>
39 #include <set>
40 #include <string>
41 #include <vector>
42
43 /* MySQL includes */
44 #include <mysql/psi/mysql_table.h>
45 // #include <thread_pool_priv.h>
46 #include <mysys_err.h>
47 #include "./debug_sync.h"
48 #include "./my_bit.h"
49 #include "./my_stacktrace.h"
50 #include "./my_sys.h"
51 #include "./sql_audit.h"
52 #include "./sql_table.h"
53 #include "./sql_hset.h"
54 #ifdef MARIAROCKS_NOT_YET
55 #endif
56
57 /* RocksDB includes */
58 #include "monitoring/histogram.h"
59 #include "rocksdb/compaction_filter.h"
60 #include "rocksdb/env.h"
61 #include "rocksdb/memory_allocator.h"
62 #include "rocksdb/persistent_cache.h"
63 #include "rocksdb/rate_limiter.h"
64 #include "rocksdb/slice_transform.h"
65 #include "rocksdb/thread_status.h"
66 #include "rocksdb/utilities/checkpoint.h"
67 #include "rocksdb/utilities/convenience.h"
68 #include "rocksdb/utilities/memory_util.h"
69 #include "rocksdb/utilities/sim_cache.h"
70 #include "rocksdb/utilities/write_batch_with_index.h"
71 #include "util/stop_watch.h"
72 #include "./rdb_source_revision.h"
73
74 // MariaRocks: this is needed to access RocksDB debug syncpoints:
75 #include "test_util/sync_point.h"
76
77 /* MyRocks includes */
78 #include "./event_listener.h"
79 #include "./ha_rocksdb_proto.h"
80 #include "./logger.h"
81 #include "./nosql_access.h"
82 #include "./rdb_cf_manager.h"
83 #include "./rdb_cf_options.h"
84 #include "./rdb_converter.h"
85 #include "./rdb_datadic.h"
86 #include "./rdb_i_s.h"
87 #include "./rdb_index_merge.h"
88 #include "./rdb_mutex_wrapper.h"
89 #include "./rdb_psi.h"
90 #include "./rdb_threads.h"
91 #include "./rdb_mariadb_server_port.h"
92
93 // Internal MySQL APIs not exposed in any header.
94 extern "C" {
95 /**
96 Mark transaction to rollback and mark error as fatal to a sub-statement.
97 @param thd Thread handle
98 @param all TRUE <=> rollback main transaction.
99 */
100 void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
101
102 /**
103 * Get the user thread's binary logging format
104 * @param thd user thread
105 * @return Value to be used as index into the binlog_format_names array
106 */
107 int thd_binlog_format(const MYSQL_THD thd);
108
109 /**
110 * Check if binary logging is filtered for thread's current db.
111 * @param thd Thread handle
112 * @retval 1 the query is not filtered, 0 otherwise.
113 */
114 bool thd_binlog_filter_ok(const MYSQL_THD thd);
115 }
116
117 extern my_bool opt_core_file;
118
119 // Needed in rocksdb_init_func
120 void ignore_db_dirs_append(const char *dirname_arg);
121
122
123 namespace myrocks {
124
125 static st_global_stats global_stats;
126 static st_export_stats export_stats;
127 static st_memory_stats memory_stats;
128 static st_io_stall_stats io_stall_stats;
129
130 const std::string DEFAULT_CF_NAME("default");
131 const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
132 const std::string PER_INDEX_CF_NAME("$per_index_cf");
133
134 static std::vector<GL_INDEX_ID> rdb_indexes_to_recalc;
135
#ifdef MARIADB_NOT_YET
/*
  A process-wide, shareable snapshot: wraps a rocksdb::ManagedSnapshot and
  registers itself by numeric id in a global map so other connections can
  attach to the same snapshot. Every access to the registry is serialized
  by explicit_snapshot_mutex.
*/
class Rdb_explicit_snapshot : public explicit_snapshot {
 public:
  /*
    Wrap the given RocksDB snapshot, assign it a fresh snapshot id and
    publish it in the global registry.
    @param ss_info  [in/out] snapshot metadata; snapshot_id is filled in here
    @param db       the RocksDB instance owning the snapshot
    @param snapshot the raw snapshot to manage
    @return the new snapshot object, or nullptr on allocation failure
  */
  static std::shared_ptr<Rdb_explicit_snapshot> create(
      snapshot_info_st *ss_info, rocksdb::DB *db,
      const rocksdb::Snapshot *snapshot) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto s = std::unique_ptr<rocksdb::ManagedSnapshot>(
        new rocksdb::ManagedSnapshot(db, snapshot));
    if (!s) {
      return nullptr;
    }
    ss_info->snapshot_id = ++explicit_snapshot_counter;
    auto ret = std::make_shared<Rdb_explicit_snapshot>(*ss_info, std::move(s));
    if (!ret) {
      return nullptr;
    }
    explicit_snapshots[ss_info->snapshot_id] = ret;
    return ret;
  }

  /* Return a human-readable dump of every live explicit snapshot. */
  static std::string dump_snapshots() {
    std::string str;
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    for (const auto &elem : explicit_snapshots) {
      const auto &ss = elem.second.lock();
      // Entries are erased by the destructor while holding the mutex, so a
      // registered weak_ptr must still be lockable here.
      DBUG_ASSERT(ss != nullptr);
      const auto &info = ss->ss_info;
      str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) +
             "\nBinlog File: " + info.binlog_file +
             "\nBinlog Pos: " + std::to_string(info.binlog_pos) +
             "\nGtid Executed: " + info.gtid_executed + "\n";
    }

    return str;
  }

  /* Look up a registered snapshot by id; nullptr if it no longer exists. */
  static std::shared_ptr<Rdb_explicit_snapshot> get(
      const ulonglong snapshot_id) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto elem = explicit_snapshots.find(snapshot_id);
    if (elem == explicit_snapshots.end()) {
      return nullptr;
    }
    return elem->second.lock();
  }

  /* Non-owning accessor to the underlying managed snapshot. */
  rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); }

  Rdb_explicit_snapshot(snapshot_info_st ss_info,
                        std::unique_ptr<rocksdb::ManagedSnapshot> &&snapshot)
      : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {}

  /* Unregister this snapshot from the global registry. */
  virtual ~Rdb_explicit_snapshot() {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    explicit_snapshots.erase(ss_info.snapshot_id);
  }

 private:
  std::unique_ptr<rocksdb::ManagedSnapshot> snapshot;

  /* Registry of all live explicit snapshots, keyed by snapshot id. */
  static std::mutex explicit_snapshot_mutex;
  static ulonglong explicit_snapshot_counter;
  static std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
      explicit_snapshots;
};

std::mutex Rdb_explicit_snapshot::explicit_snapshot_mutex;
ulonglong Rdb_explicit_snapshot::explicit_snapshot_counter = 0;
std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
    Rdb_explicit_snapshot::explicit_snapshots;
#endif
208
209 /**
210 Updates row counters based on the table type and operation type.
211 */
/**
  Updates row counters based on the table type and operation type.

  @param type  the row operation being counted (must be < ROWS_MAX)

  Rows of MySQL system tables are accounted in a separate counter array
  (global_stats.system_rows) from rows of user tables (global_stats.rows).
*/
void ha_rocksdb::update_row_stats(const operation_type &type) {
  DBUG_ASSERT(type < ROWS_MAX);
  // Find if we are modifying system databases.
  if (table->s && m_tbl_def->m_is_mysql_system_table) {
    global_stats.system_rows[type].inc();
  } else {
    global_stats.rows[type].inc();
  }
}
221
222 void dbug_dump_database(rocksdb::DB *db);
223 static handler *rocksdb_create_handler(my_core::handlerton *hton,
224 my_core::TABLE_SHARE *table_arg,
225 my_core::MEM_ROOT *mem_root);
226
getCompactRangeOptions(int concurrency=0)227 static rocksdb::CompactRangeOptions getCompactRangeOptions(
228 int concurrency = 0) {
229 rocksdb::CompactRangeOptions compact_range_options;
230 compact_range_options.bottommost_level_compaction =
231 rocksdb::BottommostLevelCompaction::kForce;
232 compact_range_options.exclusive_manual_compaction = false;
233 if (concurrency > 0) {
234 compact_range_options.max_subcompactions = concurrency;
235 }
236 return compact_range_options;
237 }
238
239 ///////////////////////////////////////////////////////////
240 // Parameters and settings
241 ///////////////////////////////////////////////////////////
242 static char *rocksdb_default_cf_options = nullptr;
243 static char *rocksdb_override_cf_options = nullptr;
244 static char *rocksdb_update_cf_options = nullptr;
245
246 ///////////////////////////////////////////////////////////
247 // Globals
248 ///////////////////////////////////////////////////////////
249 handlerton *rocksdb_hton;
250
251 rocksdb::TransactionDB *rdb = nullptr;
252 rocksdb::HistogramImpl *commit_latency_stats = nullptr;
253
254 static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
255 static std::unique_ptr<rocksdb::Env> flashcache_aware_env;
256 static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;
257
258 Rdb_dict_manager dict_manager;
259 Rdb_cf_manager cf_manager;
260 Rdb_ddl_manager ddl_manager;
261 Rdb_binlog_manager binlog_manager;
262
263 #if !defined(_WIN32) && !defined(__APPLE__)
264 Rdb_io_watchdog *io_watchdog = nullptr;
265 #endif
266 /**
267 MyRocks background thread control
268 N.B. This is besides RocksDB's own background threads
269 (@see rocksdb::CancelAllBackgroundWork())
270 */
271
272 static Rdb_background_thread rdb_bg_thread;
273
274 static Rdb_manual_compaction_thread rdb_mc_thread;
275
276 // List of table names (using regex) that are exceptions to the strict
277 // collation check requirement.
278 Regex_list_handler *rdb_collation_exceptions;
279
280 static const char **rdb_get_error_messages(int nr);
281
rocksdb_flush_all_memtables()282 static void rocksdb_flush_all_memtables() {
283 const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
284 for (const auto &cf_handle : cf_manager.get_all_cf()) {
285 rdb->Flush(rocksdb::FlushOptions(), cf_handle);
286 }
287 }
288
/*
  No-op sysvar update handler paired with rocksdb_delete_column_family(),
  which performs (or, currently, refuses) the actual drop during the
  check/validate phase.
*/
static void rocksdb_delete_column_family_stub(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, const void *const /* save */) {}
292
/*
  Sysvar check handler for the "delete column family" variable: would drop
  the named column family via the CF manager.
  NOTE: currently disabled -- it unconditionally returns failure (see the
  early return below); the remainder of the function is intentionally
  unreachable until the create/delete race is resolved.

  @return HA_EXIT_SUCCESS / HA_EXIT_FAILURE (currently always failure)
*/
static int rocksdb_delete_column_family(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, struct st_mysql_value *const value) {
  // Return failure for now until the race condition between creating
  // CF and deleting CF is resolved
  return HA_EXIT_FAILURE;

  // --- intentionally unreachable from here on (see comment above) ---
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  // val_str() yields nullptr when no usable string value was supplied.
  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto &cf_manager = rdb_get_cf_manager();
    auto ret = cf_manager.drop_cf(cf);
    if (ret == HA_EXIT_SUCCESS) {
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Dropped column family: %s\n", cf);
    } else {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Failed to drop column family: %s, error: %d\n",
                      cf, ret);
    }

    return ret;
  }

  return HA_EXIT_SUCCESS;
}
322
323 ///////////////////////////////////////////////////////////
324 // Hash map: table name => open table handler
325 ///////////////////////////////////////////////////////////
326
327 namespace // anonymous namespace = not visible outside this source file
328 {
329
330 typedef Hash_set<Rdb_table_handler> Rdb_table_set;
331
/*
  Tracks the Rdb_table_handler of every open table, keyed by normalized
  table name. The map itself is guarded by m_mutex; the accessors that
  take the lock are implemented elsewhere in this file.
*/
class Rdb_open_tables_map {
 private:
  /* Hash table used to track the handlers of open tables */
  std::unordered_map<std::string, Rdb_table_handler *> m_table_map;

  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

 public:
  /* Reset the map and initialize its mutex. */
  void init() {
    m_table_map.clear();
    mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &m_mutex, MY_MUTEX_INIT_FAST);
  }

  /* Clear the map and destroy its mutex. */
  void free() {
    m_table_map.clear();
    mysql_mutex_destroy(&m_mutex);
  }
  /* Number of tracked open tables. NOTE(review): reads the map without
     taking m_mutex -- assumed to be tolerable for monitoring use. */
  size_t count() { return m_table_map.size(); }

  /* Look up / release the handler for a table (defined out of line). */
  Rdb_table_handler *get_table_handler(const char *const table_name);
  void release_table_handler(Rdb_table_handler *const table_handler);

  /* Snapshot of the names of all currently tracked tables. */
  std::vector<std::string> get_table_names(void) const;
};
357
358 } // anonymous namespace
359
360 static Rdb_open_tables_map rdb_open_tables;
361
/*
  Canonicalize a directory path by stripping any trailing '/' characters.

  @param dir  directory path (taken by value; modified in place)
  @return the path without trailing slashes; may be empty if the input
          was empty or consisted solely of slashes
*/
static std::string rdb_normalize_dir(std::string dir) {
  const std::string::size_type last = dir.find_last_not_of('/');
  if (last == std::string::npos) {
    // Nothing but slashes (or empty input) -- normalize to "".
    dir.clear();
  } else {
    dir.erase(last + 1);
  }
  return dir;
}
368
/*
  Sysvar check handler for @@rocksdb_create_checkpoint: creates a RocksDB
  checkpoint (see rocksdb::Checkpoint) in the directory supplied as the
  variable's value.

  @return HA_EXIT_SUCCESS if the checkpoint was created;
          HA_EXIT_FAILURE otherwise (including when no value was given or
          the RocksDB instance is not up)
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      // Strip trailing slashes so RocksDB gets a canonical directory name.
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propagate up to the client.
      int rc __attribute__((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        delete checkpoint;
        if (status.ok()) {
          // NO_LINT_DEBUG
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
408
/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only */
// No-op: the actual work is done by rocksdb_create_checkpoint() in the
// check phase above.
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
415
/* No-op update handler: the flush is performed by
   rocksdb_force_flush_memtable_now() during the check phase. */
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
419
/*
  Sysvar check handler for @@rocksdb_force_flush_memtable_now: flushes all
  memtables of all column families to SST files immediately.

  @return always HA_EXIT_SUCCESS
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
428
/* No-op update handler: the flush/compaction is performed by
   rocksdb_force_flush_memtable_and_lzero_now() during the check phase. */
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
432
/*
  Sysvar check handler for @@rocksdb_force_flush_memtable_and_lzero_now:
  flushes all memtables, then compacts every L0 file of every column
  family into level 1 so that L0 ends up empty.

  @return HA_EXIT_SUCCESS if L0 was cleared for all column families,
          HA_EXIT_FAILURE otherwise
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  int i, max_attempts = 3, num_errors = 0;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    for (i = 0; i < max_attempts; i++) {
      // Re-read the metadata on each attempt: concurrent background
      // compactions may have changed the set of L0 files.
      rdb->GetColumnFamilyMetaData(cf_handle, &metadata);
      cf_handle->GetDescriptor(&cf_descr);
      c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

      DBUG_ASSERT(metadata.levels[0].level == 0);
      std::vector<std::string> file_names;
      for (auto &file : metadata.levels[0].files) {
        file_names.emplace_back(file.db_path + file.name);
      }

      // L0 already empty -- nothing left to do for this column family.
      if (file_names.empty()) {
        break;
      }

      rocksdb::Status s;
      // Compact the collected L0 files down into level 1.
      s = rdb->CompactFiles(c_options, cf_handle, file_names, 1);

      // Due to a race, it's possible for CompactFiles to collide
      // with auto compaction, causing an error to return
      // regarding file not found. In that case, retry.
      if (s.IsInvalidArgument()) {
        continue;
      }

      if (!s.ok() && !s.IsAborted()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
        return HA_EXIT_FAILURE;
      }
      break;
    }
    // All retries were consumed without success for this column family.
    if (i == max_attempts) {
      num_errors++;
    }
  }

  return num_errors == 0 ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
}
486
487 static void rocksdb_drop_index_wakeup_thread(
488 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
489 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
490 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save);
491
492 static my_bool rocksdb_pause_background_work = 0;
493 static mysql_mutex_t rdb_sysvars_mutex;
494 static mysql_mutex_t rdb_block_cache_resize_mutex;
495
/*
  Sysvar update handler for @@rocksdb_pause_background_work: pauses or
  resumes RocksDB background work. Acts only on an actual state change so
  that Pause/ContinueBackgroundWork calls stay balanced; serialized by
  rdb_sysvars_mutex.
*/
static void rocksdb_set_pause_background_work(
    my_core::THD *const,
    struct st_mysql_sys_var *const,
    void *const, const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const my_bool pause_requested = *static_cast<const my_bool *>(save);
  if (rocksdb_pause_background_work != pause_requested) {
    if (pause_requested) {
      rdb->PauseBackgroundWork();
    } else {
      rdb->ContinueBackgroundWork();
    }
    rocksdb_pause_background_work = pause_requested;
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
512
513 static void rocksdb_set_compaction_options(THD *thd,
514 struct st_mysql_sys_var *var,
515 void *var_ptr, const void *save);
516
517 static void rocksdb_set_table_stats_sampling_pct(THD *thd,
518 struct st_mysql_sys_var *var,
519 void *var_ptr,
520 const void *save);
521
522 static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
523 struct st_mysql_sys_var *var,
524 void *var_ptr,
525 const void *save);
526
527 static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd,
528 struct st_mysql_sys_var *var,
529 void *var_ptr,
530 const void *save);
531
532 static void rocksdb_set_delayed_write_rate(THD *thd,
533 struct st_mysql_sys_var *var,
534 void *var_ptr, const void *save);
535
536 static void rocksdb_set_max_latest_deadlocks(THD *thd,
537 struct st_mysql_sys_var *var,
538 void *var_ptr, const void *save);
539
540 static void rdb_set_collation_exception_list(const char *exception_list);
541 static void rocksdb_set_collation_exception_list(THD *thd,
542 struct st_mysql_sys_var *var,
543 void *var_ptr,
544 const void *save);
545
546 static int rocksdb_validate_update_cf_options(THD *thd,
547 struct st_mysql_sys_var *var,
548 void *save,
549 st_mysql_value *value);
550
551 static void rocksdb_set_update_cf_options(THD *thd,
552 struct st_mysql_sys_var *var,
553 void *var_ptr, const void *save);
554
555 static int rocksdb_check_bulk_load(
556 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
557 void *save, struct st_mysql_value *value);
558
559 static int rocksdb_check_bulk_load_allow_unsorted(
560 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
561 void *save, struct st_mysql_value *value);
562
563 static void rocksdb_set_max_background_jobs(THD *thd,
564 struct st_mysql_sys_var *const var,
565 void *const var_ptr,
566 const void *const save);
567 static void rocksdb_set_bytes_per_sync(THD *thd,
568 struct st_mysql_sys_var *const var,
569 void *const var_ptr,
570 const void *const save);
571 static void rocksdb_set_wal_bytes_per_sync(THD *thd,
572 struct st_mysql_sys_var *const var,
573 void *const var_ptr,
574 const void *const save);
575 static int rocksdb_validate_set_block_cache_size(
576 THD *thd, struct st_mysql_sys_var *const var, void *var_ptr,
577 struct st_mysql_value *value);
578 //////////////////////////////////////////////////////////////////////////////
579 // Options definitions
580 //////////////////////////////////////////////////////////////////////////////
581 static long long rocksdb_block_cache_size;
582 static long long rocksdb_sim_cache_size;
583 static my_bool rocksdb_use_clock_cache;
584 static double rocksdb_cache_high_pri_pool_ratio;
585 static my_bool rocksdb_cache_dump;
586 /* Use unsigned long long instead of uint64_t because of MySQL compatibility */
587 static unsigned long long // NOLINT(runtime/int)
588 rocksdb_rate_limiter_bytes_per_sec;
589 static unsigned long long // NOLINT(runtime/int)
590 rocksdb_sst_mgr_rate_bytes_per_sec;
591 static unsigned long long rocksdb_delayed_write_rate;
592 static uint32_t rocksdb_max_latest_deadlocks;
593 static unsigned long // NOLINT(runtime/int)
594 rocksdb_persistent_cache_size_mb;
595 static ulong rocksdb_info_log_level;
596 static char *rocksdb_wal_dir;
597 static char *rocksdb_persistent_cache_path;
598 static ulong rocksdb_index_type;
599 static uint32_t rocksdb_flush_log_at_trx_commit;
600 static uint32_t rocksdb_debug_optimizer_n_rows;
601 static my_bool rocksdb_force_compute_memtable_stats;
602 static uint32_t rocksdb_force_compute_memtable_stats_cachetime;
603 static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
604 static uint32_t rocksdb_wal_recovery_mode;
605 static uint32_t rocksdb_stats_level;
606 static uint32_t rocksdb_access_hint_on_compaction_start;
607 static char *rocksdb_compact_cf_name;
608 static char *rocksdb_delete_cf_name;
609 static char *rocksdb_checkpoint_name;
610 static my_bool rocksdb_signal_drop_index_thread;
611 static my_bool rocksdb_signal_remove_mariabackup_checkpoint;
612 static my_bool rocksdb_strict_collation_check = 1;
613 static my_bool rocksdb_ignore_unknown_options = 1;
614 static my_bool rocksdb_enable_2pc = 0;
615 static char *rocksdb_strict_collation_exceptions;
616 static my_bool rocksdb_collect_sst_properties = 1;
617 static my_bool rocksdb_force_flush_memtable_now_var = 0;
618 static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = 0;
619 static my_bool rocksdb_enable_ttl = 1;
620 static my_bool rocksdb_enable_ttl_read_filtering = 1;
621 static int rocksdb_debug_ttl_rec_ts = 0;
622 static int rocksdb_debug_ttl_snapshot_ts = 0;
623 static int rocksdb_debug_ttl_read_filter_ts = 0;
624 static my_bool rocksdb_debug_ttl_ignore_pk = 0;
625 static my_bool rocksdb_reset_stats = 0;
626 static uint32_t rocksdb_io_write_timeout_secs = 0;
627 static uint32_t rocksdb_seconds_between_stat_computes = 3600;
628 static long long rocksdb_compaction_sequential_deletes = 0l;
629 static long long rocksdb_compaction_sequential_deletes_window = 0l;
630 static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
631 static uint32_t rocksdb_validate_tables = 1;
632 static char *rocksdb_datadir;
633 static uint32_t rocksdb_table_stats_sampling_pct;
634 static my_bool rocksdb_enable_bulk_load_api = 1;
635 static my_bool rocksdb_print_snapshot_conflict_queries = 0;
636 static my_bool rocksdb_large_prefix = 0;
637 static my_bool rocksdb_allow_to_start_after_corruption = 0;
638 static char* rocksdb_git_hash;
639
640 uint32_t rocksdb_ignore_datadic_errors = 0;
641
642 char *compression_types_val=
643 const_cast<char*>(get_rocksdb_supported_compression_types());
644 static unsigned long rocksdb_write_policy =
645 rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;
646
647 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
648 char *rocksdb_read_free_rpl_tables;
649 std::mutex rocksdb_read_free_rpl_tables_mutex;
650 #if defined(HAVE_PSI_INTERFACE)
651 Regex_list_handler rdb_read_free_regex_handler(key_rwlock_read_free_rpl_tables);
652 #else
653 Regex_list_handler rdb_read_free_regex_handler;
654 #endif
655 enum read_free_rpl_type { OFF = 0, PK_ONLY, PK_SK };
656 static unsigned long rocksdb_read_free_rpl = read_free_rpl_type::OFF;
657 #endif
658
659 static my_bool rocksdb_error_on_suboptimal_collation = 1;
660 static uint32_t rocksdb_stats_recalc_rate = 0;
661 static uint32_t rocksdb_debug_manual_compaction_delay = 0;
662 static uint32_t rocksdb_max_manual_compactions = 0;
663 static my_bool rocksdb_rollback_on_timeout = FALSE;
664 static my_bool rocksdb_enable_insert_with_update_caching = TRUE;
665
666 std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
667 std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
668 std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
669 std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
670 std::atomic<uint64_t> rocksdb_manual_compactions_processed(0);
671 std::atomic<uint64_t> rocksdb_manual_compactions_running(0);
672 #ifndef DBUG_OFF
673 std::atomic<uint64_t> rocksdb_num_get_for_update_calls(0);
674 #endif
675
676
677
/*
  Remove a directory together with the files inside it.
  Used to remove the checkpoint directory created by mariabackup.
*/
682 #ifdef _WIN32
683 #include <direct.h> /* unlink*/
684 #ifndef F_OK
685 #define F_OK 0
686 #endif
687 #endif
688
rmdir_force(const char * dir)689 static int rmdir_force(const char *dir) {
690 if (access(dir, F_OK))
691 return true;
692
693 char path[FN_REFLEN];
694 char sep[] = {FN_LIBCHAR, 0};
695 int err = 0;
696
697 MY_DIR *dir_info = my_dir(dir, MYF(MY_DONT_SORT | MY_WANT_STAT));
698 if (!dir_info)
699 return 1;
700
701 for (uint i = 0; i < dir_info->number_of_files; i++) {
702 FILEINFO *file = dir_info->dir_entry + i;
703
704 strxnmov(path, sizeof(path), dir, sep, file->name, NULL);
705
706 err = my_delete(path, 0);
707
708 if (err) {
709 break;
710 }
711 }
712
713 my_dirend(dir_info);
714
715 if (!err)
716 err = rmdir(dir);
717
718 return (err == 0) ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
719 }
720
721
rocksdb_remove_mariabackup_checkpoint(my_core::THD * const,struct st_mysql_sys_var * const,void * const var_ptr,const void * const)722 static void rocksdb_remove_mariabackup_checkpoint(
723 my_core::THD *const,
724 struct st_mysql_sys_var *const ,
725 void *const var_ptr, const void *const) {
726 std::string mariabackup_checkpoint_dir(rocksdb_datadir);
727
728 mariabackup_checkpoint_dir.append("/mariabackup-checkpoint");
729
730 if (unlink(mariabackup_checkpoint_dir.c_str()) == 0)
731 return;
732
733 rmdir_force(mariabackup_checkpoint_dir.c_str());
734 }
735
736
rdb_init_rocksdb_db_options(void)737 static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
738 auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());
739
740 o->create_if_missing = true;
741 o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
742 o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
743 o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
744 o->max_open_files = -2; // auto-tune to 50% open_files_limit
745
746 o->two_write_queues = true;
747 o->manual_wal_flush = true;
748 return o;
749 }
750
751 /* DBOptions contains Statistics and needs to be destructed last */
752 static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options =
753 std::unique_ptr<rocksdb::BlockBasedTableOptions>(
754 new rocksdb::BlockBasedTableOptions());
755 static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options =
756 rdb_init_rocksdb_db_options();
757
758 static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;
759
760 /* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */
761 static const char *write_policy_names[] = {"write_committed", "write_prepared",
762 "write_unprepared", NullS};
763
764 static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1,
765 "write_policy_typelib",
766 write_policy_names, nullptr};
767
768 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
769 /* This array needs to be kept up to date with myrocks::read_free_rpl_type */
770 static const char *read_free_rpl_names[] = {"OFF", "PK_ONLY", "PK_SK", NullS};
771
772 static TYPELIB read_free_rpl_typelib = {array_elements(read_free_rpl_names) - 1,
773 "read_free_rpl_typelib",
774 read_free_rpl_names, nullptr};
775 #endif
776
777 /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
778 static const char *info_log_level_names[] = {"debug_level", "info_level",
779 "warn_level", "error_level",
780 "fatal_level", NullS};
781
782 static TYPELIB info_log_level_typelib = {
783 array_elements(info_log_level_names) - 1, "info_log_level_typelib",
784 info_log_level_names, nullptr};
785
/*
  Sysvar update handler for @@rocksdb_info_log_level: stores the new value
  and propagates it to the live RocksDB info log, under rdb_sysvars_mutex.
*/
static void rocksdb_set_rocksdb_info_log_level(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {
  DBUG_ASSERT(save != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
  rocksdb_db_options->info_log->SetInfoLogLevel(
      static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
797
/*
  Sysvar update handler for @@rocksdb_stats_level: pushes the new level
  into the rocksdb::Statistics object, then reads the effective level back
  into the MySQL-visible variable.
*/
static void rocksdb_set_rocksdb_stats_level(THD *const thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_db_options->statistics->set_stats_level(
      static_cast<rocksdb::StatsLevel>(
          *static_cast<const uint64_t *>(save)));
  // Actual stats level is defined at rocksdb dbopt::statistics::stats_level_
  // so adjusting rocksdb_stats_level here to make sure it points to
  // the correct stats level.
  rocksdb_stats_level = rocksdb_db_options->statistics->get_stats_level();
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
814
/*
  Sysvar update handler for @@rocksdb_reset_stats: when the variable is
  set to true, resets both the DB-internal stats and the shared
  rocksdb::Statistics object.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
  DBUG_ASSERT(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // Publish the new value first; the code below reads the global
  // rocksdb_reset_stats. NOTE(review): assumes var_ptr is bound to
  // rocksdb_reset_stats -- confirm against the MYSQL_SYSVAR definition.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    DBUG_ASSERT(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    DBUG_ASSERT(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
840
/*
  Sysvar update handler for @@rocksdb_io_write_timeout: stores the new
  timeout and re-arms the I/O watchdog (the watchdog is not built on
  Windows/macOS, where only the global is updated).
*/
static void rocksdb_set_io_write_timeout(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
#if !defined(_WIN32) && !defined(__APPLE__)
  DBUG_ASSERT(io_watchdog != nullptr);
#endif

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  rocksdb_io_write_timeout_secs = new_val;
#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
861
/* Legal values for the rocksdb_flush_log_at_trx_commit sysvar.
   FLUSH_LOG_MAX is a sentinel used for range checking only. */
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,  // the only value allowed when allow_mmap_writes is on
  FLUSH_LOG_SYNC,
  FLUSH_LOG_BACKGROUND,
  FLUSH_LOG_MAX /* must be last */
};
868
rocksdb_validate_flush_log_at_trx_commit(THD * const thd,struct st_mysql_sys_var * const var,void * var_ptr,struct st_mysql_value * const value)869 static int rocksdb_validate_flush_log_at_trx_commit(
870 THD *const thd,
871 struct st_mysql_sys_var *const var, /* in: pointer to system variable */
872 void *var_ptr, /* out: immediate result for update function */
873 struct st_mysql_value *const value /* in: incoming value */) {
874 long long new_value;
875
876 /* value is NULL */
877 if (value->val_int(value, &new_value)) {
878 return HA_EXIT_FAILURE;
879 }
880
881 if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) {
882 return HA_EXIT_FAILURE;
883 }
884
885 *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value);
886 return HA_EXIT_SUCCESS;
887 }
/* Intentionally-empty update callback: the paired check function
   rocksdb_compact_column_family (declared below, defined later in this
   file) handles the variable, so there is nothing left to do here. */
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}

static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value);
896
/* Legal values for the rocksdb_index_type sysvar
   (BlockBasedTableOptions::index_type). */
static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};

static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
                                     "index_type_typelib", index_type_names,
                                     nullptr};

/* Defaults and limits used by the sysvar declarations below. */
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_MAX_ROW_LOCKS = 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;  // 64MB
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;                   // 100B
// 1GB default / 100B minimum for the combine (read-back) phase:
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024;
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
// Delay values are in milliseconds; 0 disables the delay.
const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;  // 512MB
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;
const ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
918
/* ---- Per-session (THDVAR) and global (SYSVAR) tuning knobs ---- */

// TODO: 0 means don't wait at all, and we don't support it yet?
static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
                          "Number of seconds to wait for lock", nullptr,
                          nullptr, /*default*/ 1, /*min*/ 1,
                          /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);

static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
                         "Enables deadlock detection", nullptr, nullptr, FALSE);

static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG,
                          "Number of transactions deadlock detection will "
                          "traverse through before assuming deadlock",
                          nullptr, nullptr,
                          /*default*/ RDB_DEADLOCK_DETECT_DEPTH,
                          /*min*/ 2,
                          /*max*/ ULONG_MAX, 0);

static MYSQL_THDVAR_BOOL(
    commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG,
    "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr,
    nullptr, TRUE);

static MYSQL_THDVAR_BOOL(
    trace_sst_api, PLUGIN_VAR_RQCMDARG,
    "Generate trace output in the log for each call to the SstFileWriter",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    bulk_load, PLUGIN_VAR_RQCMDARG,
    "Use bulk-load mode for inserts. This disables "
    "unique_checks and enables rocksdb_commit_in_the_middle.",
    rocksdb_check_bulk_load, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG,
                         "Allow bulk loading of sk keys during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         /* Intentionally reuse unsorted's check function */
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         FALSE);

static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG,
                         "Allow unsorted input during bulk-load. "
                         "Can be changed only when bulk load is disabled.",
                         rocksdb_check_bulk_load_allow_unsorted, nullptr,
                         FALSE);

static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables using SstFileWriter for bulk loading",
                         nullptr, nullptr, rocksdb_enable_bulk_load_api);

static MYSQL_SYSVAR_STR(git_hash, rocksdb_git_hash,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "Git revision of the RocksDB library used by MyRocks",
                        nullptr, nullptr, ROCKSDB_GIT_HASH);

static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
                        "Directory for temporary files during DDL operations.",
                        nullptr, nullptr, "");

// Default pattern ".*" matches every table name.
#define DEFAULT_SKIP_UNIQUE_CHECK_TABLES ".*"
static MYSQL_THDVAR_STR(
    skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Skip unique constraint checking for the specified tables", nullptr,
    nullptr, DEFAULT_SKIP_UNIQUE_CHECK_TABLES);

static MYSQL_THDVAR_BOOL(
    commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
    "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
    "update and delete",
    nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    blind_delete_primary_key, PLUGIN_VAR_RQCMDARG,
    "Deleting rows by primary key lookup, without reading rows (Blind Deletes)."
    " Blind delete is disabled if the table has secondary key",
    nullptr, nullptr, FALSE);
996
#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported

// Default pattern ".*" makes every table eligible for read-free replication.
static const char *DEFAULT_READ_FREE_RPL_TABLES = ".*";
rocksdb_validate_read_free_rpl_tables(THD * thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * var MY_ATTRIBUTE ((__unused__)),void * save,struct st_mysql_value * value)1001 static int rocksdb_validate_read_free_rpl_tables(
1002 THD *thd MY_ATTRIBUTE((__unused__)),
1003 struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
1004 struct st_mysql_value *value) {
1005 char buff[STRING_BUFFER_USUAL_SIZE];
1006 int length = sizeof(buff);
1007 const char *wlist_buf = value->val_str(value, buff, &length);
1008 const auto wlist = wlist_buf ? wlist_buf : DEFAULT_READ_FREE_RPL_TABLES;
1009
1010 #if defined(HAVE_PSI_INTERFACE)
1011 Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables);
1012 #else
1013 Regex_list_handler regex_handler;
1014 #endif
1015
1016 if (!regex_handler.set_patterns(wlist)) {
1017 warn_about_bad_patterns(®ex_handler, "rocksdb_read_free_rpl_tables");
1018 return HA_EXIT_FAILURE;
1019 }
1020
1021 *static_cast<const char **>(save) = my_strdup(wlist, MYF(MY_WME));
1022 return HA_EXIT_SUCCESS;
1023 }
1024
/*
  Update callback for the rocksdb_read_free_rpl_tables sysvar.

  Applies the already-validated pattern list to the global regex handler,
  re-evaluates the read-free flag on every known table definition, and
  stores the final string into the sysvar storage. Ownership note: *save
  normally carries a my_strdup'ed string from the validate callback, except
  for SET ... = DEFAULT, where a fresh copy must be allocated here.
*/
static void rocksdb_update_read_free_rpl_tables(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr,
    const void *save) {
  const auto wlist = *static_cast<const char *const *>(save);
  DBUG_ASSERT(wlist != nullptr);

  // This is bound to succeed since we've already checked for bad patterns in
  // rocksdb_validate_read_free_rpl_tables
  rdb_read_free_regex_handler.set_patterns(wlist);

  // update all table defs
  struct Rdb_read_free_rpl_updater : public Rdb_tables_scanner {
    int add_table(Rdb_tbl_def *tdef) override {
      tdef->check_and_set_read_free_rpl_table();
      return HA_EXIT_SUCCESS;
    }
  } updater;
  ddl_manager.scan_for_tables(&updater);

  if (wlist == DEFAULT_READ_FREE_RPL_TABLES) {
    // If running SET var = DEFAULT, then rocksdb_validate_read_free_rpl_tables
    // isn't called, and memory is never allocated for the value. Allocate it
    // here.
    *static_cast<const char **>(var_ptr) = my_strdup(wlist, MYF(MY_WME));
  } else {
    // Otherwise, we just reuse the value allocated from
    // rocksdb_validate_read_free_rpl_tables.
    *static_cast<const char **>(var_ptr) = wlist;
  }
}
1056
/* Sysvars wired to the two read-free replication callbacks above. */
static MYSQL_SYSVAR_STR(
    read_free_rpl_tables, rocksdb_read_free_rpl_tables,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC /*| PLUGIN_VAR_ALLOCATED*/,
    "List of tables that will use read-free replication on the slave "
    "(i.e. not lookup a row during replication)",
    rocksdb_validate_read_free_rpl_tables, rocksdb_update_read_free_rpl_tables,
    DEFAULT_READ_FREE_RPL_TABLES);

static MYSQL_SYSVAR_ENUM(
    read_free_rpl, rocksdb_read_free_rpl,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
    "Use read-free replication on the slave (i.e. no row lookup during "
    "replication). Default is OFF, PK_SK will enable it on all tables with "
    "primary key. PK_ONLY will enable it on tables where the only key is the "
    "primary key (i.e. no secondary keys).",
    nullptr, nullptr, read_free_rpl_type::OFF, &read_free_rpl_typelib);
#endif
1074
/* ---- Per-session knobs for locking, bulk load and online DDL ---- */

static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
                         "Skip using bloom filter for reads", nullptr, nullptr,
                         FALSE);

static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have",
                          nullptr, nullptr,
                          /*default*/ RDB_DEFAULT_MAX_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_ULONGLONG(
    write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
    "Maximum size of write batch in bytes. 0 means no limit.", nullptr, nullptr,
    /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated", nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode", nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation.",
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of "
    "external sort during fast index creation.",
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
    "Fast index creation creates a large tmp file on disk during index "
    "creation. Removing this large file all at once when index creation is "
    "complete can cause trim stalls on Flash. This variable specifies a "
    "duration to sleep (in milliseconds) between calling chsize() to truncate "
    "the file in chunks. The chunk size is the same as merge_buf_size.",
    nullptr, nullptr,
    /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_INT(
    manual_compaction_threads, PLUGIN_VAR_RQCMDARG,
    "How many rocksdb threads to run for manual compactions", nullptr, nullptr,
    /* default rocksdb.dboption max_subcompactions */ 0,
    /* min */ 0, /* max */ 128, 0);
1139
/* ---- Global sysvars mapped directly onto RocksDB DBOptions fields.
   Note: the bool options alias the DBOptions member via a
   reinterpret_cast<my_bool *> so the sysvar machinery writes straight
   into rocksdb_db_options — this presumes my_bool and bool share
   size/representation on supported platforms. ---- */

static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_if_missing);

static MYSQL_SYSVAR_BOOL(
    two_write_queues,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::two_write_queues for RocksDB", nullptr, nullptr,
    rocksdb_db_options->two_write_queues);

static MYSQL_SYSVAR_BOOL(
    manual_wal_flush,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr,
    rocksdb_db_options->manual_wal_flush);

static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::write_policy for RocksDB", nullptr,
                         nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED,
                         &write_policy_typelib);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
    rocksdb_db_options->create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
    rocksdb_db_options->error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
    rocksdb_db_options->paranoid_checks);

static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ULONGLONG(
    sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB", nullptr,
    rocksdb_set_sst_mgr_rate_bytes_per_sec,
    /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
    /* min */ 0L, /* max */ UINT64_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
                              PLUGIN_VAR_RQCMDARG,
                              "DBOptions::delayed_write_rate", nullptr,
                              rocksdb_set_delayed_write_rate,
                              rocksdb_db_options->delayed_write_rate, 0,
                              UINT64_MAX, 0);

static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
                         PLUGIN_VAR_RQCMDARG,
                         "Maximum number of recent "
                         "deadlocks to store",
                         nullptr, rocksdb_set_max_latest_deadlocks,
                         rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);

static MYSQL_SYSVAR_ENUM(
    info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
    "Filter level for info logs to be written mysqld error log. "
    "Valid values include 'debug_level', 'info_level', 'warn_level'"
    "'error_level' and 'fatal_level'.",
    nullptr, rocksdb_set_rocksdb_info_log_level,
    rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);

static MYSQL_THDVAR_INT(
    perf_context_level, PLUGIN_VAR_RQCMDARG,
    "Perf Context Level for rocksdb internal timer stat collection", nullptr,
    nullptr,
    /* default */ rocksdb::PerfLevel::kUninitialized,
    /* min */ rocksdb::PerfLevel::kUninitialized,
    /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

static MYSQL_SYSVAR_UINT(
    wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
    "DBOptions::wal_recovery_mode for RocksDB. Default is kAbsoluteConsistency",
    nullptr, nullptr,
    /* default */ (uint)rocksdb::WALRecoveryMode::kAbsoluteConsistency,
    /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
    /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);

static MYSQL_SYSVAR_UINT(
    stats_level, rocksdb_stats_level, PLUGIN_VAR_RQCMDARG,
    "Statistics Level for RocksDB. Default is 0 (kExceptHistogramOrTimers)",
    nullptr, rocksdb_set_rocksdb_stats_level,
    /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
    /* min */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
    /* max */ (uint)rocksdb::StatsLevel::kAll, 0);
1248
/* ---- More DBOptions passthrough sysvars (compaction, files, WAL) ---- */

static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size,
                           rocksdb_db_options->compaction_readahead_size,
                           PLUGIN_VAR_RQCMDARG,
                           "DBOptions::compaction_readahead_size for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->compaction_readahead_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    new_table_reader_for_compaction_inputs,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->new_table_reader_for_compaction_inputs),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
    nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);

static MYSQL_SYSVAR_UINT(
    access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
    /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
    /* min */ (uint)rocksdb::Options::AccessHint::NONE,
    /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);

static MYSQL_SYSVAR_BOOL(
    allow_concurrent_memtable_write,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->allow_concurrent_memtable_write),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(
    enable_write_thread_adaptive_yield,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->enable_write_thread_adaptive_yield),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
    nullptr, false);

static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_open_files for RocksDB", nullptr,
                        nullptr, rocksdb_db_options->max_open_files,
                        /* min */ -2, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_total_wal_size,
                             rocksdb_db_options->max_total_wal_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_total_wal_size for RocksDB", nullptr,
                             nullptr, rocksdb_db_options->max_total_wal_size,
                             /* min */ 0, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_fsync);

static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
                        rocksdb_db_options->wal_dir.c_str());

static MYSQL_SYSVAR_STR(
    persistent_cache_path, rocksdb_persistent_cache_path,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
    nullptr, "");

static MYSQL_SYSVAR_ULONG(
    persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
    "for RocksDB",
    nullptr, nullptr, rocksdb_persistent_cache_size_mb,
    /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(
    delete_obsolete_files_period_micros,
    rocksdb_db_options->delete_obsolete_files_period_micros,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
    nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
    /* min */ 0, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_INT(max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        PLUGIN_VAR_RQCMDARG,
                        "DBOptions::max_background_jobs for RocksDB", nullptr,
                        rocksdb_set_max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);
1342
/* ---- DBOptions passthrough sysvars (log files, WAL retention, I/O) ---- */

static MYSQL_SYSVAR_UINT(max_subcompactions,
                         rocksdb_db_options->max_subcompactions,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::max_subcompactions for RocksDB", nullptr,
                         nullptr, rocksdb_db_options->max_subcompactions,
                         /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

static MYSQL_SYSVAR_SIZE_T(max_log_file_size,
                           rocksdb_db_options->max_log_file_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::max_log_file_size for RocksDB", nullptr,
                           nullptr, rocksdb_db_options->max_log_file_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(log_file_time_to_roll,
                           rocksdb_db_options->log_file_time_to_roll,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::log_file_time_to_roll for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->log_file_time_to_roll,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(keep_log_file_num,
                           rocksdb_db_options->keep_log_file_num,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::keep_log_file_num for RocksDB", nullptr,
                           nullptr, rocksdb_db_options->keep_log_file_num,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_manifest_file_size,
                             rocksdb_db_options->max_manifest_file_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_manifest_file_size for RocksDB",
                             nullptr, nullptr,
                             rocksdb_db_options->max_manifest_file_size,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options->table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options->table_cache_numshardbits,
                        // LRUCache limits this to 19 bits, anything greater
                        // fails to create a cache and returns a nullptr
                        /* min */ 0, /* max */ 19, 0);

static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
                             nullptr, rocksdb_db_options->WAL_ttl_seconds,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_size_limit_mb,
                             rocksdb_db_options->WAL_size_limit_MB,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
                             nullptr, rocksdb_db_options->WAL_size_limit_MB,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(manifest_preallocation_size,
                           rocksdb_db_options->manifest_preallocation_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::manifest_preallocation_size for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->manifest_preallocation_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_io_for_flush_and_compaction,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_io_for_flush_and_compaction),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_io_for_flush_and_compaction);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_reads);

// Note: allow_mmap_writes interacts with rocksdb_flush_log_at_trx_commit;
// see rocksdb_validate_flush_log_at_trx_commit above.
static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
    rocksdb_db_options->is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options->stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB",
                         nullptr, nullptr,
                         rocksdb_db_options->stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);
1453
/* ---- Sysvars for write buffering, sync cadence and the block cache ---- */

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
    rocksdb_db_options->advise_random_on_open);

static MYSQL_SYSVAR_SIZE_T(db_write_buffer_size,
                           rocksdb_db_options->db_write_buffer_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::db_write_buffer_size for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->db_write_buffer_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_adaptive_mutex);

static MYSQL_SYSVAR_UINT64_T(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::bytes_per_sync for RocksDB", nullptr,
                             rocksdb_set_bytes_per_sync,
                             rocksdb_db_options->bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
                             rocksdb_set_wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true);

static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG,
                             "block_cache size for RocksDB",
                             rocksdb_validate_set_block_cache_size, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);

static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "Simulated cache size for RocksDB", nullptr,
                             nullptr,
                             /* default */ 0,
                             /* min */ 0,
                             /* max */ LLONG_MAX,
                             /* Block size */ 0);

static MYSQL_SYSVAR_BOOL(
    use_clock_cache, rocksdb_use_clock_cache,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Use ClockCache instead of default LRUCache for RocksDB", nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(cache_dump, rocksdb_cache_dump,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Include RocksDB block cache content in core dump.",
                         nullptr, nullptr, true);

static MYSQL_SYSVAR_DOUBLE(cache_high_pri_pool_ratio,
                           rocksdb_cache_high_pri_pool_ratio,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "Specify the size of block cache high-pri pool",
                           nullptr, nullptr, /* default */ 0.0, /* min */ 0.0,
                           /* max */ 1.0, 0);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
    nullptr, nullptr, true);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_with_high_priority,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks_with_high_priority),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "cache_index_and_filter_blocks_with_high_priority for RocksDB", nullptr,
    nullptr, true);

// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index block's handle checked
// out (=won't call ShardedLRUCache::Release), plus the parsed-out objects
// the LRU cache will never flush out, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);
1564
/* ---- Sysvars mapped onto BlockBasedTableOptions fields ---- */

static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB",
                         nullptr, nullptr,
                         (ulong)rocksdb_tbl_options->index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->no_block_cache);

static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "BlockBasedTableOptions::block_size for RocksDB",
                           nullptr, nullptr, rocksdb_tbl_options->block_size,
                           /* min */ 1L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options->block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options->block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->whole_key_filtering);

static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");
1622
1623 static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
1624 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC
1625 /* psergey-merge: need this? : PLUGIN_VAR_ALLOCATED*/,
1626 "Option updates per column family for RocksDB",
1627 rocksdb_validate_update_cf_options,
1628 rocksdb_set_update_cf_options, nullptr);
1629
1630 static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
1631 rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
1632 "Sync on transaction commit. Similar to "
1633 "innodb_flush_log_at_trx_commit. 1: sync on commit, "
1634 "0,2: not sync on commit",
1635 rocksdb_validate_flush_log_at_trx_commit, nullptr,
1636 /* default */ FLUSH_LOG_SYNC,
1637 /* min */ FLUSH_LOG_NEVER,
1638 /* max */ FLUSH_LOG_BACKGROUND, 0);
1639
1640 static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
1641 "WriteOptions::disableWAL for RocksDB", nullptr,
1642 nullptr, rocksdb::WriteOptions().disableWAL);
1643
1644 static MYSQL_THDVAR_BOOL(
1645 write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
1646 "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
1647 nullptr, rocksdb::WriteOptions().ignore_missing_column_families);
1648
1649 static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
1650 "Skip filling block cache on read requests", nullptr,
1651 nullptr, FALSE);
1652
1653 static MYSQL_THDVAR_BOOL(
1654 unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
1655 "Allowing statement based binary logging which may break consistency",
1656 nullptr, nullptr, FALSE);
1657
1658 static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
1659 "Used to override the result of records_in_range(). "
1660 "Set to a positive number to override",
1661 nullptr, nullptr, 0,
1662 /* min */ 0, /* max */ INT_MAX, 0);
1663
1664 static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
1665 "Used to override the result of records_in_range() "
1666 "when FORCE INDEX is used.",
1667 nullptr, nullptr, 0,
1668 /* min */ 0, /* max */ INT_MAX, 0);
1669
1670 static MYSQL_SYSVAR_UINT(
1671 debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
1672 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
1673 "Test only to override rocksdb estimates of table size in a memtable",
1674 nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);
1675
1676 static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
1677 rocksdb_force_compute_memtable_stats,
1678 PLUGIN_VAR_RQCMDARG,
1679 "Force to always compute memtable stats", nullptr,
1680 nullptr, TRUE);
1681
1682 static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime,
1683 rocksdb_force_compute_memtable_stats_cachetime,
1684 PLUGIN_VAR_RQCMDARG,
1685 "Time in usecs to cache memtable estimates", nullptr,
1686 nullptr, /* default */ 60 * 1000 * 1000,
1687 /* min */ 0, /* max */ INT_MAX, 0);
1688
1689 static MYSQL_SYSVAR_BOOL(
1690 debug_optimizer_no_zero_cardinality,
1691 rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
1692 "In case if cardinality is zero, overrides it with some value", nullptr,
1693 nullptr, TRUE);
1694
1695 static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
1696 PLUGIN_VAR_RQCMDARG, "Compact column family",
1697 rocksdb_compact_column_family,
1698 rocksdb_compact_column_family_stub, "");
1699
1700 static MYSQL_SYSVAR_STR(delete_cf, rocksdb_delete_cf_name, PLUGIN_VAR_RQCMDARG,
1701 "Delete column family", rocksdb_delete_column_family,
1702 rocksdb_delete_column_family_stub, "");
1703
1704 static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
1705 PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
1706 rocksdb_create_checkpoint,
1707 rocksdb_create_checkpoint_stub, "");
1708
1709 static MYSQL_SYSVAR_BOOL(remove_mariabackup_checkpoint,
1710 rocksdb_signal_remove_mariabackup_checkpoint,
1711 PLUGIN_VAR_RQCMDARG, "Remove mariabackup checkpoint",
1712 nullptr, rocksdb_remove_mariabackup_checkpoint, FALSE);
1713
1714 static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
1715 rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
1716 "Wake up drop index thread", nullptr,
1717 rocksdb_drop_index_wakeup_thread, FALSE);
1718
1719 static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
1720 PLUGIN_VAR_RQCMDARG,
1721 "Disable all rocksdb background operations", nullptr,
1722 rocksdb_set_pause_background_work, FALSE);
1723
1724 static MYSQL_SYSVAR_BOOL(
1725 enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
1726 "Enable expired TTL records to be dropped during compaction.", nullptr,
1727 nullptr, TRUE);
1728
1729 static MYSQL_SYSVAR_BOOL(
1730 enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
1731 PLUGIN_VAR_RQCMDARG,
1732 "For tables with TTL, expired records are skipped/filtered out during "
1733 "processing and in query results. Disabling this will allow these records "
1734 "to be seen, but as a result rows may disappear in the middle of "
1735 "transactions as they are dropped during compaction. Use with caution.",
1736 nullptr, nullptr, TRUE);
1737
1738 static MYSQL_SYSVAR_INT(
1739 debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
1740 "For debugging purposes only. Overrides the TTL of records to "
1741 "now() + debug_ttl_rec_ts. The value can be +/- to simulate "
1742 "a record inserted in the past vs a record inserted in the 'future'. "
1743 "A value of 0 denotes that the variable is not set. This variable is a "
1744 "no-op in non-debug builds.",
1745 nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);
1746
1747 static MYSQL_SYSVAR_INT(
1748 debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
1749 "For debugging purposes only. Sets the snapshot during compaction to "
1750 "now() + debug_set_ttl_snapshot_ts. The value can be +/- to simulate "
1751 "a snapshot in the past vs a snapshot created in the 'future'. "
1752 "A value of 0 denotes that the variable is not set. This variable is a "
1753 "no-op in non-debug builds.",
1754 nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);
1755
1756 static MYSQL_SYSVAR_INT(
1757 debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
1758 PLUGIN_VAR_RQCMDARG,
1759 "For debugging purposes only. Overrides the TTL read filtering time to "
1760 "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
1761 "is not set. This variable is a no-op in non-debug builds.",
1762 nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);
1763
1764 static MYSQL_SYSVAR_BOOL(
1765 debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
1766 "For debugging purposes only. If true, compaction filtering will not occur "
1767 "on PK TTL data. This variable is a no-op in non-debug builds.",
1768 nullptr, nullptr, FALSE);
1769
1770 static MYSQL_SYSVAR_UINT(
1771 max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG,
1772 "Maximum number of pending + ongoing number of manual compactions.",
1773 nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0);
1774
1775 static MYSQL_SYSVAR_BOOL(
1776 rollback_on_timeout, rocksdb_rollback_on_timeout, PLUGIN_VAR_OPCMDARG,
1777 "Whether to roll back the complete transaction or a single statement on "
1778 "lock wait timeout (a single statement by default)",
1779 NULL, NULL, FALSE);
1780
1781 static MYSQL_SYSVAR_UINT(
1782 debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay,
1783 PLUGIN_VAR_RQCMDARG,
1784 "For debugging purposes only. Sleeping specified seconds "
1785 "for simulating long running compactions.",
1786 nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0);
1787
1788 static MYSQL_SYSVAR_BOOL(
1789 reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
1790 "Reset the RocksDB internal statistics without restarting the DB.", nullptr,
1791 rocksdb_set_reset_stats, FALSE);
1792
1793 static MYSQL_SYSVAR_UINT(io_write_timeout, rocksdb_io_write_timeout_secs,
1794 PLUGIN_VAR_RQCMDARG,
1795 "Timeout for experimental I/O watchdog.", nullptr,
1796 rocksdb_set_io_write_timeout, /* default */ 0,
1797 /* min */ 0L,
1798 /* max */ UINT_MAX, 0);
1799
1800 static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
1801 "Enable two phase commit for MyRocks", nullptr,
1802 nullptr, TRUE);
1803
1804 static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
1805 PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
1806 "Enable ignoring unknown options passed to RocksDB",
1807 nullptr, nullptr, TRUE);
1808
1809 static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
1810 PLUGIN_VAR_RQCMDARG,
1811 "Enforce case sensitive collation for MyRocks indexes",
1812 nullptr, nullptr, TRUE);
1813
1814 static MYSQL_SYSVAR_STR(strict_collation_exceptions,
1815 rocksdb_strict_collation_exceptions,
1816 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
1817 "List of tables (using regex) that are excluded "
1818 "from the case sensitive collation enforcement",
1819 nullptr, rocksdb_set_collation_exception_list, "");
1820
1821 static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
1822 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1823 "Enables collecting SST file properties on each flush",
1824 nullptr, nullptr, rocksdb_collect_sst_properties);
1825
1826 static MYSQL_SYSVAR_BOOL(
1827 force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
1828 PLUGIN_VAR_RQCMDARG,
1829 "Forces memstore flush which may block all write requests so be careful",
1830 rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
1831 FALSE);
1832
1833 static MYSQL_SYSVAR_BOOL(
1834 force_flush_memtable_and_lzero_now,
1835 rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
1836 "Acts similar to force_flush_memtable_now, but also compacts all L0 files.",
1837 rocksdb_force_flush_memtable_and_lzero_now,
1838 rocksdb_force_flush_memtable_and_lzero_now_stub, FALSE);
1839
1840 static MYSQL_SYSVAR_UINT(
1841 seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
1842 PLUGIN_VAR_RQCMDARG,
1843 "Sets a number of seconds to wait between optimizer stats recomputation. "
1844 "Only changed indexes will be refreshed.",
1845 nullptr, nullptr, rocksdb_seconds_between_stat_computes,
1846 /* min */ 0L, /* max */ UINT_MAX, 0);
1847
1848 static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
1849 rocksdb_compaction_sequential_deletes,
1850 PLUGIN_VAR_RQCMDARG,
1851 "RocksDB will trigger compaction for the file if "
1852 "it has more than this number sequential deletes "
1853 "per window",
1854 nullptr, rocksdb_set_compaction_options,
1855 DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
1856 /* min */ 0L,
1857 /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);
1858
1859 static MYSQL_SYSVAR_LONGLONG(
1860 compaction_sequential_deletes_window,
1861 rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
1862 "Size of the window for counting rocksdb_compaction_sequential_deletes",
1863 nullptr, rocksdb_set_compaction_options,
1864 DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
1865 /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);
1866
1867 static MYSQL_SYSVAR_LONGLONG(
1868 compaction_sequential_deletes_file_size,
1869 rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
1870 "Minimum file size required for compaction_sequential_deletes", nullptr,
1871 rocksdb_set_compaction_options, 0L,
1872 /* min */ -1L, /* max */ LLONG_MAX, 0);
1873
1874 static MYSQL_SYSVAR_BOOL(
1875 compaction_sequential_deletes_count_sd,
1876 rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
1877 "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
1878 nullptr, rocksdb_compaction_sequential_deletes_count_sd);
1879
1880 static MYSQL_SYSVAR_BOOL(
1881 print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
1882 PLUGIN_VAR_RQCMDARG,
1883 "Logging queries that got snapshot conflict errors into *.err log", nullptr,
1884 nullptr, rocksdb_print_snapshot_conflict_queries);
1885
1886 static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
1887 "How many percentages of rows to be checksummed",
1888 nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
1889 /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);
1890
1891 static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
1892 "Include checksums when writing index/table records",
1893 nullptr, nullptr, false /* default value */);
1894
1895 static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
1896 "Verify checksums when reading index/table records",
1897 nullptr, nullptr, false /* default value */);
1898
1899 static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG,
1900 "Skipping holding any lock on row access. "
1901 "Not effective on slave.",
1902 nullptr, nullptr, false);
1903
1904 static MYSQL_SYSVAR_UINT(
1905 validate_tables, rocksdb_validate_tables,
1906 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1907 "Verify all .frm files match all RocksDB tables (0 means no verification, "
1908 "1 means verify and fail on error, and 2 means verify but continue",
1909 nullptr, nullptr, 1 /* default value */, 0 /* min value */,
1910 2 /* max value */, 0);
1911
static MYSQL_SYSVAR_UINT(
    ignore_datadic_errors, rocksdb_ignore_datadic_errors,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Ignore MyRocks' data directory errors. "
    "(CAUTION: Use only to start the server and perform repairs. Do NOT use "
    "for regular operation)",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    1 /* max value */, 0);

static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./#rocksdb");

// Read-only informational variable; the value string is precomputed into
// compression_types_val elsewhere in this file.
static MYSQL_SYSVAR_STR(supported_compression_types,
                        compression_types_val,
                        PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
                        "Compression algorithms supported by RocksDB",
                        nullptr, nullptr,
                        compression_types_val);

// NB: the help string below is assembled from adjacent literals (including
// the oddly-split fragments) - they concatenate to
// "... By default <pct>% of entries are sampled."
static MYSQL_SYSVAR_UINT(
    table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
    PLUGIN_VAR_RQCMDARG,
    "Percentage of entries to sample when collecting statistics about table "
    "properties. Specify either 0 to sample everything or percentage "
    "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG(
        RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
    "By default " STRINGIFY_ARG(RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% "
    "of" " e" "nt" "ri" "es" " a" "re" " " "sa" "mp" "le" "d" ".",
    nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
    RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
    /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG,
    "The number of indexes per second to recalculate statistics for. 0 to "
    "disable background recalculation.",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    UINT_MAX /* max value */, 0);

static MYSQL_SYSVAR_BOOL(
    large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
    "Support large index prefix length of 3072 bytes. If off, the maximum "
    "index prefix length is 767.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(
    allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption,
    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
    "Allow server still to start successfully even if RocksDB corruption is "
    "detected.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation,
                         rocksdb_error_on_suboptimal_collation,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Raise an error instead of warning if a sub-optimal "
                         "collation is used",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(
    enable_insert_with_update_caching,
    rocksdb_enable_insert_with_update_caching, PLUGIN_VAR_OPCMDARG,
    "Whether to enable optimization where we cache the read from a failed "
    "insertion attempt in INSERT ON DUPLICATE KEY UPDATE",
    nullptr, nullptr, TRUE);

// Rough per key-value disk-footprint constant used in size estimates
// elsewhere in this file (usage not visible here - TODO confirm at call
// sites before relying on this description).
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
1994
/*
  The full list of system/session variables the plugin registers with the
  server. Every MYSQL_SYSVAR_*/MYSQL_THDVAR_* definition above (and a few
  defined elsewhere in this file, e.g. git_hash, manual_compaction_threads)
  must appear here; the array must stay nullptr-terminated.
*/
static struct st_mysql_sys_var *rocksdb_system_variables[] = {
    MYSQL_SYSVAR(lock_wait_timeout),
    MYSQL_SYSVAR(deadlock_detect),
    MYSQL_SYSVAR(deadlock_detect_depth),
    MYSQL_SYSVAR(commit_time_batch_for_recovery),
    MYSQL_SYSVAR(max_row_locks),
    MYSQL_SYSVAR(write_batch_max_bytes),
    MYSQL_SYSVAR(lock_scanned_rows),
    MYSQL_SYSVAR(bulk_load),
    MYSQL_SYSVAR(bulk_load_allow_sk),
    MYSQL_SYSVAR(bulk_load_allow_unsorted),
    MYSQL_SYSVAR(skip_unique_check_tables),
    MYSQL_SYSVAR(trace_sst_api),
    MYSQL_SYSVAR(commit_in_the_middle),
    MYSQL_SYSVAR(blind_delete_primary_key),
#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
    MYSQL_SYSVAR(read_free_rpl_tables),
    MYSQL_SYSVAR(read_free_rpl),
#endif
    MYSQL_SYSVAR(bulk_load_size),
    MYSQL_SYSVAR(merge_buf_size),
    MYSQL_SYSVAR(enable_bulk_load_api),
    MYSQL_SYSVAR(tmpdir),
    MYSQL_SYSVAR(merge_combine_read_size),
    MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
    MYSQL_SYSVAR(skip_bloom_filter_on_read),

    MYSQL_SYSVAR(create_if_missing),
    MYSQL_SYSVAR(two_write_queues),
    MYSQL_SYSVAR(manual_wal_flush),
    MYSQL_SYSVAR(write_policy),
    MYSQL_SYSVAR(create_missing_column_families),
    MYSQL_SYSVAR(error_if_exists),
    MYSQL_SYSVAR(paranoid_checks),
    MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
    MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
    MYSQL_SYSVAR(delayed_write_rate),
    MYSQL_SYSVAR(max_latest_deadlocks),
    MYSQL_SYSVAR(info_log_level),
    MYSQL_SYSVAR(max_open_files),
    MYSQL_SYSVAR(max_total_wal_size),
    MYSQL_SYSVAR(use_fsync),
    MYSQL_SYSVAR(wal_dir),
    MYSQL_SYSVAR(persistent_cache_path),
    MYSQL_SYSVAR(persistent_cache_size_mb),
    MYSQL_SYSVAR(delete_obsolete_files_period_micros),
    MYSQL_SYSVAR(max_background_jobs),
    MYSQL_SYSVAR(max_log_file_size),
    MYSQL_SYSVAR(max_subcompactions),
    MYSQL_SYSVAR(log_file_time_to_roll),
    MYSQL_SYSVAR(keep_log_file_num),
    MYSQL_SYSVAR(max_manifest_file_size),
    MYSQL_SYSVAR(table_cache_numshardbits),
    MYSQL_SYSVAR(wal_ttl_seconds),
    MYSQL_SYSVAR(wal_size_limit_mb),
    MYSQL_SYSVAR(manifest_preallocation_size),
    MYSQL_SYSVAR(use_direct_reads),
    MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
    MYSQL_SYSVAR(allow_mmap_reads),
    MYSQL_SYSVAR(allow_mmap_writes),
    MYSQL_SYSVAR(is_fd_close_on_exec),
    MYSQL_SYSVAR(stats_dump_period_sec),
    MYSQL_SYSVAR(advise_random_on_open),
    MYSQL_SYSVAR(db_write_buffer_size),
    MYSQL_SYSVAR(use_adaptive_mutex),
    MYSQL_SYSVAR(bytes_per_sync),
    MYSQL_SYSVAR(wal_bytes_per_sync),
    MYSQL_SYSVAR(enable_thread_tracking),
    MYSQL_SYSVAR(perf_context_level),
    MYSQL_SYSVAR(wal_recovery_mode),
    MYSQL_SYSVAR(stats_level),
    MYSQL_SYSVAR(access_hint_on_compaction_start),
    MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
    MYSQL_SYSVAR(compaction_readahead_size),
    MYSQL_SYSVAR(allow_concurrent_memtable_write),
    MYSQL_SYSVAR(enable_write_thread_adaptive_yield),

    MYSQL_SYSVAR(block_cache_size),
    MYSQL_SYSVAR(sim_cache_size),
    MYSQL_SYSVAR(use_clock_cache),
    MYSQL_SYSVAR(cache_high_pri_pool_ratio),
    MYSQL_SYSVAR(cache_dump),
    MYSQL_SYSVAR(cache_index_and_filter_blocks),
    MYSQL_SYSVAR(cache_index_and_filter_with_high_priority),
    MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
    MYSQL_SYSVAR(index_type),
    MYSQL_SYSVAR(hash_index_allow_collision),
    MYSQL_SYSVAR(no_block_cache),
    MYSQL_SYSVAR(block_size),
    MYSQL_SYSVAR(block_size_deviation),
    MYSQL_SYSVAR(block_restart_interval),
    MYSQL_SYSVAR(whole_key_filtering),

    MYSQL_SYSVAR(default_cf_options),
    MYSQL_SYSVAR(override_cf_options),
    MYSQL_SYSVAR(update_cf_options),

    MYSQL_SYSVAR(flush_log_at_trx_commit),
    MYSQL_SYSVAR(write_disable_wal),
    MYSQL_SYSVAR(write_ignore_missing_column_families),

    MYSQL_SYSVAR(skip_fill_cache),
    MYSQL_SYSVAR(unsafe_for_binlog),

    MYSQL_SYSVAR(records_in_range),
    MYSQL_SYSVAR(force_index_records_in_range),
    MYSQL_SYSVAR(debug_optimizer_n_rows),
    MYSQL_SYSVAR(force_compute_memtable_stats),
    MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
    MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),

    MYSQL_SYSVAR(compact_cf),
    MYSQL_SYSVAR(delete_cf),
    MYSQL_SYSVAR(signal_drop_index_thread),
    MYSQL_SYSVAR(pause_background_work),
    MYSQL_SYSVAR(enable_2pc),
    MYSQL_SYSVAR(ignore_unknown_options),
    MYSQL_SYSVAR(strict_collation_check),
    MYSQL_SYSVAR(strict_collation_exceptions),
    MYSQL_SYSVAR(collect_sst_properties),
    MYSQL_SYSVAR(force_flush_memtable_now),
    MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
    MYSQL_SYSVAR(enable_ttl),
    MYSQL_SYSVAR(enable_ttl_read_filtering),
    MYSQL_SYSVAR(debug_ttl_rec_ts),
    MYSQL_SYSVAR(debug_ttl_snapshot_ts),
    MYSQL_SYSVAR(debug_ttl_read_filter_ts),
    MYSQL_SYSVAR(debug_ttl_ignore_pk),
    MYSQL_SYSVAR(reset_stats),
    MYSQL_SYSVAR(io_write_timeout),
    MYSQL_SYSVAR(seconds_between_stat_computes),

    MYSQL_SYSVAR(compaction_sequential_deletes),
    MYSQL_SYSVAR(compaction_sequential_deletes_window),
    MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
    MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
    MYSQL_SYSVAR(print_snapshot_conflict_queries),

    MYSQL_SYSVAR(datadir),
    MYSQL_SYSVAR(supported_compression_types),
    MYSQL_SYSVAR(create_checkpoint),
    MYSQL_SYSVAR(remove_mariabackup_checkpoint),
    MYSQL_SYSVAR(checksums_pct),
    MYSQL_SYSVAR(store_row_debug_checksums),
    MYSQL_SYSVAR(verify_row_debug_checksums),
    MYSQL_SYSVAR(master_skip_tx_api),

    MYSQL_SYSVAR(validate_tables),
    MYSQL_SYSVAR(table_stats_sampling_pct),

    MYSQL_SYSVAR(large_prefix),
    MYSQL_SYSVAR(allow_to_start_after_corruption),
    MYSQL_SYSVAR(git_hash),
    MYSQL_SYSVAR(error_on_suboptimal_collation),
    MYSQL_SYSVAR(stats_recalc_rate),
    MYSQL_SYSVAR(debug_manual_compaction_delay),
    MYSQL_SYSVAR(max_manual_compactions),
    MYSQL_SYSVAR(manual_compaction_threads),
    MYSQL_SYSVAR(rollback_on_timeout),

    MYSQL_SYSVAR(enable_insert_with_update_caching),

    MYSQL_SYSVAR(ignore_datadic_errors),
    nullptr};
2159
rdb_get_rocksdb_write_options(my_core::THD * const thd)2160 static rocksdb::WriteOptions rdb_get_rocksdb_write_options(
2161 my_core::THD *const thd) {
2162 rocksdb::WriteOptions opt;
2163
2164 opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
2165 opt.disableWAL = THDVAR(thd, write_disable_wal);
2166 opt.ignore_missing_column_families =
2167 THDVAR(thd, write_ignore_missing_column_families);
2168
2169 return opt;
2170 }
2171
/*
  Update callback for "SET GLOBAL rocksdb_compact_cf = <cf_name>".

  Schedules a manual compaction of the named column family and then polls
  (in 100ms steps) until the compaction finishes or the issuing connection
  is killed. Returns HA_EXIT_SUCCESS, or HA_EXIT_FAILURE when the compaction
  could not be scheduled. An unknown CF name is silently ignored.
*/
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto cfh = cf_manager.get_cf(cf);
    if (cfh != nullptr && rdb != nullptr) {
      // nullptr begin/end keys = compact the whole CF.
      int mc_id = rdb_mc_thread.request_manual_compaction(
          cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads));
      if (mc_id == -1) {
        // -1 specifically means the manual-compaction queue is full.
        my_error(ER_INTERNAL_ERROR, MYF(0),
                 "Can't schedule more manual compactions. "
                 "Increase rocksdb_max_manual_compactions or stop issuing "
                 "more manual compactions.");
        return HA_EXIT_FAILURE;
      } else if (mc_id < 0) {
        // Any other negative id: scheduling failed for a different reason.
        return HA_EXIT_FAILURE;
      }
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Manual compaction of column family: %s\n",
                            cf);
      // Check the thd state every short cycle (100ms) so this function can
      // bail out on kill without waiting for CompactRange to finish.
      do {
        my_sleep(100000);
      } while (!thd->killed &&
               !rdb_mc_thread.is_manual_compaction_finished(mc_id));

      if (thd->killed) {
        // This cancels if requested compaction state is INITED.
        // TODO(yoshinorim): Cancel running compaction as well once
        // it is supported in RocksDB.
        rdb_mc_thread.clear_manual_compaction_request(mc_id, true);
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
2215
///////////////////////////////////////////////////////////////////////////////////////////

/*
  Drop index thread's control
*/

// Background thread object; it is signalled from
// rocksdb_drop_index_wakeup_thread() below.
static Rdb_drop_index_thread rdb_drop_idx_thread;
2223
rocksdb_drop_index_wakeup_thread(my_core::THD * const thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * const var MY_ATTRIBUTE ((__unused__)),void * const var_ptr MY_ATTRIBUTE ((__unused__)),const void * const save)2224 static void rocksdb_drop_index_wakeup_thread(
2225 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
2226 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
2227 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
2228 if (*static_cast<const bool *>(save)) {
2229 rdb_drop_idx_thread.signal();
2230 }
2231 }
2232
rocksdb_perf_context_level(THD * const thd)2233 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
2234 DBUG_ASSERT(thd != nullptr);
2235
2236 const int session_perf_context_level = THDVAR(thd, perf_context_level);
2237 if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2238 return session_perf_context_level;
2239 }
2240
2241 /*
2242 Fallback to global thdvar, if session specific one was not set to a valid
2243 value.
2244 */
2245
2246 const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
2247 if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2248 return global_perf_context_level;
2249 }
2250
2251 return rocksdb::PerfLevel::kDisable;
2252 }
2253
/*
  Very short (functor-like) interface to be passed to
  Rdb_transaction::walk_tx_list()
*/

interface Rdb_tx_list_walker {
  virtual ~Rdb_tx_list_walker() {}
  // Invoked once per transaction while walk_tx_list() iterates the list.
  virtual void process_tran(const Rdb_transaction *const) = 0;
};
2263
/*
  This is a helper class that is passed to RocksDB to get notifications when
  a snapshot gets created.
*/

class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
  // Transaction that owns this notifier; nullptr once detach() was called.
  Rdb_transaction *m_owning_tx;

  // rocksdb::TransactionNotifier callback, fired when a snapshot is created.
  void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;

 public:
  Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
  Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;

  explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
      : m_owning_tx(owning_tx) {}

  // If the owning Rdb_transaction gets destructed we need to not reference
  // it anymore.
  void detach() { m_owning_tx = nullptr; }
};
2285
2286
2287 #ifdef MARIAROCKS_NOT_YET
2288 // ER_LOCK_WAIT_TIMEOUT error also has a reason in facebook/mysql-5.6
2289 #endif
timeout_message(const char * command,const char * name1,const char * name2)2290 String timeout_message(const char *command, const char *name1,
2291 const char *name2)
2292 {
2293 String msg;
2294 msg.append("Timeout on ");
2295 msg.append(command);
2296 msg.append(": ");
2297 msg.append(name1);
2298 if (name2 && name2[0])
2299 {
2300 msg.append(".");
2301 msg.append(name2);
2302 }
2303 return msg;
2304 }
2305
2306
2307 /* This is the base class for transactions when interacting with rocksdb.
2308 */
2309 class Rdb_transaction {
2310 protected:
2311 ulonglong m_write_count = 0;
2312 ulonglong m_insert_count = 0;
2313 ulonglong m_update_count = 0;
2314 ulonglong m_delete_count = 0;
2315 ulonglong m_lock_count = 0;
2316 std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;
2317
2318 bool m_is_delayed_snapshot = false;
2319 bool m_is_two_phase = false;
2320
2321 std::unordered_set<Rdb_tbl_def*> modified_tables;
2322
2323 private:
2324 /*
2325 Number of write operations this transaction had when we took the last
2326 savepoint (the idea is not to take another savepoint if we haven't made
2327 any changes)
2328 */
2329 ulonglong m_writes_at_last_savepoint;
2330
2331 protected:
2332
2333 protected:
2334 THD *m_thd = nullptr;
2335
2336 static std::multiset<Rdb_transaction *> s_tx_list;
2337 static mysql_mutex_t s_tx_list_mutex;
2338
2339 Rdb_io_perf *m_tbl_io_perf;
2340
2341 bool m_tx_read_only = false;
2342
2343 int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */
2344
2345 /* Maximum number of locks the transaction can have */
2346 ulonglong m_max_row_locks;
2347
2348 bool m_is_tx_failed = false;
2349 bool m_rollback_only = false;
2350
2351 std::shared_ptr<Rdb_snapshot_notifier> m_notifier;
2352
2353 // This should be used only when updating binlog information.
2354 virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
2355 virtual bool commit_no_binlog() = 0;
2356 virtual rocksdb::Iterator *get_iterator(
2357 const rocksdb::ReadOptions &options,
2358 rocksdb::ColumnFamilyHandle *column_family) = 0;
2359
2360 protected:
2361 /*
2362 The following two are helper functions to be overloaded by child classes.
2363 They should provide RocksDB's savepoint semantics.
2364 */
2365 virtual void do_set_savepoint() = 0;
2366 virtual void do_rollback_to_savepoint() = 0;
2367
2368 /*
2369 @detail
2370 This function takes in the WriteBatch of the transaction to add
2371 all the AUTO_INCREMENT merges. It does so by iterating through
2372 m_auto_incr_map and then constructing key/value pairs to call merge upon.
2373
2374 @param wb
2375 */
merge_auto_incr_map(rocksdb::WriteBatchBase * const wb)2376 rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) {
2377 DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", return rocksdb::Status::OK(););
2378
2379 // Iterate through the merge map merging all keys into data dictionary.
2380 rocksdb::Status s;
2381 for (auto &it : m_auto_incr_map) {
2382 s = dict_manager.put_auto_incr_val(wb, it.first, it.second);
2383 if (!s.ok()) {
2384 return s;
2385 }
2386 }
2387 m_auto_incr_map.clear();
2388 return s;
2389 }
2390
 public:
  rocksdb::ReadOptions m_read_opts;
  // Binlog coordinates of this transaction, recorded at commit time (see
  // commit(), MARIAROCKS_NOT_YET path).
  const char *m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
#ifdef MARIAROCKS_NOT_YET
  // TODO: MariaDB probably doesn't need these at all:
  const char *m_mysql_gtid;
  const char *m_mysql_max_gtid;
#endif
  // Detailed text for the last lock error (see set_status_error()).
  String m_detailed_error;
  // Wall-clock time the current snapshot was taken at; 0 when no snapshot
  // is held (see snapshot_created()/release_snapshot()).
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;
#ifdef MARIAROCKS_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> m_explicit_snapshot;
#endif

  /*
    Tracks the number of tables in use through external_lock.
    This should not be reset during start_tx().
  */
  int64_t m_n_mysql_tables_in_use = 0;

  /*
    MariaDB's group commit:
  */
  bool commit_ordered_done;
  bool commit_ordered_res;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;
2424
init_mutex()2425 static void init_mutex() {
2426 mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
2427 }
2428
term_mutex()2429 static void term_mutex() {
2430 DBUG_ASSERT(s_tx_list.size() == 0);
2431 mysql_mutex_destroy(&s_tx_list_mutex);
2432 }
2433
walk_tx_list(Rdb_tx_list_walker * walker)2434 static void walk_tx_list(Rdb_tx_list_walker *walker) {
2435 DBUG_ASSERT(walker != nullptr);
2436
2437 RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
2438
2439 for (auto it : s_tx_list) {
2440 walker->process_tran(it);
2441 }
2442
2443 RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
2444 }
2445
set_status_error(THD * const thd,const rocksdb::Status & s,const Rdb_key_def & kd,Rdb_tbl_def * const tbl_def,Rdb_table_handler * const table_handler)2446 int set_status_error(THD *const thd, const rocksdb::Status &s,
2447 const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
2448 Rdb_table_handler *const table_handler) {
2449 DBUG_ASSERT(!s.ok());
2450 DBUG_ASSERT(tbl_def != nullptr);
2451
2452 if (s.IsTimedOut()) {
2453 /*
2454 SQL layer has weird expectations. If we return an error when
2455 doing a read in DELETE IGNORE, it will ignore the error ("because it's
2456 an IGNORE command!) but then will fail an assert, because "error code
2457 was returned, but no error happened". Do what InnoDB's
2458 convert_error_code_to_mysql() does: force a statement
2459 rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
2460 */
2461 my_core::thd_mark_transaction_to_rollback(
2462 thd, static_cast<bool>(rocksdb_rollback_on_timeout));
2463 m_detailed_error.copy(timeout_message(
2464 "index", tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
2465 table_handler->m_lock_wait_timeout_counter.inc();
2466 rocksdb_row_lock_wait_timeouts++;
2467
2468 return HA_ERR_LOCK_WAIT_TIMEOUT;
2469 }
2470
2471 if (s.IsDeadlock()) {
2472 my_core::thd_mark_transaction_to_rollback(thd,
2473 true /* whole transaction */);
2474 m_detailed_error = String();
2475 table_handler->m_deadlock_counter.inc();
2476 rocksdb_row_lock_deadlocks++;
2477 return HA_ERR_LOCK_DEADLOCK;
2478 } else if (s.IsBusy()) {
2479 rocksdb_snapshot_conflict_errors++;
2480 if (rocksdb_print_snapshot_conflict_queries) {
2481 char user_host_buff[MAX_USER_HOST_SIZE + 1];
2482 make_user_name(thd, user_host_buff);
2483 // NO_LINT_DEBUG
2484 sql_print_warning(
2485 "Got snapshot conflict errors: User: %s "
2486 "Query: %s",
2487 user_host_buff, thd->query());
2488 }
2489 m_detailed_error = String(" (snapshot conflict)", system_charset_info);
2490 table_handler->m_deadlock_counter.inc();
2491 return HA_ERR_ROCKSDB_STATUS_BUSY;
2492 }
2493
2494 if (s.IsIOError() || s.IsCorruption()) {
2495 rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
2496 }
2497
2498 return ha_rocksdb::rdb_error_to_mysql(s);
2499 }
2500
get_thd() const2501 THD *get_thd() const { return m_thd; }
2502
2503 /* Used for tracking io_perf counters */
io_perf_start(Rdb_io_perf * const io_perf)2504 void io_perf_start(Rdb_io_perf *const io_perf) {
2505 /*
2506 Since perf_context is tracked per thread, it is difficult and expensive
2507 to maintain perf_context on a per table basis. Therefore, roll all
2508 perf_context data into the first table used in a query. This works well
2509 for single table queries and is probably good enough for queries that hit
2510 multiple tables.
2511
2512 perf_context stats gathering is started when the table lock is acquired
2513 or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
2514 are recorded when the table lock is released, or when commit/rollback
2515 is called on the transaction, whichever comes first. Table lock release
2516 and commit/rollback can happen in different orders. In the case where
2517 the lock is released before commit/rollback is called, an extra step to
2518 gather stats during commit/rollback is needed.
2519 */
2520 if (m_tbl_io_perf == nullptr &&
2521 io_perf->start(rocksdb_perf_context_level(m_thd))) {
2522 m_tbl_io_perf = io_perf;
2523 }
2524 }
2525
io_perf_end_and_record(void)2526 void io_perf_end_and_record(void) {
2527 if (m_tbl_io_perf != nullptr) {
2528 m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
2529 m_tbl_io_perf = nullptr;
2530 }
2531 }
2532
io_perf_end_and_record(Rdb_io_perf * const io_perf)2533 void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
2534 if (m_tbl_io_perf == io_perf) {
2535 io_perf_end_and_record();
2536 }
2537 }
2538
update_bytes_written(ulonglong bytes_written)2539 void update_bytes_written(ulonglong bytes_written) {
2540 if (m_tbl_io_perf != nullptr) {
2541 m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd),
2542 bytes_written);
2543 }
2544 }
2545
set_params(int timeout_sec_arg,int max_row_locks_arg)2546 void set_params(int timeout_sec_arg, int max_row_locks_arg) {
2547 m_timeout_sec = timeout_sec_arg;
2548 m_max_row_locks = max_row_locks_arg;
2549 set_lock_timeout(timeout_sec_arg);
2550 }
2551
  virtual void set_lock_timeout(int timeout_sec_arg) = 0;

  // Per-transaction operation counters; reset on commit/rollback.
  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_insert_count() const { return m_insert_count; }

  ulonglong get_update_count() const { return m_update_count; }

  ulonglong get_delete_count() const { return m_delete_count; }

  void incr_insert_count() { ++m_insert_count; }

  void incr_update_count() { ++m_update_count; }

  void incr_delete_count() { ++m_delete_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  // Release the row lock held on rowkey (see Rdb_transaction_impl's
  // UndoGetForUpdate-based implementation).
  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;
2578
commit_or_rollback()2579 bool commit_or_rollback() {
2580 bool res;
2581 if (m_is_tx_failed) {
2582 rollback();
2583 res = false;
2584 } else {
2585 res = commit();
2586 }
2587 return res;
2588 }
2589
commit()2590 bool commit() {
2591 if (get_write_count() == 0) {
2592 rollback();
2593 return false;
2594 } else if (m_rollback_only) {
2595 /*
2596 Transactions marked as rollback_only are expected to be rolled back at
2597 prepare(). But there are some exceptions like below that prepare() is
2598 never called and commit() is called instead.
2599 1. Binlog is disabled
2600 2. No modification exists in binlog cache for the transaction (#195)
2601 In both cases, rolling back transaction is safe. Nothing is written to
2602 binlog.
2603 */
2604 my_error(ER_ROLLBACK_ONLY, MYF(0));
2605 rollback();
2606 return true;
2607 } else {
2608 #ifdef MARIAROCKS_NOT_YET
2609 /*
2610 Storing binlog position inside MyRocks is needed only for restoring
2611 MyRocks from backups. This feature is not supported yet.
2612 */
2613 mysql_bin_log_commit_pos(m_thd, &m_mysql_log_offset,
2614 &m_mysql_log_file_name);
2615 binlog_manager.update(m_mysql_log_file_name, m_mysql_log_offset,
2616 get_write_batch());
2617 #endif
2618 return commit_no_binlog();
2619 }
2620 }
2621
2622 virtual void rollback() = 0;
2623
snapshot_created(const rocksdb::Snapshot * const snapshot)2624 void snapshot_created(const rocksdb::Snapshot *const snapshot) {
2625 DBUG_ASSERT(snapshot != nullptr);
2626
2627 m_read_opts.snapshot = snapshot;
2628 rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
2629 m_is_delayed_snapshot = false;
2630 }
2631
2632 virtual void acquire_snapshot(bool acquire_now) = 0;
2633 virtual void release_snapshot() = 0;
2634
has_snapshot() const2635 bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
2636
 private:
  // The Rdb_sst_info structures we are currently loading. In a partitioned
  // table this can have more than one entry
  std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
  // Base name of the table the current bulk load targets; switching to a
  // different table finishes the in-flight load (see start_bulk_load()).
  std::string m_curr_bulk_load_tablename;

  /* External merge sorts for bulk load: key ID -> merge sort instance */
  std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;
2645
2646 public:
get_key_merge(GL_INDEX_ID kd_gl_id,rocksdb::ColumnFamilyHandle * cf,Rdb_index_merge ** key_merge)2647 int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf,
2648 Rdb_index_merge **key_merge) {
2649 int res;
2650 auto it = m_key_merge.find(kd_gl_id);
2651 if (it == m_key_merge.end()) {
2652 m_key_merge.emplace(
2653 std::piecewise_construct, std::make_tuple(kd_gl_id),
2654 std::make_tuple(
2655 get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size),
2656 THDVAR(get_thd(), merge_combine_read_size),
2657 THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf));
2658 it = m_key_merge.find(kd_gl_id);
2659 if ((res = it->second.init()) != 0) {
2660 return res;
2661 }
2662 }
2663 *key_merge = &it->second;
2664 return HA_EXIT_SUCCESS;
2665 }
2666
2667 /* Finish bulk loading for all table handlers belongs to one connection */
finish_bulk_load(bool * is_critical_error=nullptr,int print_client_error=true)2668 int finish_bulk_load(bool *is_critical_error = nullptr,
2669 int print_client_error = true) {
2670 Ensure_cleanup cleanup([&]() {
2671 // Always clear everything regardless of success/failure
2672 m_curr_bulk_load.clear();
2673 m_curr_bulk_load_tablename.clear();
2674 m_key_merge.clear();
2675 });
2676
2677 int rc = 0;
2678 if (is_critical_error) {
2679 *is_critical_error = true;
2680 }
2681
2682 // PREPARE phase: finish all on-going bulk loading Rdb_sst_info and
2683 // collect all Rdb_sst_commit_info containing (SST files, cf)
2684 int rc2 = 0;
2685 std::vector<Rdb_sst_info::Rdb_sst_commit_info> sst_commit_list;
2686 sst_commit_list.reserve(m_curr_bulk_load.size());
2687
2688 for (auto &sst_info : m_curr_bulk_load) {
2689 Rdb_sst_info::Rdb_sst_commit_info commit_info;
2690
2691 // Commit the list of SST files and move it to the end of
2692 // sst_commit_list, effectively transfer the ownership over
2693 rc2 = sst_info->finish(&commit_info, print_client_error);
2694 if (rc2 && rc == 0) {
2695 // Don't return yet - make sure we finish all the SST infos
2696 rc = rc2;
2697 }
2698
2699 // Make sure we have work to do - we might be losing the race
2700 if (rc2 == 0 && commit_info.has_work()) {
2701 sst_commit_list.emplace_back(std::move(commit_info));
2702 DBUG_ASSERT(!commit_info.has_work());
2703 }
2704 }
2705
2706 if (rc) {
2707 return rc;
2708 }
2709
2710 // MERGING Phase: Flush the index_merge sort buffers into SST files in
2711 // Rdb_sst_info and collect all Rdb_sst_commit_info containing
2712 // (SST files, cf)
2713 if (!m_key_merge.empty()) {
2714 Ensure_cleanup malloc_cleanup([]() {
2715 /*
2716 Explicitly tell jemalloc to clean up any unused dirty pages at this
2717 point.
2718 See https://reviews.facebook.net/D63723 for more details.
2719 */
2720 purge_all_jemalloc_arenas();
2721 });
2722
2723 rocksdb::Slice merge_key;
2724 rocksdb::Slice merge_val;
2725 for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
2726 GL_INDEX_ID index_id = it->first;
2727 std::shared_ptr<const Rdb_key_def> keydef =
2728 ddl_manager.safe_find(index_id);
2729 std::string table_name = ddl_manager.safe_get_table_name(index_id);
2730
2731 // Unable to find key definition or table name since the
2732 // table could have been dropped.
2733 // TODO(herman): there is a race here between dropping the table
2734 // and detecting a drop here. If the table is dropped while bulk
2735 // loading is finishing, these keys being added here may
2736 // be missed by the compaction filter and not be marked for
2737 // removal. It is unclear how to lock the sql table from the storage
2738 // engine to prevent modifications to it while bulk load is occurring.
2739 if (keydef == nullptr) {
2740 if (is_critical_error) {
2741 // We used to set the error but simply ignores it. This follows
2742 // current behavior and we should revisit this later
2743 *is_critical_error = false;
2744 }
2745 return HA_ERR_KEY_NOT_FOUND;
2746 } else if (table_name.empty()) {
2747 if (is_critical_error) {
2748 // We used to set the error but simply ignores it. This follows
2749 // current behavior and we should revisit this later
2750 *is_critical_error = false;
2751 }
2752 return HA_ERR_NO_SUCH_TABLE;
2753 }
2754 const std::string &index_name = keydef->get_name();
2755 Rdb_index_merge &rdb_merge = it->second;
2756
2757 // Rdb_sst_info expects a denormalized table name in the form of
2758 // "./database/table"
2759 std::replace(table_name.begin(), table_name.end(), '.', '/');
2760 table_name = "./" + table_name;
2761 auto sst_info = std::make_shared<Rdb_sst_info>(
2762 rdb, table_name, index_name, rdb_merge.get_cf(),
2763 *rocksdb_db_options, THDVAR(get_thd(), trace_sst_api));
2764
2765 while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
2766 if ((rc2 = sst_info->put(merge_key, merge_val)) != 0) {
2767 rc = rc2;
2768
2769 // Don't return yet - make sure we finish the sst_info
2770 break;
2771 }
2772 }
2773
2774 // -1 => no more items
2775 if (rc2 != -1 && rc != 0) {
2776 rc = rc2;
2777 }
2778
2779 Rdb_sst_info::Rdb_sst_commit_info commit_info;
2780 rc2 = sst_info->finish(&commit_info, print_client_error);
2781 if (rc2 != 0 && rc == 0) {
2782 // Only set the error from sst_info->finish if finish failed and we
2783 // didn't fail before. In other words, we don't have finish's
2784 // success mask earlier failures
2785 rc = rc2;
2786 }
2787
2788 if (rc) {
2789 return rc;
2790 }
2791
2792 if (commit_info.has_work()) {
2793 sst_commit_list.emplace_back(std::move(commit_info));
2794 DBUG_ASSERT(!commit_info.has_work());
2795 }
2796 }
2797 }
2798
2799 // Early return in case we lost the race completely and end up with no
2800 // work at all
2801 if (sst_commit_list.size() == 0) {
2802 return rc;
2803 }
2804
2805 // INGEST phase: Group all Rdb_sst_commit_info by cf (as they might
2806 // have the same cf across different indexes) and call out to RocksDB
2807 // to ingest all SST files in one atomic operation
2808 rocksdb::IngestExternalFileOptions options;
2809 options.move_files = true;
2810 options.snapshot_consistency = false;
2811 options.allow_global_seqno = false;
2812 options.allow_blocking_flush = false;
2813
2814 std::map<rocksdb::ColumnFamilyHandle *, rocksdb::IngestExternalFileArg>
2815 arg_map;
2816
2817 // Group by column_family
2818 for (auto &commit_info : sst_commit_list) {
2819 if (arg_map.find(commit_info.get_cf()) == arg_map.end()) {
2820 rocksdb::IngestExternalFileArg arg;
2821 arg.column_family = commit_info.get_cf(),
2822 arg.external_files = commit_info.get_committed_files(),
2823 arg.options = options;
2824
2825 arg_map.emplace(commit_info.get_cf(), arg);
2826 } else {
2827 auto &files = arg_map[commit_info.get_cf()].external_files;
2828 files.insert(files.end(), commit_info.get_committed_files().begin(),
2829 commit_info.get_committed_files().end());
2830 }
2831 }
2832
2833 std::vector<rocksdb::IngestExternalFileArg> args;
2834 size_t file_count = 0;
2835 for (auto &cf_files_pair : arg_map) {
2836 args.push_back(cf_files_pair.second);
2837 file_count += cf_files_pair.second.external_files.size();
2838 }
2839
2840 const rocksdb::Status s = rdb->IngestExternalFiles(args);
2841 if (THDVAR(m_thd, trace_sst_api)) {
2842 // NO_LINT_DEBUG
2843 sql_print_information(
2844 "SST Tracing: IngestExternalFile '%zu' files returned %s", file_count,
2845 s.ok() ? "ok" : "not ok");
2846 }
2847
2848 if (!s.ok()) {
2849 if (print_client_error) {
2850 Rdb_sst_info::report_error_msg(s, nullptr);
2851 }
2852 return HA_ERR_ROCKSDB_BULK_LOAD;
2853 }
2854
2855 // COMMIT phase: mark everything as completed. This avoids SST file
2856 // deletion kicking in. Otherwise SST files would get deleted if this
2857 // entire operation is aborted
2858 for (auto &commit_info : sst_commit_list) {
2859 commit_info.commit();
2860 }
2861
2862 return rc;
2863 }
2864
start_bulk_load(ha_rocksdb * const bulk_load,std::shared_ptr<Rdb_sst_info> sst_info)2865 int start_bulk_load(ha_rocksdb *const bulk_load,
2866 std::shared_ptr<Rdb_sst_info> sst_info) {
2867 /*
2868 If we already have an open bulk load of a table and the name doesn't
2869 match the current one, close out the currently running one. This allows
2870 multiple bulk loads to occur on a partitioned table, but then closes
2871 them all out when we switch to another table.
2872 */
2873 DBUG_ASSERT(bulk_load != nullptr);
2874
2875 if (!m_curr_bulk_load.empty() &&
2876 bulk_load->get_table_basename() != m_curr_bulk_load_tablename) {
2877 const auto res = finish_bulk_load();
2878 if (res != HA_EXIT_SUCCESS) {
2879 return res;
2880 }
2881 }
2882
2883 /*
2884 This used to track ha_rocksdb handler objects, but those can be
2885 freed by the table cache while this was referencing them. Instead
2886 of tracking ha_rocksdb handler objects, this now tracks the
2887 Rdb_sst_info allocated, and both the ha_rocksdb handler and the
2888 Rdb_transaction both have shared pointers to them.
2889
2890 On transaction complete, it will commit each Rdb_sst_info structure found.
2891 If the ha_rocksdb object is freed, etc., it will also commit
2892 the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent.
2893 */
2894 m_curr_bulk_load.push_back(sst_info);
2895 m_curr_bulk_load_tablename = bulk_load->get_table_basename();
2896 return HA_EXIT_SUCCESS;
2897 }
2898
num_ongoing_bulk_load() const2899 int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
2900
get_rocksdb_tmpdir() const2901 const char *get_rocksdb_tmpdir() const {
2902 const char *tmp_dir = THDVAR(get_thd(), tmpdir);
2903
2904 /*
2905 We want to treat an empty string as nullptr, in these cases DDL operations
2906 will use the default --tmpdir passed to mysql instead.
2907 */
2908 if (tmp_dir != nullptr && *tmp_dir == '\0') {
2909 tmp_dir = nullptr;
2910 }
2911 return (tmp_dir);
2912 }
2913
2914 /*
2915 Flush the data accumulated so far. This assumes we're doing a bulk insert.
2916
2917 @detail
2918 This should work like transaction commit, except that we don't
2919 synchronize with the binlog (there is no API that would allow to have
2920 binlog flush the changes accumulated so far and return its current
2921 position)
2922
2923 @todo
2924 Add test coverage for what happens when somebody attempts to do bulk
2925 inserts while inside a multi-statement transaction.
2926 */
flush_batch()2927 bool flush_batch() {
2928 if (get_write_count() == 0) return false;
2929
2930 /* Commit the current transaction */
2931 if (commit_no_binlog()) return true;
2932
2933 /* Start another one */
2934 start_tx();
2935 return false;
2936 }
2937
set_auto_incr(const GL_INDEX_ID & gl_index_id,ulonglong curr_id)2938 void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) {
2939 m_auto_incr_map[gl_index_id] =
2940 std::max(m_auto_incr_map[gl_index_id], curr_id);
2941 }
2942
#ifndef DBUG_OFF
  // Debug-only accessor: the auto-increment value recorded for the index in
  // this transaction, or 0 if none was recorded.
  ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) {
    if (m_auto_incr_map.count(gl_index_id) > 0) {
      return m_auto_incr_map[gl_index_id];
    }
    return 0;
  }
#endif
2951
  // Row write primitives, implemented by the transaction/writebatch
  // subclasses.
  virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value,
                              const bool assume_tracked) = 0;
  virtual rocksdb::Status delete_key(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;
  virtual rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;

  virtual bool has_modifications() const = 0;

  virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes will NOT be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_blind_write_batch() {
    return get_indexed_write_batch()->GetWriteBatch();
  }
2973
  virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              rocksdb::PinnableSlice *const value) const = 0;
  virtual rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool exclusive, const bool do_validate) = 0;

  /*
    Create an iterator over column_family using this transaction's read
    options.

    NOTE: when skip_bloom_filter is set, the returned iterator keeps
    pointers to eq_cond_lower_bound/eq_cond_upper_bound; the caller must
    keep those slices alive for the iterator's whole lifetime.
  */
  rocksdb::Iterator *get_iterator(
      rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
      bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound,
      const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
      bool create_snapshot = true) {
    // Make sure we are not doing both read_current (which implies we don't
    // want a snapshot) and create_snapshot which makes sure we create
    // a snapshot
    DBUG_ASSERT(column_family != nullptr);
    DBUG_ASSERT(!read_current || !create_snapshot);

    if (create_snapshot) acquire_snapshot(true);

    rocksdb::ReadOptions options = m_read_opts;

    if (skip_bloom_filter) {
      options.total_order_seek = true;
      options.iterate_lower_bound = &eq_cond_lower_bound;
      options.iterate_upper_bound = &eq_cond_upper_bound;
    } else {
      // With this option, Iterator::Valid() returns false if key
      // is outside of the prefix bloom filter range set at Seek().
      // Must not be set to true if not using bloom filter.
      options.prefix_same_as_start = true;
    }
    options.fill_cache = fill_cache;
    if (read_current) {
      options.snapshot = nullptr;
    }
    return get_iterator(options, column_family);
  }
3013
  virtual bool is_tx_started() const = 0;
  virtual void start_tx() = 0;
  virtual void start_stmt() = 0;

 protected:
  // Non-virtual functions with actions to be done on transaction start and
  // commit.
  // Stamp the current time as update time on every table this transaction
  // modified, then forget the set.
  void on_commit() {
    time_t tm;
    tm = time(nullptr);
    for (auto &it : modified_tables) {
      it->m_update_time = tm;
    }
    modified_tables.clear();
  }
  // Discard the modified-tables set without touching the update times.
  void on_rollback() {
    modified_tables.clear();
  }
 public:
  // Inform the transaction that this table was modified
  void log_table_write_op(Rdb_tbl_def *tbl) {
    modified_tables.insert(tbl);
  }
3037
set_initial_savepoint()3038 void set_initial_savepoint() {
3039 /*
3040 Set the initial savepoint. If the first statement in the transaction
3041 fails, we need something to roll back to, without rolling back the
3042 entire transaction.
3043 */
3044 do_set_savepoint();
3045 m_writes_at_last_savepoint = m_write_count;
3046 }
3047
3048 /*
3049 Called when a "top-level" statement inside a transaction completes
3050 successfully and its changes become part of the transaction's changes.
3051 */
make_stmt_savepoint_permanent()3052 int make_stmt_savepoint_permanent() {
3053 // Take another RocksDB savepoint only if we had changes since the last
3054 // one. This is very important for long transactions doing lots of
3055 // SELECTs.
3056 if (m_writes_at_last_savepoint != m_write_count) {
3057 rocksdb::WriteBatchBase *batch = get_write_batch();
3058 rocksdb::Status status = rocksdb::Status::NotFound();
3059 while ((status = batch->PopSavePoint()) == rocksdb::Status::OK()) {
3060 }
3061
3062 if (status != rocksdb::Status::NotFound()) {
3063 return HA_EXIT_FAILURE;
3064 }
3065
3066 do_set_savepoint();
3067 m_writes_at_last_savepoint = m_write_count;
3068 }
3069
3070 return HA_EXIT_SUCCESS;
3071 }
3072
3073 /*
3074 Rollback to the savepoint we've set before the last statement
3075 */
rollback_to_stmt_savepoint()3076 void rollback_to_stmt_savepoint() {
3077 if (m_writes_at_last_savepoint != m_write_count) {
3078 do_rollback_to_savepoint();
3079 /*
3080 RollbackToSavePoint "removes the most recent SetSavePoint()", so
3081 we need to set it again so that next statement can roll back to this
3082 stage.
3083 It's ok to do it here at statement end (instead of doing it at next
3084 statement start) because setting a savepoint is cheap.
3085 */
3086 do_set_savepoint();
3087 m_writes_at_last_savepoint = m_write_count;
3088 }
3089 }
3090
  virtual void rollback_stmt() = 0;

  void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }

  // A transaction marked rollback-only must not go through 2PC prepare.
  bool can_prepare() const {
    if (m_rollback_only) {
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      return false;
    }
    return true;
  }

  // Rolling back to an explicit savepoint is not supported once the
  // transaction has modifications: the whole transaction is flagged
  // rollback-only instead. The savepoint handle itself is unused here.
  int rollback_to_savepoint(void *const savepoint) {
    if (has_modifications()) {
      my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0));
      m_rollback_only = true;
      return HA_EXIT_FAILURE;
    }
    return HA_EXIT_SUCCESS;
  }
3111
3112 /*
3113 This is used by transactions started with "START TRANSACTION WITH "
3114 "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
3115 snapshot has to be created via DB::GetSnapshot(), not via Transaction
3116 API.
3117 */
is_tx_read_only() const3118 bool is_tx_read_only() const { return m_tx_read_only; }
3119
is_two_phase() const3120 bool is_two_phase() const { return m_is_two_phase; }
3121
set_tx_read_only(bool val)3122 void set_tx_read_only(bool val) { m_tx_read_only = val; }
3123
Rdb_transaction(THD * const thd)3124 explicit Rdb_transaction(THD *const thd)
3125 : m_thd(thd), m_tbl_io_perf(nullptr) {
3126 RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
3127 s_tx_list.insert(this);
3128 RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
3129 }
3130
~Rdb_transaction()3131 virtual ~Rdb_transaction() {
3132 RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
3133 s_tx_list.erase(this);
3134 RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
3135 }
3136 };
3137
3138 /*
3139 This is a rocksdb transaction. Its members represent the current transaction,
3140 which consists of:
3141 - the snapshot
3142 - the changes we've made but are not seeing yet.
3143
3144 The changes are made to individual tables, which store them here and then
3145 this object commits them on commit.
3146 */
3147 class Rdb_transaction_impl : public Rdb_transaction {
3148 rocksdb::Transaction *m_rocksdb_tx = nullptr;
3149 rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;
3150
3151 public:
set_lock_timeout(int timeout_sec_arg)3152 void set_lock_timeout(int timeout_sec_arg) override {
3153 if (m_rocksdb_tx) {
3154 m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
3155 }
3156 }
3157
set_sync(bool sync)3158 void set_sync(bool sync) override {
3159 if (m_rocksdb_tx)
3160 m_rocksdb_tx->GetWriteOptions()->sync = sync;
3161 }
3162
release_lock(rocksdb::ColumnFamilyHandle * const column_family,const std::string & rowkey)3163 void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
3164 const std::string &rowkey) override {
3165 if (!THDVAR(m_thd, lock_scanned_rows)) {
3166 m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
3167 }
3168 }
3169
is_writebatch_trx() const3170 virtual bool is_writebatch_trx() const override { return false; }
3171
 private:
  // Hand the finished rocksdb::Transaction object over for reuse by the
  // next start_tx() instead of deleting it.
  void release_tx(void) {
    // We are done with the current active transaction object. Preserve it
    // for later reuse.
    DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }

  /*
    XA prepare: name the transaction, merge pending auto-increment values
    into its write batch, then Prepare() it. On any failure the status is
    reported through rdb_handle_io_error() and false is returned.
  */
  bool prepare(const rocksdb::TransactionName &name) override {
    rocksdb::Status s;
    s = m_rocksdb_tx->SetName(name);
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = m_rocksdb_tx->Prepare();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }
    return true;
  }
3202
commit_no_binlog()3203 bool commit_no_binlog() override {
3204 bool res = false;
3205 rocksdb::Status s;
3206
3207 s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
3208 if (!s.ok()) {
3209 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
3210 res = true;
3211 goto error;
3212 }
3213
3214 release_snapshot();
3215 s = m_rocksdb_tx->Commit();
3216 if (!s.ok()) {
3217 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
3218 res = true;
3219 goto error;
3220 }
3221
3222 on_commit();
3223 error:
3224 on_rollback();
3225 /* Save the transaction object to be reused */
3226 release_tx();
3227
3228 m_write_count = 0;
3229 m_insert_count = 0;
3230 m_update_count = 0;
3231 m_delete_count = 0;
3232 m_lock_count = 0;
3233 set_tx_read_only(false);
3234 m_rollback_only = false;
3235 return res;
3236 }
3237
 public:
  // Roll back all changes, reset per-transaction counters and state, and
  // hand the RocksDB transaction object back for reuse.
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }
3260
/*
  Acquire a read snapshot if this transaction does not have one yet.

  @param acquire_now  true  - take the snapshot immediately;
                      false - defer it to the next RocksDB operation
                              (SetSnapshotOnNextOperation + m_notifier).

  Read-only transactions take a plain DB-level snapshot instead of a
  transaction-bound one.
*/
void acquire_snapshot(bool acquire_now) override {
  if (m_read_opts.snapshot == nullptr) {
#ifdef MARIAROCKS_NOT_YET
    const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
        m_thd->get_explicit_snapshot());
    if (thd_ss) {
      m_explicit_snapshot = thd_ss;
    }
    if (m_explicit_snapshot) {
      auto snapshot = m_explicit_snapshot->get_snapshot()->snapshot();
      snapshot_created(snapshot);
    } else
#endif
    if (is_tx_read_only()) {
      snapshot_created(rdb->GetSnapshot());
    } else if (acquire_now) {
      m_rocksdb_tx->SetSnapshot();
      snapshot_created(m_rocksdb_tx->GetSnapshot());
    } else if (!m_is_delayed_snapshot) {
      // Ask RocksDB to create the snapshot lazily and notify us via
      // m_notifier when it happens.
      m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
      m_is_delayed_snapshot = true;
    }
  }
}
3285
/*
  Release the snapshot currently held by this transaction (if any).
  A snapshot obtained via rdb->GetSnapshot() (read-only tx) is returned
  to the DB; a snapshot owned by the rocksdb::Transaction is dropped via
  ClearSnapshot(). Also clears a pending delayed-snapshot request.
*/
void release_snapshot() override {
  bool need_clear = m_is_delayed_snapshot;

  if (m_read_opts.snapshot != nullptr) {
    m_snapshot_timestamp = 0;
#ifdef MARIAROCKS_NOT_YET
    if (m_explicit_snapshot) {
      m_explicit_snapshot.reset();
      need_clear = false;
    } else
#endif
    if (is_tx_read_only()) {
      rdb->ReleaseSnapshot(m_read_opts.snapshot);
      need_clear = false;
    } else {
      need_clear = true;
    }
    m_read_opts.snapshot = nullptr;
  }

  if (need_clear && m_rocksdb_tx != nullptr) m_rocksdb_tx->ClearSnapshot();
}
3308
has_snapshot()3309 bool has_snapshot() { return m_read_opts.snapshot != nullptr; }
3310
put(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const rocksdb::Slice & value,const bool assume_tracked)3311 rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
3312 const rocksdb::Slice &key, const rocksdb::Slice &value,
3313 const bool assume_tracked) override {
3314 ++m_write_count;
3315 ++m_lock_count;
3316 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
3317 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
3318 }
3319 return m_rocksdb_tx->Put(column_family, key, value, assume_tracked);
3320 }
3321
delete_key(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const bool assume_tracked)3322 rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
3323 const rocksdb::Slice &key,
3324 const bool assume_tracked) override {
3325 ++m_write_count;
3326 ++m_lock_count;
3327 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
3328 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
3329 }
3330 return m_rocksdb_tx->Delete(column_family, key, assume_tracked);
3331 }
3332
single_delete(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const bool assume_tracked)3333 rocksdb::Status single_delete(
3334 rocksdb::ColumnFamilyHandle *const column_family,
3335 const rocksdb::Slice &key, const bool assume_tracked) override {
3336 ++m_write_count;
3337 ++m_lock_count;
3338 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
3339 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
3340 }
3341 return m_rocksdb_tx->SingleDelete(column_family, key, assume_tracked);
3342 }
3343
has_modifications() const3344 bool has_modifications() const override {
3345 return m_rocksdb_tx->GetWriteBatch() &&
3346 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
3347 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
3348 }
3349
get_write_batch()3350 rocksdb::WriteBatchBase *get_write_batch() override {
3351 if (is_two_phase()) {
3352 return m_rocksdb_tx->GetCommitTimeWriteBatch();
3353 }
3354 return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
3355 }
3356
3357 /*
3358 Return a WriteBatch that one can write to. The writes will skip any
3359 transaction locking. The writes WILL be visible to the transaction.
3360 */
get_indexed_write_batch()3361 rocksdb::WriteBatchBase *get_indexed_write_batch() override {
3362 ++m_write_count;
3363 return m_rocksdb_tx->GetWriteBatch();
3364 }
3365
/*
  Point lookup through the transaction: sees the transaction's own
  pending writes in addition to the snapshot.
*/
rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                    const rocksdb::Slice &key,
                    rocksdb::PinnableSlice *const value) const override {
  // clean PinnableSlice right before Get() for multiple gets per statement
  // the resources after the last Get in a statement are cleared in
  // handler::reset call
  value->Reset();
  global_stats.queries[QUERIES_POINT].inc();
  return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
}
3376
/*
  Locking read: fetch a key and lock it within the transaction.

  @param exclusive    passed through to rocksdb GetForUpdate (exclusive vs
                      shared lock request)
  @param do_validate  whether RocksDB should validate the read against the
                      snapshot

  Counts against the row-lock budget (m_max_row_locks).
*/
rocksdb::Status get_for_update(
    rocksdb::ColumnFamilyHandle *const column_family,
    const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
    bool exclusive, const bool do_validate) override {
  if (++m_lock_count > m_max_row_locks) {
    return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
  }

  if (value != nullptr) {
    value->Reset();
  }
  rocksdb::Status s;
  // If snapshot is null, pass it to GetForUpdate and snapshot is
  // initialized there. Snapshot validation is skipped in that case.
  if (m_read_opts.snapshot == nullptr || do_validate) {
    s = m_rocksdb_tx->GetForUpdate(
        m_read_opts, column_family, key, value, exclusive,
        m_read_opts.snapshot ? do_validate : false);
  } else {
    // If snapshot is set, and if skipping validation,
    // call GetForUpdate without validation and set back old snapshot
    auto saved_snapshot = m_read_opts.snapshot;
    m_read_opts.snapshot = nullptr;
    s = m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                   exclusive, false);
    m_read_opts.snapshot = saved_snapshot;
  }
  return s;
}
3406
/* Create a range iterator via the transaction (counted in range-query
   statistics). */
rocksdb::Iterator *get_iterator(
    const rocksdb::ReadOptions &options,
    rocksdb::ColumnFamilyHandle *const column_family) override {
  global_stats.queries[QUERIES_RANGE].inc();
  auto *const iter = m_rocksdb_tx->GetIterator(options, column_family);
  return iter;
}
3413
get_rdb_trx() const3414 const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
3415
is_tx_started() const3416 bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }
3417
/*
  Begin a new RocksDB transaction (or reuse a saved one), initializing
  transaction and write options from session variables and globals.
*/
void start_tx() override {
  rocksdb::TransactionOptions tx_opts;
  rocksdb::WriteOptions write_opts;
  // Do not take a snapshot at Begin; see acquire_snapshot().
  tx_opts.set_snapshot = false;
  tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
  tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
  tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
  // If this variable is set, this will write commit time write batch
  // information on recovery or memtable flush.
  tx_opts.use_only_the_last_commit_time_batch_for_recovery =
      THDVAR(m_thd, commit_time_batch_for_recovery);
  tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);

  write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
  write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
  write_opts.ignore_missing_column_families =
      THDVAR(m_thd, write_ignore_missing_column_families);
  m_is_two_phase = rocksdb_enable_2pc;

  commit_ordered_done= false;

  /*
    If m_rocksdb_reuse_tx is null this will create a new transaction object.
    Otherwise it will reuse the existing one.
  */
  m_rocksdb_tx =
      rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
  m_rocksdb_reuse_tx = nullptr;

  m_read_opts = rocksdb::ReadOptions();

  set_initial_savepoint();

  m_ddl_transaction = false;
}
3453
/* Implementations of do_*savepoint based on rocksdb::Transaction savepoints
*/
do_set_savepoint()3456 void do_set_savepoint() override { m_rocksdb_tx->SetSavePoint(); }
3457
/* Roll the underlying rocksdb::Transaction back to the last savepoint. */
void do_rollback_to_savepoint() override {
  m_rocksdb_tx->RollbackToSavePoint();
}
3461
/*
  Start a statement inside a multi-statement transaction.

  @todo: are we sure this is called once (and not several times) per
  statement start?

  For hooking to start of statement that is its own transaction, see
  ha_rocksdb::external_lock().
*/
void start_stmt() override {
  // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
  acquire_snapshot(false);
}
3475
/*
  This must be called when last statement is rolled back, but the transaction
  continues
*/
void rollback_stmt() override {
  /* TODO: here we must release the locks taken since the start_stmt() call */
  if (m_rocksdb_tx) {
    const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
    rollback_to_stmt_savepoint();

    // Rolling back to the savepoint may have changed the transaction's
    // snapshot; keep m_read_opts and the snapshot timestamp in sync.
    const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
    if (org_snapshot != cur_snapshot) {
      if (org_snapshot != nullptr) m_snapshot_timestamp = 0;

      m_read_opts.snapshot = cur_snapshot;
      if (cur_snapshot != nullptr) {
        rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
      } else {
        // The snapshot is gone; re-arm delayed acquisition.
        m_is_delayed_snapshot = true;
      }
    }
  }
}
3499
/*
  Construct the transaction wrapper for the given THD. The underlying
  rocksdb::Transaction is created later, in start_tx().
*/
explicit Rdb_transaction_impl(THD *const thd)
    : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
  // Create a notifier that can be called when a snapshot gets generated.
  m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
}
3505
/* Destructor: roll back anything uncommitted and release resources. */
virtual ~Rdb_transaction_impl() override {
  rollback();

  // Theoretically the notifier could outlive the Rdb_transaction_impl
  // (because of the shared_ptr), so let it know it can't reference
  // the transaction anymore.
  m_notifier->detach();

  // Free any transaction memory that is still hanging around.
  delete m_rocksdb_reuse_tx;
  DBUG_ASSERT(m_rocksdb_tx == nullptr);
}
3518 };
3519
/* This is a rocksdb write batch. This class doesn't hold or wait on any
   transaction locks (it skips the rocksdb transaction API), thus giving
   better performance.

   Currently this is only used for replication threads, which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through carefully.
*/
class Rdb_writebatch_impl : public Rdb_transaction {
  // Indexed write batch that accumulates this "transaction"'s writes.
  rocksdb::WriteBatchWithIndex *m_batch;
  // Options used when the batch is finally written to the DB.
  rocksdb::WriteOptions write_opts;
  // Called after commit/rollback.
  void reset() {
    m_batch->Clear();
    m_read_opts = rocksdb::ReadOptions();
    m_ddl_transaction = false;
  }

 private:
  // Write batches bypass the transaction API, so there is nothing to
  // prepare for two-phase commit; always succeed.
  bool prepare(const rocksdb::TransactionName &name) override { return true; }

  // Write the accumulated batch to the DB, skipping concurrency control.
  // Returns true on error (after routing the status through
  // rdb_handle_io_error()).
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;
    rocksdb::TransactionDBWriteOptimizations optimize;
    optimize.skip_concurrency_control = true;

    // Fold cached auto-increment values into the batch before writing.
    s = merge_auto_incr_map(m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();

    s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }
    on_commit();
  error:
    // Executed on both the success and the error path.
    on_rollback();
    reset();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

  /* Implementations of do_*savepoint based on rocksdb::WriteBatch savepoints */
  void do_set_savepoint() override { m_batch->SetSavePoint(); }

  void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); }

 public:
  bool is_writebatch_trx() const override { return true; }

  void set_lock_timeout(int timeout_sec_arg) override {
    // Nothing to do here.
  }

  void set_sync(bool sync) override { write_opts.sync = sync; }

  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    // Nothing to do here since we don't hold any row locks.
  }

  // Discard all batched writes and reset per-transaction state.
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    release_snapshot();

    reset();
    set_tx_read_only(false);
    m_rollback_only = false;
  }

  // Snapshots are taken directly from the DB; there is no transaction
  // object to defer to, so acquire_now is effectively ignored.
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) snapshot_created(rdb->GetSnapshot());
  }

  void release_snapshot() override {
    if (m_read_opts.snapshot != nullptr) {
      rdb->ReleaseSnapshot(m_read_opts.snapshot);
      m_read_opts.snapshot = nullptr;
    }
  }

  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key, const rocksdb::Slice &value,
                      const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Put(column_family, key, value);
    // Note Put/Delete in write batch doesn't return any error code. We simply
    // return OK here.
    return rocksdb::Status::OK();
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key,
                             const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Delete(column_family, key);
    return rocksdb::Status::OK();
  }

  rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool /* assume_tracked */) override {
    ++m_write_count;
    m_batch->SingleDelete(column_family, key);
    return rocksdb::Status::OK();
  }

  bool has_modifications() const override {
    return m_batch->GetWriteBatch()->Count() > 0;
  }

  rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }

  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_batch;
  }

  // Reads see both the batch's own pending writes and the DB.
  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    value->Reset();
    return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
                                      value);
  }

  // No locking is possible with a plain write batch, so this degenerates
  // to a plain get(); exclusive/do_validate are ignored.
  rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool /* exclusive */, const bool /* do_validate */) override {
    if (value == nullptr) {
      // get() dereferences its output slice, so supply a scratch one when
      // the caller passed nullptr (existence-only lookup).
      rocksdb::PinnableSlice pin_val;
      rocksdb::Status s = get(column_family, key, &pin_val);
      pin_val.Reset();
      return s;
    }

    return get(column_family, key, value);
  }

  // Iterate over the DB overlaid with the batch's pending writes; the
  // column_family argument is ignored (iterator is over the default CF).
  rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *const /* column_family */) override {
    const auto it = rdb->NewIterator(options);
    return m_batch->NewIteratorWithBase(it);
  }

  bool is_tx_started() const override { return (m_batch != nullptr); }

  void start_tx() override {
    commit_ordered_done= false; // Do we need this here?
    reset();
    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);

    set_initial_savepoint();
  }

  void start_stmt() override {}

  void rollback_stmt() override {
    if (m_batch) rollback_to_stmt_savepoint();
  }

  explicit Rdb_writebatch_impl(THD *const thd)
      : Rdb_transaction(thd), m_batch(nullptr) {
    m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
                                               true);
  }

  virtual ~Rdb_writebatch_impl() override {
    rollback();
    delete m_batch;
  }
};
3717
SnapshotCreated(const rocksdb::Snapshot * const snapshot)3718 void Rdb_snapshot_notifier::SnapshotCreated(
3719 const rocksdb::Snapshot *const snapshot) {
3720 if (m_owning_tx != nullptr) {
3721 m_owning_tx->snapshot_created(snapshot);
3722 }
3723 }
3724
// Registry of all live transactions. NOTE(review): presumably guarded by
// s_tx_list_mutex below — verify at the use sites.
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3727
get_tx_from_thd(THD * const thd)3728 static Rdb_transaction *get_tx_from_thd(THD *const thd) {
3729 return reinterpret_cast<Rdb_transaction *>(
3730 my_core::thd_get_ha_data(thd, rocksdb_hton));
3731 }
3732
namespace {

/*
  RAII helper that starts collecting RocksDB IO/perf-context statistics on
  construction and records them on destruction. It operates either on a
  caller-supplied Rdb_io_perf object or through an Rdb_transaction.
*/
class Rdb_perf_context_guard {
  Rdb_io_perf m_io_perf;       // local counters used in the transaction mode
  Rdb_io_perf *m_io_perf_ptr;  // externally-owned counters (may be nullptr)
  Rdb_transaction *m_tx;       // transaction whose perf data to record
  uint m_level;                // perf-context level passed to start()/end()

 public:
  Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
  Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;

  explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
      : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
    m_io_perf_ptr->start(m_level);
  }

  explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
      : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
    /*
      if perf_context information is already being recorded, this becomes a
      no-op
    */
    if (tx != nullptr) {
      tx->io_perf_start(&m_io_perf);
    }
  }

  ~Rdb_perf_context_guard() {
    if (m_tx != nullptr) {
      m_tx->io_perf_end_and_record();
    } else if (m_io_perf_ptr != nullptr) {
      m_io_perf_ptr->end_and_record(m_level);
    }
  }
};

}  // anonymous namespace
3771
/*
  TODO: maybe, call this in external_lock() and store in ha_rocksdb..
*/

/*
  Return the THD's transaction object, creating and starting one if needed.
  Replication applier threads (when rpl_skip_tx_api is set) and sessions
  with master_skip_tx_api get the lock-free write-batch implementation;
  everyone else gets a real RocksDB transaction.
*/
static Rdb_transaction *get_or_create_tx(THD *const thd) {
  Rdb_transaction *tx = get_tx_from_thd(thd);
  // TODO: this is called too many times.. O(#rows)
  if (tx == nullptr) {
    bool rpl_skip_tx_api= false; // MARIAROCKS_NOT_YET.
    if ((rpl_skip_tx_api && thd->rgi_slave) ||
        (THDVAR(thd, master_skip_tx_api) && !thd->rgi_slave))
    {
      tx = new Rdb_writebatch_impl(thd);
    } else {
      tx = new Rdb_transaction_impl(thd);
    }
    tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
    tx->start_tx();
    my_core::thd_set_ha_data(thd, rocksdb_hton, tx);
  } else {
    // Refresh session-variable-derived parameters on every call, and
    // restart the transaction if a previous commit/rollback finished it.
    tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
    if (!tx->is_tx_started()) {
      tx->start_tx();
    }
  }

  return tx;
}
3800
/*
  Handlerton callback: a connection is closing. Finalize any in-progress
  bulk load for the session, then destroy its transaction object.
*/
static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (tx != nullptr) {
    bool is_critical_error;
    // Finish (flush/close) the last SST file of a pending bulk load.
    int rc = tx->finish_bulk_load(&is_critical_error, false);
    if (rc != 0 && is_critical_error) {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: Error %d finalizing last SST file while "
          "disconnecting",
          rc);
    }

    delete tx;
  }
  return HA_EXIT_SUCCESS;
}
3818
/*
 * Serializes an xid to a string so that it can
 * be used as a rocksdb transaction name
 *
 * Layout: 8 bytes formatID (network byte order), 1 byte gtrid_length,
 * 1 byte bqual_length, then the raw gtrid+bqual bytes.
 */
static std::string rdb_xid_to_string(const XID &src) {
  DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);

  std::string buf;
  buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);

  /*
   * expand formatID to fill 8 bytes if it doesn't already
   * then reinterpret bit pattern as unsigned and store in network order
   */
  uchar fidbuf[RDB_FORMATID_SZ];
  int64 signed_fid8 = src.formatID;
  const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
  rdb_netbuf_store_uint64(fidbuf, raw_fid8);
  buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);

  buf.push_back(src.gtrid_length);
  buf.push_back(src.bqual_length);
  buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
  return buf;
}
3845
3846 #if 0
3847 // MARIAROCKS: MariaDB doesn't have flush_wal method
3848 /**
3849 Called by hton->flush_logs after MySQL group commit prepares a set of
3850 transactions.
3851 */
3852 static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__)))
3853 DBUG_ASSERT(rdb != nullptr);
3854
3855 rocksdb::Status s;
3856 /*
3857 target_lsn is set to 0 when MySQL wants to sync the wal files
3858 */
3859 if ((target_lsn == 0 && !rocksdb_db_options->allow_mmap_writes) ||
3860 rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
3861 rocksdb_wal_group_syncs++;
3862 s = rdb->FlushWAL(target_lsn == 0 ||
3863 rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3864 }
3865
3866 if (!s.ok()) {
3867 rdb_log_status_error(s);
3868 return HA_EXIT_FAILURE;
3869 }
3870 return HA_EXIT_SUCCESS;
3871 }
3872 #endif
3873
/**
  Handlerton prepare callback.
  Prepares the whole transaction when prepare_tx is set (or at statement
  end under autocommit); otherwise just makes the statement savepoint
  permanent. For a slave, prepare() updates the slave_gtid_info table which
  tracks the replication progress.
*/
static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
{
  bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql

  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (!tx->can_prepare()) {
    return HA_EXIT_FAILURE;
  }
  if (prepare_tx ||
      (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
    /* We were instructed to prepare the whole transaction, or
    this is an SQL statement end and autocommit is on */

#ifdef MARIAROCKS_NOT_YET
    /*
      Storing binlog position inside MyRocks is needed only for restoring
      MyRocks from backups. This feature is not supported yet.
    */
    std::vector<st_slave_gtid_info> slave_gtid_info;
    my_core::thd_slave_gtid_info(thd, &slave_gtid_info);
    for (const auto &it : slave_gtid_info) {
      rocksdb::WriteBatchBase *const write_batch = tx->get_blind_write_batch();
      binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch);
    }
#endif

    if (tx->is_two_phase()) {

      /*
        MariaDB: the following branch is never taken.
        We always flush at Prepare and rely on RocksDB's internal Group Commit
        to do some grouping.
      */
      if (thd->durability_property == HA_IGNORE_DURABILITY || async) {
        tx->set_sync(false);
      }

      /*
        MariaDB: do not flush logs if we are running in a non-crash-safe mode.
      */
      if (!rocksdb_flush_log_at_trx_commit)
        tx->set_sync(false);

      // Register the prepared transaction under its serialized XID so it
      // can be found again during XA recovery.
      XID xid;
      thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
      if (!tx->prepare(rdb_xid_to_string(xid))) {
        return HA_EXIT_FAILURE;
      }

      /*
        MariaDB: our Group Commit implementation does not use the
        hton->flush_logs call (at least currently) so the following is not
        needed (TODO: will we need this for binlog rotation?)
      */
#ifdef MARIAROCKS_NOT_YET
      if (thd->durability_property == HA_IGNORE_DURABILITY )
          (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER))
          &&
          THDVAR(thd, flush_log_at_trx_commit))
#endif
#ifdef MARIAROCKS_NOT_YET
      {
        // MariaRocks: disable the
        // "write/sync redo log before flushing binlog cache to file"
        // feature. See a869c56d361bb44f46c0efeb11a8f03561676247
        /**
          we set the log sequence as '1' just to trigger hton->flush_logs
        */
        thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB);
      }
#endif
    }

    DEBUG_SYNC(thd, "rocksdb.prepared");
  } else {
    tx->make_stmt_savepoint_permanent();
  }
  return HA_EXIT_SUCCESS;
}
3957
3958 /**
3959 do nothing for prepare/commit by xid
3960 this is needed to avoid crashes in XA scenarios
3961 */
rocksdb_commit_by_xid(handlerton * const hton,XID * const xid)3962 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
3963 DBUG_ENTER_FUNC();
3964
3965 DBUG_ASSERT(hton != nullptr);
3966 DBUG_ASSERT(xid != nullptr);
3967 DBUG_ASSERT(commit_latency_stats != nullptr);
3968
3969 rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
3970
3971 const auto name = rdb_xid_to_string(*xid);
3972 DBUG_ASSERT(!name.empty());
3973
3974 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
3975
3976 if (trx == nullptr) {
3977 DBUG_RETURN(HA_EXIT_FAILURE);
3978 }
3979
3980 const rocksdb::Status s = trx->Commit();
3981
3982 if (!s.ok()) {
3983 rdb_log_status_error(s);
3984 DBUG_RETURN(HA_EXIT_FAILURE);
3985 }
3986
3987 delete trx;
3988
3989 // `Add()` is implemented in a thread-safe manner.
3990 commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
3991
3992 DBUG_RETURN(HA_EXIT_SUCCESS);
3993 }
3994
rocksdb_rollback_by_xid(handlerton * const hton MY_ATTRIBUTE ((__unused__)),XID * const xid)3995 static int rocksdb_rollback_by_xid(
3996 handlerton *const hton MY_ATTRIBUTE((__unused__)), XID *const xid) {
3997 DBUG_ENTER_FUNC();
3998
3999 DBUG_ASSERT(hton != nullptr);
4000 DBUG_ASSERT(xid != nullptr);
4001 DBUG_ASSERT(rdb != nullptr);
4002
4003 const auto name = rdb_xid_to_string(*xid);
4004
4005 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4006
4007 if (trx == nullptr) {
4008 DBUG_RETURN(HA_EXIT_FAILURE);
4009 }
4010
4011 const rocksdb::Status s = trx->Rollback();
4012
4013 if (!s.ok()) {
4014 rdb_log_status_error(s);
4015 DBUG_RETURN(HA_EXIT_FAILURE);
4016 }
4017
4018 delete trx;
4019
4020 DBUG_RETURN(HA_EXIT_SUCCESS);
4021 }
4022
/**
  Rebuilds an XID from a serialized version stored in a string.
  Inverse of rdb_xid_to_string(): 8 bytes formatID (network byte order),
  1 byte gtrid_length, 1 byte bqual_length, then the raw gtrid+bqual data.
*/
static void rdb_xid_from_string(const std::string &src, XID *const dst) {
  DBUG_ASSERT(dst != nullptr);
  uint offset = 0;
  uint64 raw_fid8 =
      rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
  // Reinterpret the unsigned bit pattern back into the signed formatID.
  const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
  dst->formatID = signed_fid8;
  offset += RDB_FORMATID_SZ;
  dst->gtrid_length = src.at(offset);
  offset += RDB_GTRID_SZ;
  dst->bqual_length = src.at(offset);
  offset += RDB_BQUAL_SZ;

  DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);

  memset(dst->data, 0, XIDDATASIZE);
  src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length),
           RDB_XIDHDR_LEN);
}
4046
/**
  Reading last committed binary log info from RocksDB system row.
  The info is needed for crash safe slave/master to work.
  Also fills xid_list with the XIDs of the prepared-but-uncommitted
  transactions found in RocksDB, for XA recovery.
  @return number of XIDs stored into xid_list
*/
static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len)
#ifdef MARIAROCKS_NOT_YET
                           char* const binlog_file,
                           my_off_t *const binlog_pos,
                           Gtid *const binlog_max_gtid) {
#endif
{
#ifdef MARIAROCKS_NOT_YET
  if (binlog_file && binlog_pos) {
    char file_buf[FN_REFLEN + 1] = {0};
    my_off_t pos;
    char gtid_buf[FN_REFLEN + 1] = {0};
    if (binlog_manager.read(file_buf, &pos, gtid_buf)) {
      if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) {
        memcpy(binlog_file, file_buf, FN_REFLEN + 1);
        *binlog_pos = pos;
        // NO_LINT_DEBUG
        fprintf(stderr,
                "RocksDB: Last binlog file position %llu,"
                " file name %s\n",
                pos, file_buf);
        if (*gtid_buf) {
          global_sid_lock->rdlock();
          binlog_max_gtid->parse(global_sid_map, gtid_buf);
          global_sid_lock->unlock();
          // NO_LINT_DEBUG
          fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n", gtid_buf);
        }
      }
    }
  }
#endif

  if (len == 0 || xid_list == nullptr) {
    return HA_EXIT_SUCCESS;
  }

  // Collect all transactions left in the prepared state and report their
  // XIDs (up to the caller-provided capacity) for XA recovery.
  std::vector<rocksdb::Transaction *> trans_list;
  rdb->GetAllPreparedTransactions(&trans_list);

  uint count = 0;
  for (auto &trans : trans_list) {
    if (count >= len) {
      break;
    }
    auto name = trans->GetName();
    rdb_xid_from_string(name, &xid_list[count]);
    count++;
  }
  return count;
}
4102
4103
4104 /*
4105 Handle a commit checkpoint request from server layer.
4106
4107 InnoDB does this:
4108 We put the request in a queue, so that we can notify upper layer about
4109 checkpoint complete when we have flushed the redo log.
4110 If we have already flushed all relevant redo log, we notify immediately.
4111
4112 MariaRocks just flushes everything right away ATM
4113 */
4114
4115 static void rocksdb_checkpoint_request(handlerton *hton,
4116 void *cookie)
4117 {
4118 const rocksdb::Status s= rdb->SyncWAL();
4119 //TODO: what to do on error?
4120 if (s.ok())
4121 {
4122 rocksdb_wal_group_syncs++;
4123 commit_checkpoint_notify_ha(hton, cookie);
4124 }
4125 }
4126
/*
  @param all: TRUE - commit the transaction
              FALSE - SQL statement ended

  Commit in the server-imposed commit order. The commit is done with
  sync disabled (set_sync(false)); the result is recorded so that
  rocksdb_commit() can report it later.
*/
static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all)
{
  // Same assert as InnoDB has
  DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
                                        OPTION_BEGIN)));
  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (!tx->is_two_phase()) {
    /*
      ordered_commit is supposedly slower as it is done sequentially
      in order to preserve commit order.

      if we are not required do 2-phase commit with the binlog, do not do
      anything here.
    */
    return;
  }

  tx->set_sync(false);

  /* This will note the master position also */
  tx->commit_ordered_res= tx->commit();
  tx->commit_ordered_done= true;

}
4155
4156
/*
  Handlerton commit callback.
  @param commit_tx TRUE  - commit the whole transaction
                   FALSE - an SQL statement within a transaction ended
*/
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
{
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(hton != nullptr);
  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(commit_latency_stats != nullptr);

  rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);

  /* note: h->external_lock(F_UNLCK) is called after this function is called) */
  Rdb_transaction *tx = get_tx_from_thd(thd);

  /* this will trigger saving of perf_context information */
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (commit_tx || (!my_core::thd_test_options(
                         thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
      /*
        This will not add anything to commit_latency_stats, and this is correct
        right?
      */
      // If commit_ordered() already performed the commit, just report its
      // saved result.
      if (tx->commit_ordered_done)
      {
        thd_wakeup_subsequent_commits(thd, 0);
        DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
      }

      /*
        We get here
        - For a COMMIT statement that finishes a multi-statement transaction
        - For a statement that has its own transaction
      */
      if (thd->slave_thread)
      {
        // An attempt to make parallel slave performant (not fully successful,
        // see MDEV-15372):

        // First, commit without syncing. This establishes the commit order
        tx->set_sync(false);
        bool tx_had_writes = tx->get_write_count()? true : false ;
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
        thd_wakeup_subsequent_commits(thd, 0);

        // Then sync the WAL, but only if this transaction wrote anything
        // and the server is configured for durable commits.
        if (tx_had_writes && rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC)
        {
          rocksdb::Status s= rdb->FlushWAL(true);
          if (!s.ok())
            DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
        }
      }
      else
      {
        /* Not a slave thread */
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
      }
    } else {
      /*
        We get here when committing a statement within a transaction.
      */
      tx->make_stmt_savepoint_permanent();
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }

  // `Add()` is implemented in a thread-safe manner.
  commit_latency_stats->Add(timer.ElapsedNanos() / 1000);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
4237
4238
4239 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
4240 bool rollback_tx) {
4241 Rdb_transaction *tx = get_tx_from_thd(thd);
4242 Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4243
4244 if (tx != nullptr) {
4245 if (rollback_tx) {
4246 /*
4247 We get here, when
4248 - ROLLBACK statement is issued.
4249
4250 Discard the changes made by the transaction
4251 */
4252 tx->rollback();
4253 } else {
4254 /*
4255 We get here when
4256 - a statement with AUTOCOMMIT=1 is being rolled back (because of some
4257 error)
4258 - a statement inside a transaction is rolled back
4259 */
4260
4261 tx->rollback_stmt();
4262 tx->set_tx_failed(true);
4263 }
4264
4265 if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4266 // For READ_COMMITTED, we release any existing snapshot so that we will
4267 // see any changes that occurred since the last statement.
4268 tx->release_snapshot();
4269 }
4270 }
4271 return HA_EXIT_SUCCESS;
4272 }
4273
4274 static bool print_stats(THD *const thd, std::string const &type,
4275 std::string const &name, std::string const &status,
4276 stat_print_fn *stat_print) {
4277 return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
4278 status.c_str(), status.size());
4279 }
4280
4281 static std::string format_string(const char *const format, ...) {
4282 std::string res;
4283 va_list args;
4284 va_list args_copy;
4285 char static_buff[256];
4286
4287 DBUG_ASSERT(format != nullptr);
4288
4289 va_start(args, format);
4290 va_copy(args_copy, args);
4291
4292 // Calculate how much space we will need
4293 int len = vsnprintf(nullptr, 0, format, args);
4294 va_end(args);
4295
4296 if (len < 0) {
4297 res = std::string("<format error>");
4298 } else if (len == 0) {
4299 // Shortcut for an empty string
4300 res = std::string("");
4301 } else {
4302 // For short enough output use a static buffer
4303 char *buff = static_buff;
4304 std::unique_ptr<char[]> dynamic_buff = nullptr;
4305
4306 len++; // Add one for null terminator
4307
4308 // for longer output use an allocated buffer
4309 if (static_cast<uint>(len) > sizeof(static_buff)) {
4310 dynamic_buff.reset(new char[len]);
4311 buff = dynamic_buff.get();
4312 }
4313
4314 // Now re-do the vsnprintf with the buffer which is now large enough
4315 (void)vsnprintf(buff, len, format, args_copy);
4316
4317 // Convert to a std::string. Note we could have created a std::string
4318 // large enough and then converted the buffer to a 'char*' and created
4319 // the output in place. This would probably work but feels like a hack.
4320 // Since this isn't code that needs to be super-performant we are going
4321 // with this 'safer' method.
4322 res = std::string(buff);
4323 }
4324
4325 va_end(args_copy);
4326
4327 return res;
4328 }
4329
4330 class Rdb_snapshot_status : public Rdb_tx_list_walker {
4331 private:
4332 std::string m_data;
4333
4334 static std::string current_timestamp(void) {
4335 static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
4336 time_t currtime;
4337 struct tm currtm;
4338
4339 time(&currtime);
4340
4341 localtime_r(&currtime, &currtm);
4342
4343 return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
4344 currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
4345 currtm.tm_sec);
4346 }
4347
4348 static std::string get_header(void) {
4349 return "\n============================================================\n" +
4350 current_timestamp() +
4351 " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4352 "============================================================\n"
4353 "---------\n"
4354 "SNAPSHOTS\n"
4355 "---------\n"
4356 "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
4357 }
4358
4359 static std::string get_footer(void) {
4360 return "-----------------------------------------\n"
4361 "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4362 "=========================================\n";
4363 }
4364
4365 static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
4366 const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
4367 Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
4368
4369 txn_data.trx_id = txn.m_txn_id;
4370
4371 txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id);
4372 if (txn_data.table_name.empty()) {
4373 txn_data.table_name =
4374 "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4375 }
4376
4377 auto kd = ddl_manager.safe_find(gl_index_id);
4378 txn_data.index_name =
4379 (kd) ? kd->get_name()
4380 : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4381
4382 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id);
4383 txn_data.cf_name = cfh->GetName();
4384
4385 txn_data.waiting_key =
4386 rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
4387
4388 txn_data.exclusive_lock = txn.m_exclusive;
4389
4390 return txn_data;
4391 }
4392
4393 static Rdb_deadlock_info get_dl_path_trx_info(
4394 const rocksdb::DeadlockPath &path_entry) {
4395 Rdb_deadlock_info deadlock_info;
4396
4397 for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
4398 const auto &txn = *it;
4399 const GL_INDEX_ID gl_index_id = {
4400 txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
4401 txn.m_waiting_key.c_str()))};
4402 deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
4403 }
4404 DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
4405 /* print the first txn in the path to display the full deadlock cycle */
4406 if (!path_entry.path.empty() && !path_entry.limit_exceeded) {
4407 const auto &deadlocking_txn = *(path_entry.path.end() - 1);
4408 deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id;
4409 deadlock_info.deadlock_time = path_entry.deadlock_time;
4410 }
4411 return deadlock_info;
4412 }
4413
4414 public:
4415 Rdb_snapshot_status() : m_data(get_header()) {}
4416
4417 std::string getResult() { return m_data + get_footer(); }
4418
4419 /* Implement Rdb_transaction interface */
4420 /* Create one row in the snapshot status table */
4421 void process_tran(const Rdb_transaction *const tx) override {
4422 DBUG_ASSERT(tx != nullptr);
4423
4424 /* Calculate the duration the snapshot has existed */
4425 int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
4426 if (snapshot_timestamp != 0) {
4427 int64_t curr_time;
4428 rdb->GetEnv()->GetCurrentTime(&curr_time);
4429
4430 char buffer[1024];
4431 #ifdef MARIAROCKS_NOT_YET
4432 thd_security_context(tx->get_thd(), buffer, sizeof buffer, 0);
4433 #endif
4434 m_data += format_string(
4435 "---SNAPSHOT, ACTIVE %lld sec\n"
4436 "%s\n"
4437 "lock count %llu, write count %llu\n"
4438 "insert count %llu, update count %llu, delete count %llu\n",
4439 (longlong)(curr_time - snapshot_timestamp), buffer, tx->get_lock_count(),
4440 tx->get_write_count(), tx->get_insert_count(), tx->get_update_count(),
4441 tx->get_delete_count());
4442 }
4443 }
4444
4445 void populate_deadlock_buffer() {
4446 auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4447 m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
4448
4449 for (const auto &path_entry : dlock_buffer) {
4450 std::string path_data;
4451 if (path_entry.limit_exceeded) {
4452 path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n";
4453 } else {
4454 path_data +=
4455 "\n*** DEADLOCK PATH\n"
4456 "=========================================\n";
4457 const auto dl_info = get_dl_path_trx_info(path_entry);
4458 const auto deadlock_time = dl_info.deadlock_time;
4459 for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) {
4460 const auto &trx_info = *it;
4461 path_data += format_string(
4462 "TIMESTAMP: %" PRId64
4463 "\n"
4464 "TRANSACTION ID: %u\n"
4465 "COLUMN FAMILY NAME: %s\n"
4466 "WAITING KEY: %s\n"
4467 "LOCK TYPE: %s\n"
4468 "INDEX NAME: %s\n"
4469 "TABLE NAME: %s\n",
4470 deadlock_time, trx_info.trx_id, trx_info.cf_name.c_str(),
4471 trx_info.waiting_key.c_str(),
4472 trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED",
4473 trx_info.index_name.c_str(), trx_info.table_name.c_str());
4474 if (it != dl_info.path.end() - 1) {
4475 path_data += "---------------WAITING FOR---------------\n";
4476 }
4477 }
4478 path_data += format_string(
4479 "\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n",
4480 dl_info.victim_trx_id);
4481 }
4482 m_data += path_data;
4483 }
4484 }
4485
4486 std::vector<Rdb_deadlock_info> get_deadlock_info() {
4487 std::vector<Rdb_deadlock_info> deadlock_info;
4488 auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4489 for (const auto &path_entry : dlock_buffer) {
4490 if (!path_entry.limit_exceeded) {
4491 deadlock_info.push_back(get_dl_path_trx_info(path_entry));
4492 }
4493 }
4494 return deadlock_info;
4495 }
4496 };
4497
4498 /**
4499 * @brief
4500 * walks through all non-replication transactions and copies
4501 * out relevant information for information_schema.rocksdb_trx
4502 */
4503 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
4504 private:
4505 std::vector<Rdb_trx_info> *m_trx_info;
4506
4507 public:
4508 explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
4509 : m_trx_info(trx_info) {}
4510
4511 void process_tran(const Rdb_transaction *const tx) override {
4512 static const std::map<int, std::string> state_map = {
4513 {rocksdb::Transaction::STARTED, "STARTED"},
4514 {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
4515 {rocksdb::Transaction::PREPARED, "PREPARED"},
4516 {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
4517 {rocksdb::Transaction::COMMITED, "COMMITED"},
4518 {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
4519 {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
4520 };
4521
4522 DBUG_ASSERT(tx != nullptr);
4523
4524 THD *const thd = tx->get_thd();
4525 ulong thread_id = thd_get_thread_id(thd);
4526
4527 if (tx->is_writebatch_trx()) {
4528 const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
4529 DBUG_ASSERT(wb_impl);
4530 m_trx_info->push_back(
4531 {"", /* name */
4532 0, /* trx_id */
4533 wb_impl->get_write_count(), 0, /* lock_count */
4534 0, /* timeout_sec */
4535 "", /* state */
4536 "", /* waiting_key */
4537 0, /* waiting_cf_id */
4538 1, /*is_replication */
4539 1, /* skip_trx_api */
4540 wb_impl->is_tx_read_only(), 0, /* deadlock detection */
4541 wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
4542 } else {
4543 const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
4544 DBUG_ASSERT(tx_impl);
4545 const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
4546
4547 if (rdb_trx == nullptr) {
4548 return;
4549 }
4550
4551 char query_buf[NAME_LEN+1];
4552 thd_query_safe(thd, query_buf, sizeof(query_buf));
4553 std::string query_str(query_buf);
4554
4555 const auto state_it = state_map.find(rdb_trx->GetState());
4556 DBUG_ASSERT(state_it != state_map.end());
4557 const int is_replication = (thd->rgi_slave != nullptr);
4558 uint32_t waiting_cf_id;
4559 std::string waiting_key;
4560 rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key),
4561
4562 m_trx_info->push_back(
4563 {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
4564 tx_impl->get_lock_count(), tx_impl->get_timeout_sec(),
4565 state_it->second, waiting_key, waiting_cf_id, is_replication,
4566 0, /* skip_trx_api */
4567 tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
4568 tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
4569 }
4570 }
4571 };
4572
4573 /*
4574 returns a vector of info for all non-replication threads
4575 for use by information_schema.rocksdb_trx
4576 */
4577 std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
4578 std::vector<Rdb_trx_info> trx_info;
4579 Rdb_trx_info_aggregator trx_info_agg(&trx_info);
4580 Rdb_transaction::walk_tx_list(&trx_info_agg);
4581 return trx_info;
4582 }
4583
4584
4585 /*
4586 returns a vector of info of recent deadlocks
4587 for use by information_schema.rocksdb_deadlock
4588 */
4589 std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() {
4590 Rdb_snapshot_status showStatus;
4591 Rdb_transaction::walk_tx_list(&showStatus);
4592 return showStatus.get_deadlock_info();
4593 }
4594
4595 #ifdef MARIAROCKS_NOT_YET
4596 /* Generate the snapshot status table */
/*
  Build and send the SHOW ENGINE ROCKSDB TRANSACTION STATUS output:
  per-session snapshot info followed by the latest detected deadlocks.
  Returns the stat printer's result (true on failure).
*/
static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd,
                                         stat_print_fn *const stat_print) {
  Rdb_snapshot_status showStatus;

  /* Collect one section per open transaction, then the deadlock dump. */
  Rdb_transaction::walk_tx_list(&showStatus);
  showStatus.populate_deadlock_buffer();

  /* Send the result data back to MySQL */
  return print_stats(thd, "rocksdb", "", showStatus.getResult(), stat_print);
}
4607 #endif
4608
4609 /*
4610 This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc.
4611
4612 For now, produce info about live files (which gives an imprecise idea about
4613 what column families are there).
4614 */
4615 static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
4616 stat_print_fn *const stat_print,
4617 enum ha_stat_type stat_type) {
4618 DBUG_ASSERT(hton != nullptr);
4619 DBUG_ASSERT(thd != nullptr);
4620 DBUG_ASSERT(stat_print != nullptr);
4621
4622 bool res = false;
4623 char buf[100] = {'\0'};
4624
4625 if (stat_type == HA_ENGINE_STATUS) {
4626 DBUG_ASSERT(rdb != nullptr);
4627
4628 std::string str;
4629
4630 /* Global DB Statistics */
4631 if (rocksdb_stats) {
4632 str = rocksdb_stats->ToString();
4633
4634 // Use the same format as internal RocksDB statistics entries to make
4635 // sure that output will look unified.
4636 DBUG_ASSERT(commit_latency_stats != nullptr);
4637
4638 snprintf(buf, sizeof(buf),
4639 "rocksdb.commit_latency statistics "
4640 "Percentiles :=> 50 : %.2f 95 : %.2f "
4641 "99 : %.2f 100 : %.2f\n",
4642 commit_latency_stats->Percentile(50),
4643 commit_latency_stats->Percentile(95),
4644 commit_latency_stats->Percentile(99),
4645 commit_latency_stats->Percentile(100));
4646 str.append(buf);
4647
4648 uint64_t v = 0;
4649
4650 // Retrieve additional stalling related numbers from RocksDB and append
4651 // them to the buffer meant for displaying detailed statistics. The intent
4652 // here is to avoid adding another row to the query output because of
4653 // just two numbers.
4654 //
4655 // NB! We're replacing hyphens with underscores in output to better match
4656 // the existing naming convention.
4657 if (rdb->GetIntProperty("rocksdb.is-write-stopped", &v)) {
4658 snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %llu\n", (ulonglong)v);
4659 str.append(buf);
4660 }
4661
4662 if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) {
4663 snprintf(buf, sizeof(buf),
4664 "COUNT : %llu\n",
4665 (ulonglong)v);
4666 str.append(buf);
4667 }
4668
4669 res |= print_stats(thd, "STATISTICS", "rocksdb", str, stat_print);
4670 }
4671
4672 /* Per DB stats */
4673 if (rdb->GetProperty("rocksdb.dbstats", &str)) {
4674 res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
4675 }
4676
4677 /* Per column family stats */
4678 for (const auto &cf_name : cf_manager.get_cf_names()) {
4679 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
4680 if (cfh == nullptr) {
4681 continue;
4682 }
4683
4684 if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str)) {
4685 continue;
4686 }
4687
4688 res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
4689 }
4690
4691 /* Memory Statistics */
4692 std::vector<rocksdb::DB *> dbs;
4693 std::unordered_set<const rocksdb::Cache *> cache_set;
4694 size_t internal_cache_count = 0;
4695 size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
4696
4697 dbs.push_back(rdb);
4698 cache_set.insert(rocksdb_tbl_options->block_cache.get());
4699
4700 for (const auto &cf_handle : cf_manager.get_all_cf()) {
4701 rocksdb::ColumnFamilyDescriptor cf_desc;
4702 cf_handle->GetDescriptor(&cf_desc);
4703 auto *const table_factory = cf_desc.options.table_factory.get();
4704
4705 if (table_factory != nullptr) {
4706 std::string tf_name = table_factory->Name();
4707
4708 if (tf_name.find("BlockBasedTable") != std::string::npos) {
4709 const rocksdb::BlockBasedTableOptions *const bbt_opt =
4710 reinterpret_cast<rocksdb::BlockBasedTableOptions *>(
4711 table_factory->GetOptions());
4712
4713 if (bbt_opt != nullptr) {
4714 if (bbt_opt->block_cache.get() != nullptr) {
4715 cache_set.insert(bbt_opt->block_cache.get());
4716 } else {
4717 internal_cache_count++;
4718 }
4719 cache_set.insert(bbt_opt->block_cache_compressed.get());
4720 }
4721 }
4722 }
4723 }
4724
4725 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
4726 str.clear();
4727 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
4728 &temp_usage_by_type);
4729 snprintf(buf, sizeof(buf), "\nMemTable Total: %llu",
4730 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
4731 str.append(buf);
4732 snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %llu",
4733 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
4734 str.append(buf);
4735 snprintf(buf, sizeof(buf), "\nTable Readers Total: %llu",
4736 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
4737 str.append(buf);
4738 snprintf(buf, sizeof(buf), "\nCache Total: %llu",
4739 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
4740 str.append(buf);
4741 snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %llu",
4742 (ulonglong)internal_cache_count * kDefaultInternalCacheSize);
4743 str.append(buf);
4744 res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print);
4745
4746 /* Show the background thread status */
4747 std::vector<rocksdb::ThreadStatus> thread_list;
4748 rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list);
4749
4750 if (!s.ok()) {
4751 // NO_LINT_DEBUG
4752 sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n",
4753 s.ToString().c_str());
4754 res |= true;
4755 } else {
4756 /* For each background thread retrieved, print out its information */
4757 for (auto &it : thread_list) {
4758 /* Only look at background threads. Ignore user threads, if any. */
4759 if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) {
4760 continue;
4761 }
4762
4763 str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) +
4764 "\ncf_name: " + it.cf_name +
4765 "\noperation_type: " + it.GetOperationName(it.operation_type) +
4766 "\noperation_stage: " +
4767 it.GetOperationStageName(it.operation_stage) +
4768 "\nelapsed_time_ms: " + it.MicrosToString(it.op_elapsed_micros);
4769
4770 for (auto &it_props : it.InterpretOperationProperties(
4771 it.operation_type, it.op_properties)) {
4772 str += "\n" + it_props.first + ": " + std::to_string(it_props.second);
4773 }
4774
4775 str += "\nstate_type: " + it.GetStateName(it.state_type);
4776
4777 res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), str,
4778 stat_print);
4779 }
4780 }
4781
4782 #ifdef MARIAROCKS_NOT_YET
4783 /* Explicit snapshot information */
4784 str = Rdb_explicit_snapshot::dump_snapshots();
4785 #endif
4786
4787 if (!str.empty()) {
4788 res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print);
4789 }
4790 #ifdef MARIAROCKS_NOT_YET
4791 } else if (stat_type == HA_ENGINE_TRX) {
4792 /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */
4793 res |= rocksdb_show_snapshot_status(hton, thd, stat_print);
4794 #endif
4795 }
4796 return res;
4797 }
4798
4799 static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
4800 Rdb_transaction *const tx) {
4801 DBUG_ASSERT(tx != nullptr);
4802
4803 trans_register_ha(thd, FALSE, rocksdb_hton);
4804 if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
4805 tx->start_stmt();
4806 trans_register_ha(thd, TRUE, rocksdb_hton);
4807 }
4808 }
4809
/* File name extension list reported to the server; only the NullS
   terminator, i.e. RocksDB tables declare no extra per-table files. */
static const char *ha_rocksdb_exts[] = {NullS};
4811
4812 #ifdef MARIAROCKS_NOT_YET
/*
  Create, attach or release an explicit (user-managed) snapshot, depending
  on ss_info->op. Currently compiled out (MARIAROCKS_NOT_YET).

  @return true on failure, false on success
*/
static bool rocksdb_explicit_snapshot(
    handlerton *const /* hton */, /*!< in: RocksDB handlerton */
    THD *const thd, /*!< in: MySQL thread handle */
    snapshot_info_st *ss_info) /*!< out: Snapshot information */
{
  switch (ss_info->op) {
    case snapshot_operation::SNAPSHOT_CREATE: {
      /* Hold binlog commits so the snapshot matches the binlog position. */
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_lock_commits(ss_info);
      }
      auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot());
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_unlock_commits(ss_info);
      }

      thd->set_explicit_snapshot(s);
      return s == nullptr;  // failure if the snapshot could not be created
    }
    case snapshot_operation::SNAPSHOT_ATTACH: {
      /* Attach an already-existing snapshot by its id. */
      auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
      if (!s) {
        return true;
      }
      *ss_info = s->ss_info;
      thd->set_explicit_snapshot(s);
      return false;
    }
    case snapshot_operation::SNAPSHOT_RELEASE: {
      /* Releasing without an attached snapshot is an error. */
      if (!thd->get_explicit_snapshot()) {
        return true;
      }
      *ss_info = thd->get_explicit_snapshot()->ss_info;
      thd->set_explicit_snapshot(nullptr);
      return false;
    }
    default:
      DBUG_ASSERT(false);
      return true;
  }
  return true;
}
4854 #endif
4855
4856 /*
4857 Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT
4858
4859 Features:
4860 1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
4861 2. Getting current binlog position in addition to #1.
4862
4863 The second feature is done by START TRANSACTION WITH
4864 CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and
4865 it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT.
4866
4867 - When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT
4868 takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
4869 participate in transaction. When executing COMMIT, both InnoDB and
4870 RocksDB modifications are committed. Remember that XA is not supported yet,
4871 so mixing engines is not recommended anyway.
4872
  - When setting engine, START TRANSACTION WITH CONSISTENT ... SNAPSHOT takes
  a snapshot for the specified engine only. But it starts both
  InnoDB and RocksDB transactions.
4876 */
/*
  Implements START TRANSACTION WITH CONSISTENT SNAPSHOT for RocksDB:
  verifies REPEATABLE READ, marks the transaction read-only, registers it
  and acquires the RocksDB snapshot while LOCK_commit_ordered is held.

  @return HA_EXIT_SUCCESS or HA_EXIT_FAILURE
*/
static int rocksdb_start_tx_and_assign_read_view(
    handlerton *const hton, /*!< in: RocksDB handlerton */
    THD *const thd /*!< in: MySQL thread handle of the
                   user for whom the transaction should
                   be committed */
)
#ifdef MARIAROCKS_NOT_YET
    snapshot_info_st *ss_info) /*!< in/out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  /* A consistent snapshot is only meaningful under REPEATABLE READ. */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);

  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

#ifdef MARIADB_NOT_YET
  if (ss_info) {
    if (mysql_bin_log_is_open()) {
      mysql_bin_log_lock_commits(ss_info);
    } else {
      return HA_EXIT_FAILURE;
    }
#endif

  /*
    MariaDB: there is no need to call mysql_bin_log_lock_commits and then
    unlock back.
    SQL layer calls start_consistent_snapshot() for all engines, including the
    binlog under LOCK_commit_ordered mutex.

    The mutex is expected to prevent binlog commits from happening while the
    storage engine(s) allocate read snapshots. That way, each storage engine
    is synchronized with the current binlog position.
  */
  mysql_mutex_assert_owner(&LOCK_commit_ordered);

  Rdb_transaction *const tx = get_or_create_tx(thd);
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  DBUG_ASSERT(!tx->has_snapshot());
  tx->set_tx_read_only(true);
  rocksdb_register_tx(hton, thd, tx);
  tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
  if (ss_info) {
    mysql_bin_log_unlock_commits(ss_info);
  }
#endif
  return HA_EXIT_SUCCESS;
}
4931
4932 #ifdef MARIADB_NOT_YET
/*
  Start a transaction whose read view is published as / attached to an
  explicit snapshot (Facebook extension; currently compiled out along with
  the MARIADB_NOT_YET sections it depends on).

  @return HA_EXIT_SUCCESS or HA_EXIT_FAILURE
*/
static int rocksdb_start_tx_with_shared_read_view(
    handlerton *const hton, /*!< in: RocksDB handlerton */
    THD *const thd) /*!< in: MySQL thread handle of the
                    user for whom the transaction should
                    be committed */
#ifdef MARIADB_NOT_YET
    snapshot_info_st *ss_info) /*!< out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  DBUG_ASSERT(thd != nullptr);

  int error = HA_EXIT_SUCCESS;

  /* Shared read views only make sense under REPEATABLE READ. */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);
  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

  Rdb_transaction *tx = nullptr;
#ifdef MARIADB_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> explicit_snapshot;
  const auto op = ss_info->op;

  DBUG_ASSERT(op == snapshot_operation::SNAPSHOT_CREATE ||
              op == snapshot_operation::SNAPSHOT_ATTACH);

  // case: if binlogs are available get binlog file/pos and gtid info
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_lock_commits(ss_info);
  }

  if (op == snapshot_operation::SNAPSHOT_ATTACH) {
    explicit_snapshot = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
    if (!explicit_snapshot) {
      my_printf_error(ER_UNKNOWN_ERROR, "Snapshot %llu does not exist", MYF(0),
                      ss_info->snapshot_id);
      error = HA_EXIT_FAILURE;
    }
  }
#endif

  // case: all good till now
  if (error == HA_EXIT_SUCCESS) {
    tx = get_or_create_tx(thd);
    Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

#ifdef MARIADB_NOT_YET
    if (explicit_snapshot) {
      tx->m_explicit_snapshot = explicit_snapshot;
    }
#endif

    DBUG_ASSERT(!tx->has_snapshot());
    tx->set_tx_read_only(true);
    rocksdb_register_tx(hton, thd, tx);
    tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
    // case: an explicit snapshot was not assigned to this transaction
    if (!tx->m_explicit_snapshot) {
      tx->m_explicit_snapshot =
          Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot);
      if (!tx->m_explicit_snapshot) {
        my_printf_error(ER_UNKNOWN_ERROR, "Could not create snapshot", MYF(0));
        error = HA_EXIT_FAILURE;
      }
    }
#endif
  }

#ifdef MARIADB_NOT_YET
  // case: unlock the binlog
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_unlock_commits(ss_info);
  }

  DBUG_ASSERT(error == HA_EXIT_FAILURE || tx->m_explicit_snapshot);

  // copy over the snapshot details to pass to the upper layers
  if (tx->m_explicit_snapshot) {
    *ss_info = tx->m_explicit_snapshot->ss_info;
    ss_info->op = op;
  }
#endif

  return error;
}
5022 #endif
5023
/* Dummy SAVEPOINT support. This is needed for long running transactions
 * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
 * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
 * errors. This needs to be addressed in future versions (Issue#96).
 */
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
                             void *const savepoint) {
  /* Intentionally a no-op; see the dummy-SAVEPOINT note above. */
  return HA_EXIT_SUCCESS;
}
5033
5034 static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
5035 void *const savepoint) {
5036 Rdb_transaction *tx = get_tx_from_thd(thd);
5037 return tx->rollback_to_savepoint(savepoint);
5038 }
5039
/*
  Unconditionally allow the server to release metadata locks when rolling
  back to a savepoint.
*/
static bool rocksdb_rollback_to_savepoint_can_release_mdl(
    handlerton *const /* hton */, THD *const /* thd */) {
  return true;
}
5044
5045 #ifdef MARIAROCKS_NOT_YET
5046 /*
5047 This is called for INFORMATION_SCHEMA
5048 */
/*
  Report per-table I/O and lock statistics for every open RocksDB table by
  invoking the supplied callback once per table. Currently compiled out
  (MARIAROCKS_NOT_YET). Returns void, so errors cannot be reported to the
  caller (see the SHIP_ASSERT below).
*/
static void rocksdb_update_table_stats(
    /* per-table stats callback */
    void (*cb)(const char *db, const char *tbl, bool is_partition,
               my_io_perf_t *r, my_io_perf_t *w, my_io_perf_t *r_blob,
               my_io_perf_t *r_primary, my_io_perf_t *r_secondary,
               page_stats_t *page_stats, comp_stats_t *comp_stats,
               int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock,
               const char *engine)) {
  my_io_perf_t io_perf_read;
  my_io_perf_t io_perf_write;
  my_io_perf_t io_perf;
  page_stats_t page_stats;
  comp_stats_t comp_stats;
  uint lock_wait_timeout_stats;
  uint deadlock_stats;
  uint lock_wait_stats;
  std::vector<std::string> tablenames;

  /*
    Most of these are for innodb, so setting them to 0.
    TODO: possibly separate out primary vs. secondary index reads
  */
  memset(&io_perf, 0, sizeof(io_perf));
  memset(&page_stats, 0, sizeof(page_stats));
  memset(&comp_stats, 0, sizeof(comp_stats));
  memset(&io_perf_write, 0, sizeof(io_perf_write));

  tablenames = rdb_open_tables.get_table_names();

  for (const auto &it : tablenames) {
    Rdb_table_handler *table_handler;
    std::string str, dbname, tablename, partname;
    char dbname_sys[NAME_LEN + 1];
    char tablename_sys[NAME_LEN + 1];
    bool is_partition;

    if (rdb_normalize_tablename(it, &str) != HA_EXIT_SUCCESS) {
      /* Function needs to return void because of the interface and we've
       * detected an error which shouldn't happen. There's no way to let
       * caller know that something failed.
       */
      SHIP_ASSERT(false);
      return;
    }

    /* Skip names that do not split into db/table[/partition]. */
    if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname)) {
      continue;
    }

    is_partition = (partname.size() != 0);

    table_handler = rdb_open_tables.get_table_handler(it.c_str());
    if (table_handler == nullptr) {
      continue;
    }

    /* Copy the atomic counters out of the shared table handler. */
    io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load();
    io_perf_read.requests = table_handler->m_io_perf_read.requests.load();
    io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load();
    io_perf_write.requests = table_handler->m_io_perf_write.requests.load();
    lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load();
    deadlock_stats = table_handler->m_deadlock_counter.load();
    lock_wait_stats =
        table_handler->m_table_perf_context.m_value[PC_KEY_LOCK_WAIT_COUNT]
            .load();

    /*
      Convert from rocksdb timer to mysql timer. RocksDB values are
      in nanoseconds, but table statistics expect the value to be
      in my_timer format.
    */
    io_perf_read.svc_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time.load() / 1000);
    io_perf_read.svc_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time_max.load() / 1000);
    io_perf_read.wait_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time.load() / 1000);
    io_perf_read.wait_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time_max.load() / 1000);
    io_perf_read.slow_ios = table_handler->m_io_perf_read.slow_ios.load();
    rdb_open_tables.release_table_handler(table_handler);

    /*
      Table stats expects our database and table name to be in system encoding,
      not filename format. Convert before calling callback.
    */
    my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
                                   sizeof(dbname_sys));
    my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
                                   sizeof(tablename_sys));
    (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read,
          &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats,
          &comp_stats, lock_wait_stats, lock_wait_timeout_stats, deadlock_stats,
          rocksdb_hton_name);
  }
}
5145 #endif
5146 static rocksdb::Status check_rocksdb_options_compatibility(
5147 const char *const dbpath, const rocksdb::Options &main_opts,
5148 const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
5149 DBUG_ASSERT(rocksdb_datadir != nullptr);
5150
5151 rocksdb::DBOptions loaded_db_opt;
5152 std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
5153 rocksdb::Status status =
5154 LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt,
5155 &loaded_cf_descs, rocksdb_ignore_unknown_options);
5156
5157 // If we're starting from scratch and there are no options saved yet then this
5158 // is a valid case. Therefore we can't compare the current set of options to
5159 // anything.
5160 if (status.IsNotFound()) {
5161 return rocksdb::Status::OK();
5162 }
5163
5164 if (!status.ok()) {
5165 return status;
5166 }
5167
5168 if (loaded_cf_descs.size() != cf_descr.size()) {
5169 return rocksdb::Status::NotSupported(
5170 "Mismatched size of column family "
5171 "descriptors.");
5172 }
5173
5174 // Please see RocksDB documentation for more context about why we need to set
5175 // user-defined functions and pointer-typed options manually.
5176 for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
5177 loaded_cf_descs[i].options.compaction_filter =
5178 cf_descr[i].options.compaction_filter;
5179 loaded_cf_descs[i].options.compaction_filter_factory =
5180 cf_descr[i].options.compaction_filter_factory;
5181 loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
5182 loaded_cf_descs[i].options.memtable_factory =
5183 cf_descr[i].options.memtable_factory;
5184 loaded_cf_descs[i].options.merge_operator =
5185 cf_descr[i].options.merge_operator;
5186 loaded_cf_descs[i].options.prefix_extractor =
5187 cf_descr[i].options.prefix_extractor;
5188 loaded_cf_descs[i].options.table_factory =
5189 cf_descr[i].options.table_factory;
5190 }
5191
5192 // This is the essence of the function - determine if it's safe to open the
5193 // database or not.
5194 status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
5195 loaded_cf_descs,
5196 rocksdb_ignore_unknown_options);
5197
5198 return status;
5199 }
5200
/* Set to true on unload (rocksdb_done_func); re-loading the plugin within the
   same server process is not supported, so rocksdb_init_func rejects it. */
bool prevent_myrocks_loading= false;
5202
5203
5204 /*
5205 Storage Engine initialization function, invoked when plugin is loaded.
5206 */
5207
5208 static int rocksdb_init_func(void *const p) {
5209
5210 DBUG_ENTER_FUNC();
5211
5212 if (prevent_myrocks_loading)
5213 {
5214 my_error(ER_INTERNAL_ERROR, MYF(0),
5215 "Loading MyRocks plugin after it has been unloaded is not "
5216 "supported. Please restart mysqld");
5217 DBUG_RETURN(1);
5218 }
5219
5220 if (rocksdb_ignore_datadic_errors)
5221 {
5222 sql_print_information(
5223 "CAUTION: Running with rocksdb_ignore_datadic_errors=1. "
5224 " This should only be used to perform repairs");
5225 }
5226
5227 if (rdb_check_rocksdb_corruption()) {
5228 // NO_LINT_DEBUG
5229 sql_print_error(
5230 "RocksDB: There was a corruption detected in RockDB files. "
5231 "Check error log emitted earlier for more details.");
5232 if (rocksdb_allow_to_start_after_corruption) {
5233 // NO_LINT_DEBUG
5234 sql_print_information(
5235 "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent "
5236 "server operating if RocksDB corruption is detected.");
5237 } else {
5238 // NO_LINT_DEBUG
5239 sql_print_error(
5240 "RocksDB: The server will exit normally and stop restart "
5241 "attempts. Remove %s file from data directory and "
5242 "start mysqld manually.",
5243 rdb_corruption_marker_file_name().c_str());
5244 exit(0);
5245 }
5246 }
5247
5248 // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
5249 static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");
5250
5251 init_rocksdb_psi_keys();
5252
5253 rocksdb_hton = (handlerton *)p;
5254
5255 rdb_open_tables.init();
5256 Ensure_cleanup rdb_open_tables_cleanup([]() { rdb_open_tables.free(); });
5257
5258 #ifdef HAVE_PSI_INTERFACE
5259 rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
5260 rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
5261 rdb_signal_drop_idx_psi_cond_key);
5262 rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key);
5263 #else
5264 rdb_bg_thread.init();
5265 rdb_drop_idx_thread.init();
5266 rdb_mc_thread.init();
5267 #endif
5268 mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
5269 MY_MUTEX_INIT_FAST);
5270 mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
5271 MY_MUTEX_INIT_FAST);
5272
5273 const char* initial_rocksdb_datadir_for_ignore_dirs= rocksdb_datadir;
5274 if (!strncmp(rocksdb_datadir, "./", 2))
5275 initial_rocksdb_datadir_for_ignore_dirs += 2;
5276 ignore_db_dirs_append(initial_rocksdb_datadir_for_ignore_dirs);
5277
5278 #if defined(HAVE_PSI_INTERFACE)
5279 rdb_collation_exceptions =
5280 new Regex_list_handler(key_rwlock_collation_exception_list);
5281 #else
5282 rdb_collation_exceptions = new Regex_list_handler();
5283 #endif
5284
5285 mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
5286 MY_MUTEX_INIT_FAST);
5287 mysql_mutex_init(rdb_block_cache_resize_mutex_key,
5288 &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST);
5289 Rdb_transaction::init_mutex();
5290
5291 rocksdb_hton->state = SHOW_OPTION_YES;
5292 rocksdb_hton->create = rocksdb_create_handler;
5293 rocksdb_hton->close_connection = rocksdb_close_connection;
5294
5295 rocksdb_hton->prepare = rocksdb_prepare;
5296 rocksdb_hton->prepare_ordered = NULL; // Do not need it
5297
5298 rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
5299 rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
5300 rocksdb_hton->recover = rocksdb_recover;
5301
5302 rocksdb_hton->commit_ordered= rocksdb_commit_ordered;
5303 rocksdb_hton->commit = rocksdb_commit;
5304
5305 rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request;
5306
5307 rocksdb_hton->rollback = rocksdb_rollback;
5308 rocksdb_hton->show_status = rocksdb_show_status;
5309 #ifdef MARIADB_NOT_YET
5310 rocksdb_hton->explicit_snapshot = rocksdb_explicit_snapshot;
5311 #endif
5312 rocksdb_hton->start_consistent_snapshot =
5313 rocksdb_start_tx_and_assign_read_view;
5314 #ifdef MARIADB_NOT_YET
5315 rocksdb_hton->start_shared_snapshot = rocksdb_start_tx_with_shared_read_view;
5316 #endif
5317 rocksdb_hton->savepoint_set = rocksdb_savepoint;
5318 rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
5319 rocksdb_hton->savepoint_rollback_can_release_mdl =
5320 rocksdb_rollback_to_savepoint_can_release_mdl;
5321 #ifdef MARIAROCKS_NOT_YET
5322 rocksdb_hton->update_table_stats = rocksdb_update_table_stats;
5323 #endif // MARIAROCKS_NOT_YET
5324
5325 /*
5326 Not needed in MariaDB:
5327 rocksdb_hton->flush_logs = rocksdb_flush_wal;
5328 rocksdb_hton->handle_single_table_select = rocksdb_handle_single_table_select;
5329
5330 */
5331
5332 rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
5333 HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;
5334
5335 rocksdb_hton->tablefile_extensions= ha_rocksdb_exts;
5336 DBUG_ASSERT(!mysqld_embedded);
5337
5338 if (rocksdb_db_options->max_open_files > (long)open_files_limit) {
5339 // NO_LINT_DEBUG
5340 sql_print_information(
5341 "RocksDB: rocksdb_max_open_files should not be "
5342 "greater than the open_files_limit, effective value "
5343 "of rocksdb_max_open_files is being set to "
5344 "open_files_limit / 2.");
5345 rocksdb_db_options->max_open_files = open_files_limit / 2;
5346 } else if (rocksdb_db_options->max_open_files == -2) {
5347 rocksdb_db_options->max_open_files = open_files_limit / 2;
5348 }
5349
5350 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
5351 rdb_read_free_regex_handler.set_patterns(DEFAULT_READ_FREE_RPL_TABLES);
5352 #endif
5353
5354 rocksdb_stats = rocksdb::CreateDBStatistics();
5355 rocksdb_stats->set_stats_level(
5356 static_cast<rocksdb::StatsLevel>(rocksdb_stats_level));
5357 rocksdb_stats_level = rocksdb_stats->get_stats_level();
5358 rocksdb_db_options->statistics = rocksdb_stats;
5359
5360 if (rocksdb_rate_limiter_bytes_per_sec != 0) {
5361 rocksdb_rate_limiter.reset(
5362 rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
5363 rocksdb_db_options->rate_limiter = rocksdb_rate_limiter;
5364 }
5365
5366 rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate;
5367
5368 std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
5369 rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
5370 rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log);
5371 if (s.ok()) {
5372 myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log);
5373 }
5374
5375 rocksdb_db_options->info_log = myrocks_logger;
5376 myrocks_logger->SetInfoLogLevel(
5377 static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
5378 rocksdb_db_options->wal_dir = rocksdb_wal_dir;
5379
5380 rocksdb_db_options->wal_recovery_mode =
5381 static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);
5382
5383 rocksdb_db_options->access_hint_on_compaction_start =
5384 static_cast<rocksdb::Options::AccessHint>(
5385 rocksdb_access_hint_on_compaction_start);
5386
5387 if (rocksdb_db_options->allow_mmap_reads &&
5388 rocksdb_db_options->use_direct_reads) {
5389 // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
5390 // mmap_reads and direct_reads are both on. (NO_LINT_DEBUG)
5391 sql_print_error(
5392 "RocksDB: Can't enable both use_direct_reads "
5393 "and allow_mmap_reads\n");
5394 DBUG_RETURN(HA_EXIT_FAILURE);
5395 }
5396
5397 // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT
5398 if (rocksdb_db_options->use_direct_reads ||
5399 rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5400 rocksdb::EnvOptions soptions;
5401 rocksdb::Status check_status;
5402 rocksdb::Env *const env = rocksdb_db_options->env;
5403
5404 std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir);
5405 if (env->FileExists(fname).ok()) {
5406 std::unique_ptr<rocksdb::SequentialFile> file;
5407 soptions.use_direct_reads = true;
5408 check_status = env->NewSequentialFile(fname, &file, soptions);
5409 } else {
5410 std::unique_ptr<rocksdb::WritableFile> file;
5411 soptions.use_direct_writes = true;
5412 check_status = env->ReopenWritableFile(fname, &file, soptions);
5413 if (file != nullptr) {
5414 file->Close();
5415 }
5416 env->DeleteFile(fname);
5417 }
5418
5419 if (!check_status.ok()) {
5420 // NO_LINT_DEBUG
5421 sql_print_error(
5422 "RocksDB: Unable to use direct io in rocksdb-datadir:"
5423 "(%s)",
5424 check_status.getState());
5425 DBUG_RETURN(HA_EXIT_FAILURE);
5426 }
5427 }
5428
5429 if (rocksdb_db_options->allow_mmap_writes &&
5430 rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
5431 // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
5432 sql_print_error(
5433 "RocksDB: Can't enable both "
5434 "use_direct_io_for_flush_and_compaction and "
5435 "allow_mmap_writes\n");
5436 DBUG_RETURN(HA_EXIT_FAILURE);
5437 }
5438
5439 if (rocksdb_db_options->allow_mmap_writes &&
5440 rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
5441 // NO_LINT_DEBUG
5442 sql_print_error(
5443 "RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 "
5444 "to use allow_mmap_writes");
5445 DBUG_RETURN(HA_EXIT_FAILURE);
5446 }
5447
5448 // sst_file_manager will move deleted rocksdb sst files to trash_dir
5449 // to be deleted in a background thread.
5450 std::string trash_dir = std::string(rocksdb_datadir) + "/trash";
5451 rocksdb_db_options->sst_file_manager.reset(NewSstFileManager(
5452 rocksdb_db_options->env, myrocks_logger, trash_dir,
5453 rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */));
5454
5455 std::vector<std::string> cf_names;
5456 rocksdb::Status status;
5457 status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir,
5458 &cf_names);
5459 if (!status.ok()) {
5460 /*
5461 When we start on an empty datadir, ListColumnFamilies returns IOError,
5462 and RocksDB doesn't provide any way to check what kind of error it was.
5463 Checking system errno happens to work right now.
5464 */
5465 if (status.IsIOError()
5466 #ifndef _WIN32
5467 && errno == ENOENT
5468 #endif
5469 ) {
5470 sql_print_information("RocksDB: Got ENOENT when listing column families");
5471
5472 // NO_LINT_DEBUG
5473 sql_print_information(
5474 "RocksDB: assuming that we're creating a new database");
5475 } else {
5476 rdb_log_status_error(status, "Error listing column families");
5477 DBUG_RETURN(HA_EXIT_FAILURE);
5478 }
5479 } else {
5480 // NO_LINT_DEBUG
5481 sql_print_information("RocksDB: %ld column families found",
5482 cf_names.size());
5483 }
5484
5485 std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
5486 std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;
5487
5488 rocksdb_tbl_options->index_type =
5489 (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;
5490
5491 if (!rocksdb_tbl_options->no_block_cache) {
5492 std::shared_ptr<rocksdb::MemoryAllocator> memory_allocator;
5493 if (!rocksdb_cache_dump) {
5494 size_t block_size = rocksdb_tbl_options->block_size;
5495 rocksdb::JemallocAllocatorOptions alloc_opt;
5496 // Limit jemalloc tcache memory usage. The range
5497 // [block_size/4, block_size] should be enough to cover most of
5498 // block cache allocation sizes.
5499 alloc_opt.limit_tcache_size = true;
5500 alloc_opt.tcache_size_lower_bound = block_size / 4;
5501 alloc_opt.tcache_size_upper_bound = block_size;
5502 rocksdb::Status new_alloc_status =
5503 rocksdb::NewJemallocNodumpAllocator(alloc_opt, &memory_allocator);
5504 if (!new_alloc_status.ok()) {
5505 // Fallback to use default malloc/free.
5506 rdb_log_status_error(new_alloc_status,
5507 "Error excluding block cache from core dump");
5508 memory_allocator = nullptr;
5509 DBUG_RETURN(HA_EXIT_FAILURE);
5510 }
5511 }
5512 std::shared_ptr<rocksdb::Cache> block_cache =
5513 rocksdb_use_clock_cache
5514 ? rocksdb::NewClockCache(rocksdb_block_cache_size)
5515 : rocksdb::NewLRUCache(
5516 rocksdb_block_cache_size, -1 /*num_shard_bits*/,
5517 false /*strict_capcity_limit*/,
5518 rocksdb_cache_high_pri_pool_ratio, memory_allocator);
5519 if (rocksdb_sim_cache_size > 0) {
5520 // Simulated cache enabled
5521 // Wrap block cache inside a simulated cache and pass it to RocksDB
5522 rocksdb_tbl_options->block_cache =
5523 rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6);
5524 } else {
5525 // Pass block cache to RocksDB
5526 rocksdb_tbl_options->block_cache = block_cache;
5527 }
5528 }
5529 // Using newer BlockBasedTable format version for better compression
5530 // and better memory allocation.
5531 // See:
5532 // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
5533 rocksdb_tbl_options->format_version = 2;
5534
5535 if (rocksdb_collect_sst_properties) {
5536 properties_collector_factory =
5537 std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);
5538
5539 rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);
5540
5541 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
5542
5543 DBUG_ASSERT(rocksdb_table_stats_sampling_pct <=
5544 RDB_TBL_STATS_SAMPLE_PCT_MAX);
5545 properties_collector_factory->SetTableStatsSamplingPct(
5546 rocksdb_table_stats_sampling_pct);
5547
5548 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
5549 }
5550
5551 if (rocksdb_persistent_cache_size_mb > 0) {
5552 std::shared_ptr<rocksdb::PersistentCache> pcache;
5553 uint64_t cache_size_bytes = rocksdb_persistent_cache_size_mb * 1024 * 1024;
5554 status = rocksdb::NewPersistentCache(
5555 rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
5556 cache_size_bytes, myrocks_logger, true, &pcache);
5557 if (!status.ok()) {
5558 // NO_LINT_DEBUG
5559 sql_print_error("RocksDB: Persistent cache returned error: (%s)",
5560 status.getState());
5561 DBUG_RETURN(HA_EXIT_FAILURE);
5562 }
5563 rocksdb_tbl_options->persistent_cache = pcache;
5564 } else if (strlen(rocksdb_persistent_cache_path)) {
5565 // NO_LINT_DEBUG
5566 sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb");
5567 DBUG_RETURN(HA_EXIT_FAILURE);
5568 }
5569
5570 std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options());
5571 if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory,
5572 rocksdb_default_cf_options,
5573 rocksdb_override_cf_options)) {
5574 // NO_LINT_DEBUG
5575 sql_print_error("RocksDB: Failed to initialize CF options map.");
5576 DBUG_RETURN(HA_EXIT_FAILURE);
5577 }
5578
5579 /*
5580 If there are no column families, we're creating the new database.
5581 Create one column family named "default".
5582 */
5583 if (cf_names.size() == 0) cf_names.push_back(DEFAULT_CF_NAME);
5584
5585 std::vector<int> compaction_enabled_cf_indices;
5586
5587 // NO_LINT_DEBUG
5588 sql_print_information("RocksDB: Column Families at start:");
5589 for (size_t i = 0; i < cf_names.size(); ++i) {
5590 rocksdb::ColumnFamilyOptions opts;
5591 cf_options_map->get_cf_options(cf_names[i], &opts);
5592
5593 // NO_LINT_DEBUG
5594 sql_print_information(" cf=%s", cf_names[i].c_str());
5595
5596 // NO_LINT_DEBUG
5597 sql_print_information(" write_buffer_size=%ld", opts.write_buffer_size);
5598
5599 // NO_LINT_DEBUG
5600 sql_print_information(" target_file_size_base=%" PRIu64,
5601 opts.target_file_size_base);
5602
5603 /*
5604 Temporarily disable compactions to prevent a race condition where
5605 compaction starts before compaction filter is ready.
5606 */
5607 if (!opts.disable_auto_compactions) {
5608 compaction_enabled_cf_indices.push_back(i);
5609 opts.disable_auto_compactions = true;
5610 }
5611 cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
5612 }
5613
5614 rocksdb::Options main_opts(*rocksdb_db_options,
5615 cf_options_map->get_defaults());
5616
5617 rocksdb::TransactionDBOptions tx_db_options;
5618 tx_db_options.transaction_lock_timeout = 2000; // 2 seconds
5619 tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
5620 tx_db_options.write_policy =
5621 static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
5622
5623 status =
5624 check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
5625
5626 // We won't start if we'll determine that there's a chance of data corruption
5627 // because of incompatible options.
5628 if (!status.ok()) {
5629 rdb_log_status_error(
5630 status, "Compatibility check against existing database options failed");
5631 DBUG_RETURN(HA_EXIT_FAILURE);
5632 }
5633
5634 status = rocksdb::TransactionDB::Open(
5635 main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);
5636
5637 if (!status.ok()) {
5638 rdb_log_status_error(status, "Error opening instance");
5639 DBUG_RETURN(HA_EXIT_FAILURE);
5640 }
5641 cf_manager.init(std::move(cf_options_map), &cf_handles);
5642
5643 if (dict_manager.init(rdb, &cf_manager)) {
5644 // NO_LINT_DEBUG
5645 sql_print_error("RocksDB: Failed to initialize data dictionary.");
5646 DBUG_RETURN(HA_EXIT_FAILURE);
5647 }
5648
5649 if (binlog_manager.init(&dict_manager)) {
5650 // NO_LINT_DEBUG
5651 sql_print_error("RocksDB: Failed to initialize binlog manager.");
5652 DBUG_RETURN(HA_EXIT_FAILURE);
5653 }
5654
5655 if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) {
5656 // NO_LINT_DEBUG
5657 sql_print_error("RocksDB: Failed to initialize DDL manager.");
5658
5659 if (rocksdb_ignore_datadic_errors)
5660 {
5661 sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
5662 "trying to continue");
5663 }
5664 else
5665 DBUG_RETURN(HA_EXIT_FAILURE);
5666 }
5667
5668 Rdb_sst_info::init(rdb);
5669
5670 /*
5671 Enable auto compaction, things needed for compaction filter are finished
5672 initializing
5673 */
5674 std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
5675 compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
5676 for (const auto &index : compaction_enabled_cf_indices) {
5677 compaction_enabled_cf_handles.push_back(cf_handles[index]);
5678 }
5679
5680 status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);
5681
5682 if (!status.ok()) {
5683 rdb_log_status_error(status, "Error enabling compaction");
5684 DBUG_RETURN(HA_EXIT_FAILURE);
5685 }
5686
5687 #ifndef HAVE_PSI_INTERFACE
5688 auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME);
5689 #else
5690 auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME,
5691 rdb_background_psi_thread_key);
5692 #endif
5693 if (err != 0) {
5694 // NO_LINT_DEBUG
5695 sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
5696 err);
5697 DBUG_RETURN(HA_EXIT_FAILURE);
5698 }
5699
5700 #ifndef HAVE_PSI_INTERFACE
5701 err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME);
5702 #else
5703 err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME,
5704 rdb_drop_idx_psi_thread_key);
5705 #endif
5706 if (err != 0) {
5707 // NO_LINT_DEBUG
5708 sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
5709 err);
5710 DBUG_RETURN(HA_EXIT_FAILURE);
5711 }
5712
5713 err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME
5714 #ifdef HAVE_PSI_INTERFACE
5715 ,
5716 rdb_mc_psi_thread_key
5717 #endif
5718 );
5719 if (err != 0) {
5720 // NO_LINT_DEBUG
5721 sql_print_error(
5722 "RocksDB: Couldn't start the manual compaction thread: (errno=%d)",
5723 err);
5724 DBUG_RETURN(HA_EXIT_FAILURE);
5725 }
5726
5727 rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);
5728
5729 if (rocksdb_pause_background_work) {
5730 rdb->PauseBackgroundWork();
5731 }
5732
5733 // NO_LINT_DEBUG
5734 sql_print_information("RocksDB: global statistics using %s indexer",
5735 STRINGIFY_ARG(RDB_INDEXER));
5736 #if defined(HAVE_SCHED_GETCPU)
5737 if (sched_getcpu() == -1) {
5738 // NO_LINT_DEBUG
5739 sql_print_information(
5740 "RocksDB: sched_getcpu() failed - "
5741 "global statistics will use thread_id_indexer_t instead");
5742 }
5743 #endif
5744
5745 err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST,
5746 HA_ERR_ROCKSDB_LAST);
5747 if (err != 0) {
5748 // NO_LINT_DEBUG
5749 sql_print_error("RocksDB: Couldn't initialize error messages");
5750 DBUG_RETURN(HA_EXIT_FAILURE);
5751 }
5752
5753
5754
5755 // Creating an instance of HistogramImpl should only happen after RocksDB
5756 // has been successfully initialized.
5757 commit_latency_stats = new rocksdb::HistogramImpl();
5758
5759 // Construct a list of directories which will be monitored by I/O watchdog
5760 // to make sure that we won't lose write access to them.
5761 std::vector<std::string> directories;
5762
5763 // 1. Data directory.
5764 directories.push_back(mysql_real_data_home);
5765
5766 // 2. Transaction logs.
5767 if (myrocks::rocksdb_wal_dir && *myrocks::rocksdb_wal_dir) {
5768 directories.push_back(myrocks::rocksdb_wal_dir);
5769 }
5770
5771 #if !defined(_WIN32) && !defined(__APPLE__)
5772 io_watchdog = new Rdb_io_watchdog(std::move(directories));
5773 io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
5774 #endif
5775
5776 // NO_LINT_DEBUG
5777 sql_print_information(
5778 "MyRocks storage engine plugin has been successfully "
5779 "initialized.");
5780
5781 // Skip cleaning up rdb_open_tables as we've succeeded
5782 rdb_open_tables_cleanup.skip();
5783
5784 DBUG_RETURN(HA_EXIT_SUCCESS);
5785 }
5786
5787 /*
5788 Storage Engine deinitialization function, invoked when plugin is unloaded.
5789 */
5790
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  int error = 0;

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish. Same best-effort policy as
  // above: log and continue on failure.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
  }

  // signal the manual compaction thread to stop
  rdb_mc_thread.signal(true);
  // Wait for the manual compaction thread to finish.
  err = rdb_mc_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err);
  }

  // Report failure to the caller, but keep tearing down anyway: we are being
  // unloaded regardless.
  if (rdb_open_tables.count()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind.
    error = 1;
  }

  rdb_open_tables.free();
  /*
    destructors for static objects can be called at _exit(),
    but we want to free the memory at dlclose()
  */
  // MARIADB_MERGE_2019: rdb_open_tables.m_hash.~Rdb_table_set();
  mysql_mutex_destroy(&rdb_sysvars_mutex);
  mysql_mutex_destroy(&rdb_block_cache_resize_mutex);


  delete rdb_collation_exceptions;

  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  // Tear down the managers in reverse dependency order (ddl depends on dict,
  // dict on the cf manager) before closing the database itself.
  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

#if !defined(_WIN32) && !defined(__APPLE__)
  delete io_watchdog;
  io_watchdog = nullptr;
#endif

  // Disown the cache data since we're shutting down.
  // This results in memory leaks but it improved the shutdown time.
  // Don't disown when running under valgrind
#ifndef HAVE_valgrind
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif /* HAVE_valgrind */

  /*
    MariaDB: don't clear rocksdb_db_options and rocksdb_tbl_options.
    MyRocks' plugin variables refer to them.

    The plugin cannot be loaded again (see prevent_myrocks_loading) but plugin
    variables are processed before myrocks::rocksdb_init_func is invoked, so
    they must point to valid memory.
  */
  //rocksdb_db_options = nullptr;
  rocksdb_db_options->statistics = nullptr;
  //rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_free(rocksdb_update_cf_options);
  rocksdb_update_cf_options = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  /*
    Prevent loading the plugin after it has been loaded and then unloaded. This
    doesn't work currently.
  */
  prevent_myrocks_loading= true;

  DBUG_RETURN(error);
}
5918
5919 static inline void rocksdb_smart_seek(bool seek_backward,
5920 rocksdb::Iterator *const iter,
5921 const rocksdb::Slice &key_slice) {
5922 if (seek_backward) {
5923 iter->SeekForPrev(key_slice);
5924 } else {
5925 iter->Seek(key_slice);
5926 }
5927 }
5928
5929 static inline void rocksdb_smart_next(bool seek_backward,
5930 rocksdb::Iterator *const iter) {
5931 if (seek_backward) {
5932 iter->Prev();
5933 } else {
5934 iter->Next();
5935 }
5936 }
5937
#ifndef DBUG_OFF
// simulate that RocksDB has reported corrupted data
// (debug-only helper; invoked via the "rocksdb_return_status_corrupted"
// DBUG injection point in is_valid())
static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
  *status = rocksdb::Status::Corruption();
}
#endif
5944
5945 // If the iterator is not valid it might be because of EOF but might be due
5946 // to IOError or corruption. The good practice is always check it.
5947 // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
5948 static inline bool is_valid(rocksdb::Iterator *scan_it) {
5949 if (scan_it->Valid()) {
5950 return true;
5951 } else {
5952 rocksdb::Status s = scan_it->status();
5953 DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
5954 dbug_change_status_to_corrupted(&s););
5955 if (s.IsIOError() || s.IsCorruption()) {
5956 if (s.IsCorruption()) {
5957 rdb_persist_corruption_marker();
5958 }
5959 rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
5960 }
5961 return false;
5962 }
5963 }
5964
5965 /**
5966 @brief
5967 Example of simple lock controls. The "table_handler" it creates is a
5968 structure we will pass to each ha_rocksdb handler. Do you have to have
5969 one of these? Well, you have pieces that are used for locking, and
5970 they are needed to function.
5971 */
5972
Rdb_table_handler *Rdb_open_tables_map::get_table_handler(
    const char *const table_name) {
  DBUG_ASSERT(table_name != nullptr);

  Rdb_table_handler *table_handler;

  std::string table_name_str(table_name);

  // First, look up the table in the hash map.
  RDB_MUTEX_LOCK_CHECK(m_mutex);
  const auto it = m_table_map.find(table_name_str);
  if (it != m_table_map.end()) {
    // Found it
    table_handler = it->second;
  } else {
    char *tmp_name;

    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    // my_multi_malloc allocates the Rdb_table_handler struct and the buffer
    // for its copy of the table name in a single zero-filled chunk; it stores
    // the sub-pointers through &table_handler and &tmp_name, and the outer
    // assignment keeps the chunk's base address (which is also the struct).
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
              MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
              &tmp_name, table_name_str.length() + 1, NullS)))) {
      // Allocating a new Rdb_table_handler and a new table name failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      return nullptr;
    }

    table_handler->m_ref_count = 0;
    table_handler->m_table_name_length = table_name_str.length();
    table_handler->m_table_name = tmp_name;
    strmov(table_handler->m_table_name, table_name);

    m_table_map.emplace(table_name_str, table_handler);

    thr_lock_init(&table_handler->m_thr_lock);
#ifdef MARIAROCKS_NOT_YET
    table_handler->m_io_perf_read.init();
    table_handler->m_io_perf_write.init();
#endif
  }
  DBUG_ASSERT(table_handler->m_ref_count >= 0);
  // Count this caller as a user; release_table_handler() drops the reference.
  table_handler->m_ref_count++;

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);

  return table_handler;
}
6020
6021 std::vector<std::string> rdb_get_open_table_names(void) {
6022 return rdb_open_tables.get_table_names();
6023 }
6024
6025 std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
6026 const Rdb_table_handler *table_handler;
6027 std::vector<std::string> names;
6028
6029 RDB_MUTEX_LOCK_CHECK(m_mutex);
6030 for (const auto &kv : m_table_map) {
6031 table_handler = kv.second;
6032 DBUG_ASSERT(table_handler != nullptr);
6033 names.push_back(table_handler->m_table_name);
6034 }
6035 RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6036
6037 return names;
6038 }
6039
6040 /*
6041 Inspired by innobase_get_int_col_max_value from InnoDB. This returns the
6042 maximum value a type can take on.
6043 */
6044 static ulonglong rdb_get_int_col_max_value(const Field *field) {
6045 ulonglong max_value = 0;
6046 switch (field->key_type()) {
6047 case HA_KEYTYPE_BINARY:
6048 max_value = 0xFFULL;
6049 break;
6050 case HA_KEYTYPE_INT8:
6051 max_value = 0x7FULL;
6052 break;
6053 case HA_KEYTYPE_USHORT_INT:
6054 max_value = 0xFFFFULL;
6055 break;
6056 case HA_KEYTYPE_SHORT_INT:
6057 max_value = 0x7FFFULL;
6058 break;
6059 case HA_KEYTYPE_UINT24:
6060 max_value = 0xFFFFFFULL;
6061 break;
6062 case HA_KEYTYPE_INT24:
6063 max_value = 0x7FFFFFULL;
6064 break;
6065 case HA_KEYTYPE_ULONG_INT:
6066 max_value = 0xFFFFFFFFULL;
6067 break;
6068 case HA_KEYTYPE_LONG_INT:
6069 max_value = 0x7FFFFFFFULL;
6070 break;
6071 case HA_KEYTYPE_ULONGLONG:
6072 max_value = 0xFFFFFFFFFFFFFFFFULL;
6073 break;
6074 case HA_KEYTYPE_LONGLONG:
6075 max_value = 0x7FFFFFFFFFFFFFFFULL;
6076 break;
6077 case HA_KEYTYPE_FLOAT:
6078 max_value = 0x1000000ULL;
6079 break;
6080 case HA_KEYTYPE_DOUBLE:
6081 max_value = 0x20000000000000ULL;
6082 break;
6083 default:
6084 abort();
6085 }
6086
6087 return max_value;
6088 }
6089
/*
  Initialize m_tbl_def->m_auto_incr_val on first use of the table.

  Sources, in order of preference:
    1. the persisted value in the data dictionary,
    2. the largest value found in the index (index_last scan),
    3. 1, if the table is empty and no value was persisted.

  In debug builds the index scan is also run to validate that the
  dictionary value is not behind the table contents.
*/
void ha_rocksdb::load_auto_incr_value() {
  ulonglong auto_incr = 0;
  bool validate_last = false, use_datadic = true;
#ifndef DBUG_OFF
  // Debug hook: pretend the data dictionary has no persisted value.
  DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;);
  // Debug builds always cross-check against the index (see below).
  validate_last = true;
#endif

  if (use_datadic && dict_manager.get_auto_incr_val(
                         m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
    update_auto_incr_val(auto_incr);
  }

  // If we find nothing in the data dictionary, or if we are in debug mode,
  // then call index_last to get the last value.
  //
  // This is needed when upgrading from a server that did not support
  // persistent auto_increment, of if the table is empty.
  //
  // For debug mode, we are just verifying that the data dictionary value is
  // greater than or equal to the maximum value in the table.
  if (auto_incr == 0 || validate_last) {
    auto_incr = load_auto_incr_value_from_index();
    update_auto_incr_val(auto_incr);
  }

  // If we failed to find anything from the data dictionary and index, then
  // initialize auto_increment to 1.
  if (m_tbl_def->m_auto_incr_val == 0) {
    update_auto_incr_val(1);
  }
}
6122
/*
  Find the last (largest) value currently stored in the auto-increment
  index and return last_value + 1 (capped at the column type's maximum).

  Temporarily repoints active_index at the auto-increment index and does an
  index-only index_last() scan; all handler state touched here is restored
  before returning. Returns 0 if the index is empty.
*/
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  // Use an existing snapshot if the transaction has one; otherwise take a
  // temporary one and release it at the end.
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_converter->set_is_key_requested(true);

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    // Temporarily mark all columns readable so val_int() is allowed.
    MY_BITMAP *const old_map = dbug_tmp_use_all_columns(table, &table->read_set);
    last_val = field->val_int();
    if (last_val != max_val) {
      last_val++;
    }
#ifndef DBUG_OFF
    // Debug-only validation: the persisted dictionary value must not be
    // behind the index, unless this transaction has pending increments.
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        DBUG_ASSERT(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(&table->read_set, old_map);
  }

  // Restore all handler state modified above.
  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();

  return last_val;
}
6181
/*
  Raise m_tbl_def->m_auto_incr_val to at least 'val', lock-free.
  Monotonic: the stored value never decreases. compare_exchange_weak
  reloads auto_incr_val on failure, so the loop re-tests the condition
  against the freshest value each iteration.
*/
void ha_rocksdb::update_auto_incr_val(ulonglong val) {
  ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
  while (
      auto_incr_val < val &&
      !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
    // Do nothing - just loop until auto_incr_val is >= val or we successfully
    // set it
  }
}
6191
/*
  After a row write, advance the table's auto-increment counter past the
  value stored in the auto-increment column of the current row. The new
  value is also recorded in the transaction so it can be persisted to the
  data dictionary at commit.
*/
void ha_rocksdb::update_auto_incr_val_from_field() {
  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Temporarily mark all columns readable so val_int() is allowed.
  MY_BITMAP *const old_map =
      dbug_tmp_use_all_columns(table, &table->read_set);
  new_val = field->val_int();
  // don't increment if we would wrap around
  if (new_val != max_val) {
    new_val++;
  }

  dbug_tmp_restore_column_map(&table->read_set, old_map);

  // Only update if positive value was set for auto_incr column.
  // (new_val > max_val can only happen via the wrap-around case above.)
  if (new_val <= max_val) {
    Rdb_transaction *const tx = get_or_create_tx(table->in_use);
    tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val);

    // Update the in memory auto_incr value in m_tbl_def.
    update_auto_incr_val(new_val);
  }
}
6217
6218 int ha_rocksdb::load_hidden_pk_value() {
6219 const int save_active_index = active_index;
6220 active_index = m_tbl_def->m_key_count - 1;
6221 const uint8 save_table_status = table->status;
6222
6223 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
6224 const bool is_new_snapshot = !tx->has_snapshot();
6225
6226 longlong hidden_pk_id = 1;
6227 // Do a lookup.
6228 if (!index_last(table->record[0])) {
6229 /*
6230 Decode PK field from the key
6231 */
6232 auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
6233 if (err) {
6234 if (is_new_snapshot) {
6235 tx->release_snapshot();
6236 }
6237 return err;
6238 }
6239
6240 hidden_pk_id++;
6241 }
6242
6243 longlong old = m_tbl_def->m_hidden_pk_val;
6244 while (old < hidden_pk_id &&
6245 !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
6246 }
6247
6248 if (is_new_snapshot) {
6249 tx->release_snapshot();
6250 }
6251
6252 table->status = save_table_status;
6253 active_index = save_active_index;
6254
6255 release_scan_iterator();
6256
6257 return HA_EXIT_SUCCESS;
6258 }
6259
6260 /* Get PK value from m_tbl_def->m_hidden_pk_info. */
6261 longlong ha_rocksdb::update_hidden_pk_val() {
6262 DBUG_ASSERT(has_hidden_pk(table));
6263 const longlong new_val = m_tbl_def->m_hidden_pk_val++;
6264 return new_val;
6265 }
6266
6267 /* Get the id of the hidden pk id from m_last_rowkey */
6268 int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
6269 DBUG_ASSERT(table != nullptr);
6270 DBUG_ASSERT(has_hidden_pk(table));
6271
6272 rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
6273
6274 // Get hidden primary key from old key slice
6275 Rdb_string_reader reader(&rowkey_slice);
6276 if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) {
6277 return HA_ERR_ROCKSDB_CORRUPT_DATA;
6278 }
6279
6280 const int length= 8; /* was Field_longlong::PACK_LENGTH in FB MySQL tree */
6281 const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
6282 if (from == nullptr) {
6283 /* Mem-comparable image doesn't have enough bytes */
6284 return HA_ERR_ROCKSDB_CORRUPT_DATA;
6285 }
6286
6287 *hidden_pk_id = rdb_netbuf_read_uint64(&from);
6288 return HA_EXIT_SUCCESS;
6289 }
6290
6291 /**
6292 @brief
6293 Free lock controls. We call this whenever we close a table. If the table had
6294 the last reference to the table_handler, then we free the memory associated
6295 with it.
6296 */
6297
6298 void Rdb_open_tables_map::release_table_handler(
6299 Rdb_table_handler *const table_handler) {
6300 RDB_MUTEX_LOCK_CHECK(m_mutex);
6301
6302 DBUG_ASSERT(table_handler != nullptr);
6303 DBUG_ASSERT(table_handler->m_ref_count > 0);
6304 if (!--table_handler->m_ref_count) {
6305 // Last reference was released. Tear down the hash entry.
6306 const auto ret MY_ATTRIBUTE((__unused__)) =
6307 m_table_map.erase(std::string(table_handler->m_table_name));
6308 DBUG_ASSERT(ret == 1); // the hash entry must actually be found and deleted
6309 my_core::thr_lock_delete(&table_handler->m_thr_lock);
6310 my_free(table_handler);
6311 }
6312
6313 RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6314 }
6315
/* Handlerton factory callback: construct a ha_rocksdb on the given MEM_ROOT. */
static handler *rocksdb_create_handler(my_core::handlerton *const hton,
                                       my_core::TABLE_SHARE *const table_arg,
                                       my_core::MEM_ROOT *const mem_root) {
  return new (mem_root) ha_rocksdb(hton, table_arg);
}
6321
/*
  Constructor: zero/neutral-initialize all members. Real setup (key buffers,
  table definition lookup, converter) happens later in ::open().
  Note: the initializer order must match the member declaration order.
*/
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg),
      m_table_handler(nullptr),
      m_scan_it(nullptr),
      m_scan_it_skips_bloom(false),
      m_scan_it_snapshot(nullptr),
      m_scan_it_lower_bound(nullptr),
      m_scan_it_upper_bound(nullptr),
      m_tbl_def(nullptr),
      m_pk_descr(nullptr),
      m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false),
      m_pk_tuple(nullptr),
      m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr),
      m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr),
      m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr),
      m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr),
      m_pack_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE),
      m_keyread_only(false),
      m_insert_with_update(false),
      m_dup_pk_found(false),
      m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false),
      m_force_skip_unique_check(false) {}
6352
6353
/* Return the table name (without database prefix) from the MyRocks DDL entry. */
const std::string &ha_rocksdb::get_table_basename() const {
  return m_tbl_def->base_tablename();
}
6357
6358 /**
6359 @return
6360 false OK
6361 other Error inpacking the data
6362 */
6363 bool ha_rocksdb::init_with_fields() {
6364 DBUG_ENTER_FUNC();
6365
6366 const uint pk = table_share->primary_key;
6367 if (pk != MAX_KEY) {
6368 const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
6369 check_keyread_allowed(pk /*PK*/, key_parts - 1, true);
6370 } else {
6371 m_pk_can_be_decoded = false;
6372 }
6373 cached_table_flags = table_flags();
6374
6375 DBUG_RETURN(false); /* Ok */
6376 }
6377
6378 /*
6379 If the key is a TTL key, we may need to filter it out.
6380
6381 The purpose of read filtering for tables with TTL is to ensure that
6382 during a transaction a key which has expired already but not removed by
6383 compaction yet is not returned to the user.
6384
6385 Without this the user might be hit with problems such as disappearing
6386 rows within a transaction, etc, because the compaction filter ignores
6387 snapshots when filtering keys.
6388 */
6389 bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
6390 const rocksdb::Slice &ttl_rec_val,
6391 const int64_t curr_ts) {
6392 DBUG_ASSERT(kd.has_ttl());
6393 DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);
6394
6395 /*
6396 Curr_ts can only be 0 if there are no snapshots open.
6397 should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
6398 we are filtering on the write path (single INSERT/UPDATE) in which case
6399 we are passed in the current time as curr_ts.
6400
6401 In the event curr_ts is 0, we always decide not to filter the record. We
6402 also log a warning and increment a diagnostic counter.
6403 */
6404 if (curr_ts == 0) {
6405 update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
6406 return false;
6407 }
6408
6409 if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
6410 return false;
6411 }
6412
6413 Rdb_string_reader reader(&ttl_rec_val);
6414
6415 /*
6416 Find where the 8-byte ttl is for each record in this index.
6417 */
6418 uint64 ts;
6419 if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
6420 /*
6421 This condition should never be reached since all TTL records have an
6422 8 byte ttl field in front. Don't filter the record out, and log an error.
6423 */
6424 std::string buf;
6425 buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
6426 RDB_MAX_HEXDUMP_LEN);
6427 const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
6428 // NO_LINT_DEBUG
6429 sql_print_error(
6430 "Decoding ttl from PK value failed, "
6431 "for index (%u,%u), val: %s",
6432 gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
6433 DBUG_ASSERT(0);
6434 return false;
6435 }
6436
6437 /* Hide record if it has expired before the current snapshot time. */
6438 uint64 read_filter_ts = 0;
6439 #ifndef DBUG_OFF
6440 read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
6441 #endif
6442 bool is_hide_ttl =
6443 ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
6444 if (is_hide_ttl) {
6445 update_row_stats(ROWS_FILTERED);
6446
6447 /* increment examined row count when rows are skipped */
6448 THD *thd = ha_thd();
6449 thd->inc_examined_row_count(1);
6450 DEBUG_SYNC(thd, "rocksdb.ttl_rows_examined");
6451 }
6452 return is_hide_ttl;
6453 }
6454
/*
  Advance the iterator past any TTL-expired records in the scan direction.

  @param kd            key definition (TTL check only done if kd.has_ttl())
  @param iter          positioned RocksDB iterator; advanced in place
  @param seek_backward scan direction for rocksdb_smart_next()
  @return HA_EXIT_SUCCESS, or HA_ERR_QUERY_INTERRUPTED if the query was killed
*/
int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
                                             rocksdb::Iterator *const iter,
                                             bool seek_backward) {
  if (kd.has_ttl()) {
    THD *thd = ha_thd();
    while (iter->Valid() &&
           should_hide_ttl_rec(
               kd, iter->value(),
               get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
      // NOTE(review): thd is passed to DEBUG_SYNC before the null check
      // below — presumably ha_thd() cannot return nullptr here; confirm.
      DEBUG_SYNC(thd, "rocksdb.check_flags_ser");
      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }
      rocksdb_smart_next(seek_backward, iter);
    }
  }
  return HA_EXIT_SUCCESS;
}
6473
6474 #ifndef DBUG_OFF
6475 void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
6476 std::string str(on_disk_rec->data(), on_disk_rec->size());
6477 on_disk_rec->Reset();
6478 str.append("abc");
6479 on_disk_rec->PinSelf(rocksdb::Slice(str));
6480 }
6481
/* Debug-only fault injector: truncate a stored record to zero length. */
void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) {
  on_disk_rec->remove_suffix(on_disk_rec->size());
}
6485
/*
  Debug-only fault injector: replace the record with a VARCHAR image that is
  longer (12 bytes) than the column's declared VARCHAR(10), to exercise the
  corrupt-row detection in the decoder.
*/
void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
  std::string res;
  // The record is NULL-byte followed by VARCHAR(10).
  // Put the NULL-byte
  res.append("\0", 1);
  // Then, add a valid VARCHAR(12) value.
  // (length byte 0x0C = 12; the 12-byte append deliberately includes the
  // literal's terminating NUL as the 12th byte)
  res.append("\xC", 1);
  res.append("123456789ab", 12);

  on_disk_rec->Reset();
  on_disk_rec->PinSelf(rocksdb::Slice(res));
}
6498
/* Debug-only fault injector: raise an error during inplace ALTER TABLE. */
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred.", MYF(0));
}
6503 #endif
6504
/*
  Convenience overload: decode this->m_retrieved_record into buf.
  The DBUG hooks below corrupt the record in debug builds to test the
  decoder's error handling; they are no-ops in release builds.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
6516
6517 /*
6518 @brief
6519 Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
6520 storage format into buf (which can be table->record[0] or table->record[1]).
6521
6522 @param key Table record's key in mem-comparable form.
6523 @param buf Store record in table->record[0] format here
6524
6525 @detail
6526 If the table has blobs, the unpacked data in buf may keep pointers to the
6527 data in this->m_retrieved_record.
6528
6529 The key is only needed to check its checksum value (the checksum is in
6530 m_retrieved_record).
6531
6532 @seealso
6533 rdb_converter::setup_read_decoders() Sets up data structures which tell
6534 which columns to decode.
6535
6536 @return
6537 0 OK
6538 other Error inpacking the data
6539 */
6540
6541 int ha_rocksdb::convert_record_from_storage_format(
6542 const rocksdb::Slice *const key, const rocksdb::Slice *const value,
6543 uchar *const buf) {
6544 return m_converter->decode(m_pk_descr, buf, key, value);
6545 }
6546
6547 int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
6548 const Rdb_tbl_def *const tbl_def_arg,
6549 bool alloc_alter_buffers) {
6550 DBUG_ENTER_FUNC();
6551
6552 DBUG_ASSERT(m_pk_tuple == nullptr);
6553
6554 std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;
6555
6556 uint key_len = 0;
6557 uint max_packed_sk_len = 0;
6558 uint pack_key_len = 0;
6559
6560 m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
6561 if (has_hidden_pk(table_arg)) {
6562 m_pk_key_parts = 1;
6563 } else {
6564 m_pk_key_parts =
6565 table->key_info[table->s->primary_key].user_defined_key_parts;
6566 key_len = table->key_info[table->s->primary_key].key_length;
6567 }
6568
6569 // move this into get_table_handler() ??
6570 m_pk_descr->setup(table_arg, tbl_def_arg);
6571
6572 m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0)));
6573
6574 pack_key_len = m_pk_descr->max_storage_fmt_length();
6575 m_pk_packed_tuple =
6576 reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0)));
6577
6578 /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
6579 max_packed_sk_len = pack_key_len;
6580 for (uint i = 0; i < table_arg->s->keys; i++) {
6581 /* Primary key was processed above */
6582 if (i == table_arg->s->primary_key) continue;
6583
6584 // TODO: move this into get_table_handler() ??
6585 kd_arr[i]->setup(table_arg, tbl_def_arg);
6586
6587 const uint packed_len = kd_arr[i]->max_storage_fmt_length();
6588 if (packed_len > max_packed_sk_len) {
6589 max_packed_sk_len = packed_len;
6590 }
6591 }
6592
6593 m_sk_packed_tuple =
6594 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6595 m_sk_match_prefix_buf =
6596 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6597 m_sk_packed_tuple_old =
6598 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6599 m_end_key_packed_tuple =
6600 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6601 m_pack_buffer =
6602 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6603
6604 m_scan_it_lower_bound =
6605 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6606 m_scan_it_upper_bound =
6607 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6608
6609 /*
6610 If inplace alter is happening, allocate special buffers for unique
6611 secondary index duplicate checking.
6612 */
6613 if (alloc_alter_buffers) {
6614 m_dup_sk_packed_tuple =
6615 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6616 m_dup_sk_packed_tuple_old =
6617 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6618 }
6619
6620 if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr ||
6621 m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr ||
6622 m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr ||
6623 m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr ||
6624 (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr ||
6625 m_dup_sk_packed_tuple_old == nullptr))) {
6626 // One or more of the above allocations failed. Clean up and exit
6627 free_key_buffers();
6628
6629 DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6630 }
6631
6632 DBUG_RETURN(HA_EXIT_SUCCESS);
6633 }
6634
6635 void ha_rocksdb::free_key_buffers() {
6636 my_free(m_pk_tuple);
6637 m_pk_tuple = nullptr;
6638
6639 my_free(m_pk_packed_tuple);
6640 m_pk_packed_tuple = nullptr;
6641
6642 my_free(m_sk_packed_tuple);
6643 m_sk_packed_tuple = nullptr;
6644
6645 my_free(m_sk_match_prefix_buf);
6646 m_sk_match_prefix_buf = nullptr;
6647
6648 my_free(m_sk_packed_tuple_old);
6649 m_sk_packed_tuple_old = nullptr;
6650
6651 my_free(m_end_key_packed_tuple);
6652 m_end_key_packed_tuple = nullptr;
6653
6654 my_free(m_pack_buffer);
6655 m_pack_buffer = nullptr;
6656
6657 my_free(m_dup_sk_packed_tuple);
6658 m_dup_sk_packed_tuple = nullptr;
6659
6660 my_free(m_dup_sk_packed_tuple_old);
6661 m_dup_sk_packed_tuple_old = nullptr;
6662
6663 my_free(m_scan_it_lower_bound);
6664 m_scan_it_lower_bound = nullptr;
6665
6666 my_free(m_scan_it_upper_bound);
6667 m_scan_it_upper_bound = nullptr;
6668 }
6669
6670 #ifdef MARIAROCKS_NOT_YET
6671 #endif
6672 void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) {
6673 const char *const wl =
6674 whitelist ? whitelist : DEFAULT_SKIP_UNIQUE_CHECK_TABLES;
6675
6676 #if defined(HAVE_PSI_INTERFACE)
6677 Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables);
6678 #else
6679 Regex_list_handler regex_handler;
6680 #endif
6681
6682 if (!regex_handler.set_patterns(wl)) {
6683 warn_about_bad_patterns(®ex_handler, "skip_unique_check_tables");
6684 }
6685
6686 m_skip_unique_check = regex_handler.matches(m_tbl_def->base_tablename());
6687 }
6688
6689 /**
6690 @return
6691 HA_EXIT_SUCCESS OK
6692 other HA_ERR error code (can be SE-specific)
6693 */
6694 int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
6695 DBUG_ENTER_FUNC();
6696
6697 int err = close();
6698 if (err) {
6699 DBUG_RETURN(err);
6700 }
6701
6702 m_table_handler = rdb_open_tables.get_table_handler(name);
6703
6704 if (m_table_handler == nullptr) {
6705 DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6706 }
6707
6708 my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
6709 nullptr);
6710 m_io_perf.init(&m_table_handler->m_table_perf_context,
6711 &m_table_handler->m_io_perf_read,
6712 &m_table_handler->m_io_perf_write, &stats);
6713 Rdb_perf_context_guard guard(&m_io_perf,
6714 rocksdb_perf_context_level(ha_thd()));
6715
6716 std::string fullname;
6717 err = rdb_normalize_tablename(name, &fullname);
6718 if (err != HA_EXIT_SUCCESS) {
6719 DBUG_RETURN(err);
6720 }
6721
6722 m_tbl_def = ddl_manager.find(fullname);
6723 if (m_tbl_def == nullptr) {
6724 my_error(ER_INTERNAL_ERROR, MYF(0),
6725 "Attempt to open a table that is not present in RocksDB-SE data "
6726 "dictionary");
6727 DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6728 }
6729 if (m_tbl_def->m_key_count != table->s->keys + has_hidden_pk(table)? 1:0)
6730 {
6731 sql_print_error("MyRocks: DDL mismatch: .frm file has %u indexes, "
6732 "MyRocks has %u (%s hidden pk)",
6733 table->s->keys, m_tbl_def->m_key_count,
6734 has_hidden_pk(table)? "1" : "no");
6735
6736 if (rocksdb_ignore_datadic_errors)
6737 {
6738 sql_print_error("MyRocks: rocksdb_ignore_datadic_errors=1, "
6739 "trying to continue");
6740 }
6741 else
6742 {
6743 my_error(ER_INTERNAL_ERROR, MYF(0),
6744 "MyRocks: DDL mismatch. Check the error log for details");
6745 DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6746 }
6747 }
6748
6749
6750 m_lock_rows = RDB_LOCK_NONE;
6751 m_key_descr_arr = m_tbl_def->m_key_descr_arr;
6752
6753 /*
6754 Full table scan actually uses primary key
6755 (UPDATE needs to know this, otherwise it will go into infinite loop on
6756 queries like "UPDATE tbl SET pk=pk+100")
6757 */
6758 key_used_on_scan = table->s->primary_key;
6759
6760 // close() above has already called free_key_buffers(). No need to do it here.
6761 err = alloc_key_buffers(table, m_tbl_def);
6762
6763 if (err) {
6764 DBUG_RETURN(err);
6765 }
6766
6767 /*
6768 init_with_fields() is used to initialize table flags based on the field
6769 definitions in table->field[].
6770 It is called by open_binary_frm(), but that function calls the method for
6771 a temporary ha_rocksdb object which is later destroyed.
6772
6773 If we are here in ::open(), then init_with_fields() has not been called
6774 for this object. Call it ourselves, we want all member variables to be
6775 properly initialized.
6776 */
6777 init_with_fields();
6778
6779 /* Initialize decoder */
6780 m_converter = std::make_shared<Rdb_converter>(ha_thd(), m_tbl_def, table);
6781
6782 /*
6783 Update m_ttl_bytes address to same as Rdb_converter's m_ttl_bytes.
6784 Remove this code after moving convert_record_to_storage_format() into
6785 Rdb_converter class.
6786 */
6787 m_ttl_bytes = m_converter->get_ttl_bytes_buffer();
6788
6789 /*
6790 MariaDB: adjust field->part_of_key for PK columns. We can only do it here
6791 because SE API is just relying on the HA_PRIMARY_KEY_IN_READ_INDEX which
6792 does not allow to distinguish between unpack'able and non-unpack'able
6793 columns.
6794 Upstream uses handler->init_with_fields() but we don't have that call.
6795 */
6796 {
6797 if (!has_hidden_pk(table)) {
6798 KEY *const pk_info = &table->key_info[table->s->primary_key];
6799 for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
6800 if (!m_pk_descr->can_unpack(kp)) {
6801 //
6802 uint field_index= pk_info->key_part[kp].field->field_index;
6803 table->field[field_index]->part_of_key.clear_all();
6804 table->field[field_index]->part_of_key.set_bit(table->s->primary_key);
6805 }
6806 }
6807 }
6808
6809 for (uint key= 0; key < table->s->keys; key++) {
6810 KEY *const key_info = &table->key_info[key];
6811 if (key == table->s->primary_key)
6812 continue;
6813 for (uint kp = 0; kp < key_info->usable_key_parts; kp++) {
6814 uint field_index= key_info->key_part[kp].field->field_index;
6815 if (m_key_descr_arr[key]->can_unpack(kp)) {
6816 table->field[field_index]->part_of_key.set_bit(key);
6817 } else {
6818 table->field[field_index]->part_of_key.clear_bit(key);
6819 }
6820 }
6821 }
6822 }
6823
6824 info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
6825
6826 /*
6827 The following load_XXX code calls row decode functions, and they do
6828 that without having done ::external_lock() or index_init()/rnd_init().
6829 (Note: this also means we're doing a read when there was no
6830 rdb_converter::setup_field_encoders() call)
6831
6832 Initialize the necessary variables for them:
6833 */
6834
6835 /* Load auto_increment value only once on first use. */
6836 if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
6837 load_auto_incr_value();
6838 }
6839
6840 /* Load hidden pk only once on first use. */
6841 if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
6842 (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
6843 free_key_buffers();
6844 DBUG_RETURN(err);
6845 }
6846
6847 /* Index block size in MyRocks: used by MySQL in query optimization */
6848 stats.block_size = rocksdb_tbl_options->block_size;
6849
6850 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
6851 #endif
6852 /* Determine at open whether we should skip unique checks for this table */
6853 set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));
6854
6855 DBUG_RETURN(HA_EXIT_SUCCESS);
6856 }
6857
/*
  Close the table: drop references to shared DDL structures, free the key
  buffers, release our reference on the shared table handler, and free the
  reusable string buffers. Called both by the server and by ::open() when
  re-opening.
*/
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  // m_pk_descr / m_key_descr_arr are owned by m_tbl_def; just drop pointers.
  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;
  m_converter = nullptr;
  free_key_buffers();

  if (m_table_handler != nullptr) {
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_last_rowkey.free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6879
/*
  Messages for the MyRocks-specific handler error codes. The order must
  exactly match the HA_ERR_ROCKSDB_FIRST..HA_ERR_ROCKSDB_LAST code range;
  the static_assert below enforces at least the count.
*/
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY.",
    "Specifying DATA DIRECTORY for an individual table is not supported.",
    "Specifying INDEX DIRECTORY for an individual table is not supported.",
    "RocksDB commit failed.",
    "Failure during bulk load operation.",
    "Found data corruption.",
    "CRC checksum mismatch.",
    "Invalid table.",
    "Could not access RocksDB properties.",
    "File I/O error during merge/sort operation.",
    "RocksDB status: not found.",
    "RocksDB status: corruption.",
    "RocksDB status: invalid argument.",
    "RocksDB status: io error.",
    "RocksDB status: no space.",
    "RocksDB status: merge in progress.",
    "RocksDB status: incomplete.",
    "RocksDB status: shutdown in progress.",
    "RocksDB status: timed out.",
    "RocksDB status: aborted.",
    "RocksDB status: lock limit reached.",
    "RocksDB status: busy.",
    "RocksDB status: deadlock.",
    "RocksDB status: expired.",
    "RocksDB status: try again.",
};

static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes");
6911
//psergey-merge: do we need this in MariaDB: we have get_error_messages
//below...
// NOTE(review): dead code, compiled out by "#if 0"; kept for reference only.
#if 0
static const char *rdb_get_error_message(int nr) {
  return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST];
}
#endif
6919
6920 static const char **rdb_get_error_messages(int nr) { return rdb_error_messages; }
6921
/*
  Translate a handler error code into a message for the user.

  For lock-wait/deadlock/busy errors the transaction's detailed error text
  is used (and true is returned to signal a temporary error). For MyRocks
  codes the static message table is used.

  @param error  handler error code
  @param buf    message appended here
  @return true if the error is temporary, false otherwise
*/
bool ha_rocksdb::get_error_message(const int error, String *const buf) {
  DBUG_ENTER_FUNC();

  // Guard against the MyRocks code range colliding with the server's ranges.
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");

  if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK ||
      error == HA_ERR_ROCKSDB_STATUS_BUSY) {
    Rdb_transaction *const tx = get_tx_from_thd(ha_thd());
    DBUG_ASSERT(tx != nullptr);
    buf->append(tx->m_detailed_error);
    DBUG_RETURN(true);
  }

  if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
    buf->append(rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST]);
  }

  // We can be called with the values which are < HA_ERR_FIRST because most
  // MySQL internal functions will just return HA_EXIT_FAILURE in case of
  // an error.

  DBUG_RETURN(false);
}
6948
6949 /*
6950 Generalized way to convert RocksDB status errors into MySQL error code, and
6951 print error message.
6952
6953 Each error code below maps to a RocksDB status code found in:
6954 rocksdb/include/rocksdb/status.h
6955 */
6956 int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s,
6957 const char *opt_msg) {
6958 DBUG_ASSERT(!s.ok());
6959
6960 int err;
6961 switch (s.code()) {
6962 case rocksdb::Status::Code::kOk:
6963 err = HA_EXIT_SUCCESS;
6964 break;
6965 case rocksdb::Status::Code::kNotFound:
6966 err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND;
6967 break;
6968 case rocksdb::Status::Code::kCorruption:
6969 err = HA_ERR_ROCKSDB_STATUS_CORRUPTION;
6970 break;
6971 case rocksdb::Status::Code::kNotSupported:
6972 err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED;
6973 break;
6974 case rocksdb::Status::Code::kInvalidArgument:
6975 err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT;
6976 break;
6977 case rocksdb::Status::Code::kIOError:
6978 err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE
6979 : HA_ERR_ROCKSDB_STATUS_IO_ERROR;
6980 break;
6981 case rocksdb::Status::Code::kMergeInProgress:
6982 err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS;
6983 break;
6984 case rocksdb::Status::Code::kIncomplete:
6985 err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE;
6986 break;
6987 case rocksdb::Status::Code::kShutdownInProgress:
6988 err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS;
6989 break;
6990 case rocksdb::Status::Code::kTimedOut:
6991 err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT;
6992 break;
6993 case rocksdb::Status::Code::kAborted:
6994 err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT
6995 : HA_ERR_ROCKSDB_STATUS_ABORTED;
6996 break;
6997 case rocksdb::Status::Code::kBusy:
6998 err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK
6999 : HA_ERR_ROCKSDB_STATUS_BUSY;
7000 break;
7001 case rocksdb::Status::Code::kExpired:
7002 err = HA_ERR_ROCKSDB_STATUS_EXPIRED;
7003 break;
7004 case rocksdb::Status::Code::kTryAgain:
7005 err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN;
7006 break;
7007 default:
7008 DBUG_ASSERT(0);
7009 return -1;
7010 }
7011
7012 std::string errMsg;
7013 if (s.IsLockLimit()) {
7014 errMsg =
7015 "Operation aborted: Failed to acquire lock due to "
7016 "rocksdb_max_row_locks limit";
7017 } else {
7018 errMsg = s.ToString();
7019 }
7020
7021 if (opt_msg) {
7022 std::string concatenated_error = errMsg + " (" + std::string(opt_msg) + ")";
7023 my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(),
7024 rocksdb_hton_name);
7025 } else {
7026 my_error(ER_GET_ERRMSG, MYF(0), s.code(), errMsg.c_str(),
7027 rocksdb_hton_name);
7028 }
7029
7030 return err;
7031 }
7032
/* MyRocks supports only the following collations for indexed columns */
// Consulted by rdb_is_index_collation_supported() and
// rdb_field_uses_nopad_collation() below; membership is tested via the
// charset's numeric id (charset()->number).
static const std::set<uint> RDB_INDEX_COLLATIONS = {
    COLLATION_BINARY, COLLATION_UTF8_BIN, COLLATION_LATIN1_BIN};
7036
7037 static bool rdb_is_index_collation_supported(
7038 const my_core::Field *const field) {
7039 const my_core::enum_field_types type = field->real_type();
7040 /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7041 if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7042 type == MYSQL_TYPE_BLOB) {
7043
7044 return (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7045 RDB_INDEX_COLLATIONS.end()) ||
7046 rdb_is_collation_supported(field->charset());
7047 }
7048 return true;
7049 }
7050
7051
7052 static bool
7053 rdb_field_uses_nopad_collation(const my_core::Field *const field) {
7054 const my_core::enum_field_types type = field->real_type();
7055 /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7056 if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7057 type == MYSQL_TYPE_BLOB) {
7058
7059 /*
7060 This is technically a NOPAD collation but it's a binary collation
7061 that we can handle.
7062 */
7063 if (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7064 RDB_INDEX_COLLATIONS.end())
7065 return false;
7066
7067 return (field->charset()->state & MY_CS_NOPAD);
7068 }
7069 return false;
7070 }
7071
7072
7073 /*
7074 Create structures needed for storing data in rocksdb. This is called when the
7075 table is created. The structures will be shared by all TABLE* objects.
7076
7077 @param
7078 table_arg Table with definition
7079 db_table "dbname.tablename"
7080 len strlen of the above
7081 tbl_def_arg tbl_def whose key_descr is being created/populated
7082 old_tbl_def_arg tbl_def from which keys are being copied over from
7083 (for use during inplace alter)
7084
7085 @return
7086 0 - Ok
7087 other - error, either given table ddl is not supported by rocksdb or OOM.
7088 */
7089 int ha_rocksdb::create_key_defs(
7090 const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
7091 const TABLE *const old_table_arg /* = nullptr */,
7092 const Rdb_tbl_def *const old_tbl_def_arg
7093 /* = nullptr */) const {
7094 DBUG_ENTER_FUNC();
7095
7096 DBUG_ASSERT(table_arg->s != nullptr);
7097
7098 /*
7099 These need to be one greater than MAX_INDEXES since the user can create
7100 MAX_INDEXES secondary keys and no primary key which would cause us
7101 to generate a hidden one.
7102 */
7103 std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;
7104
7105 /*
7106 NOTE: All new column families must be created before new index numbers are
7107 allocated to each key definition. See below for more details.
7108 http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
7109 */
7110 if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
7111 DBUG_RETURN(HA_EXIT_FAILURE);
7112 }
7113
7114 uint64 ttl_duration = 0;
7115 std::string ttl_column;
7116 uint ttl_field_offset;
7117
7118 uint err;
7119 if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg,
7120 &ttl_duration))) {
7121 DBUG_RETURN(err);
7122 }
7123
7124 if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column,
7125 &ttl_field_offset))) {
7126 DBUG_RETURN(err);
7127 }
7128
7129 /* We don't currently support TTL on tables with hidden primary keys. */
7130 if (ttl_duration > 0 && has_hidden_pk(table_arg)) {
7131 my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0));
7132 DBUG_RETURN(HA_EXIT_FAILURE);
7133 }
7134
7135 /*
7136 If TTL duration is not specified but TTL column was specified, throw an
7137 error because TTL column requires duration.
7138 */
7139 if (ttl_duration == 0 && !ttl_column.empty()) {
7140 my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str());
7141 DBUG_RETURN(HA_EXIT_FAILURE);
7142 }
7143
7144 if (!old_tbl_def_arg) {
7145 /*
7146 old_tbl_def doesn't exist. this means we are in the process of creating
7147 a new table.
7148
7149 Get the index numbers (this will update the next_index_number)
7150 and create Rdb_key_def structures.
7151 */
7152 for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
7153 if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], cfs[i],
7154 ttl_duration, ttl_column)) {
7155 DBUG_RETURN(HA_EXIT_FAILURE);
7156 }
7157 }
7158 } else {
7159 /*
7160 old_tbl_def exists. This means we are creating a new tbl_def as part of
7161 in-place alter table. Copy over existing keys from the old_tbl_def and
7162 generate the necessary new key definitions if any.
7163 */
7164 if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
7165 old_tbl_def_arg, cfs, ttl_duration,
7166 ttl_column)) {
7167 DBUG_RETURN(HA_EXIT_FAILURE);
7168 }
7169 }
7170
7171 DBUG_RETURN(HA_EXIT_SUCCESS);
7172 }
7173
7174 /*
7175 Checks index parameters and creates column families needed for storing data
7176 in rocksdb if necessary.
7177
7178 @param in
7179 table_arg Table with definition
7180 db_table Table name
7181 tbl_def_arg Table def structure being populated
7182
7183 @param out
7184 cfs CF info for each key definition in 'key_info' order
7185
7186 @return
7187 0 - Ok
7188 other - error
7189 */
7190 int ha_rocksdb::create_cfs(
7191 const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
7192 std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
7193 DBUG_ENTER_FUNC();
7194
7195 DBUG_ASSERT(table_arg->s != nullptr);
7196
7197 char tablename_sys[NAME_LEN + 1];
7198 bool tsys_set= false;
7199
7200 /*
7201 The first loop checks the index parameters and creates
7202 column families if necessary.
7203 */
7204 for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
7205 rocksdb::ColumnFamilyHandle *cf_handle;
7206
7207 if (!is_hidden_pk(i, table_arg, tbl_def_arg) &&
7208 tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
7209 if (!tsys_set)
7210 {
7211 tsys_set= true;
7212 my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
7213 tablename_sys, sizeof(tablename_sys));
7214 }
7215
7216 for (uint part = 0; part < table_arg->key_info[i].ext_key_parts;
7217 part++)
7218 {
7219 /* MariaDB: disallow NOPAD collations */
7220 if (rdb_field_uses_nopad_collation(
7221 table_arg->key_info[i].key_part[part].field))
7222 {
7223 my_error(ER_MYROCKS_CANT_NOPAD_COLLATION, MYF(0));
7224 DBUG_RETURN(HA_EXIT_FAILURE);
7225 }
7226
7227 if (rocksdb_strict_collation_check &&
7228 !rdb_is_index_collation_supported(
7229 table_arg->key_info[i].key_part[part].field) &&
7230 !rdb_collation_exceptions->matches(tablename_sys)) {
7231
7232 char buf[1024];
7233 my_snprintf(buf, sizeof(buf),
7234 "Indexed column %s.%s uses a collation that does not "
7235 "allow index-only access in secondary key and has "
7236 "reduced disk space efficiency in primary key.",
7237 tbl_def_arg->full_tablename().c_str(),
7238 table_arg->key_info[i].key_part[part].field->field_name.str);
7239
7240 my_error(ER_INTERNAL_ERROR, MYF(ME_WARNING), buf);
7241 }
7242 }
7243 }
7244
7245 // Internal consistency check to make sure that data in TABLE and
7246 // Rdb_tbl_def structures matches. Either both are missing or both are
7247 // specified. Yes, this is critical enough to make it into SHIP_ASSERT.
7248 SHIP_ASSERT(IF_PARTITIONING(!table_arg->part_info,true) == tbl_def_arg->base_partition().empty());
7249
7250 // Generate the name for the column family to use.
7251 bool per_part_match_found = false;
7252 std::string cf_name =
7253 generate_cf_name(i, table_arg, tbl_def_arg, &per_part_match_found);
7254
7255 // Prevent create from using the system column family.
7256 if (cf_name == DEFAULT_SYSTEM_CF_NAME) {
7257 my_error(ER_WRONG_ARGUMENTS, MYF(0),
7258 "column family not valid for storing index data.");
7259 DBUG_RETURN(HA_EXIT_FAILURE);
7260 }
7261
7262 // Here's how `get_or_create_cf` will use the input parameters:
7263 //
7264 // `cf_name` - will be used as a CF name.
7265 cf_handle = cf_manager.get_or_create_cf(rdb, cf_name);
7266
7267 if (!cf_handle) {
7268 DBUG_RETURN(HA_EXIT_FAILURE);
7269 }
7270
7271 auto &cf = (*cfs)[i];
7272
7273 cf.cf_handle = cf_handle;
7274 cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str());
7275 cf.is_per_partition_cf = per_part_match_found;
7276 }
7277
7278 DBUG_RETURN(HA_EXIT_SUCCESS);
7279 }
7280
7281 /*
7282 Create key definition needed for storing data in rocksdb during ADD index
7283 inplace operations.
7284
7285 @param in
7286 table_arg Table with definition
7287 tbl_def_arg New table def structure being populated
7288 old_tbl_def_arg Old(current) table def structure
7289 cfs Struct array which contains column family information
7290
7291 @return
7292 0 - Ok
7293 other - error, either given table ddl is not supported by rocksdb or OOM.
7294 */
7295 int ha_rocksdb::create_inplace_key_defs(
7296 const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
7297 const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
7298 const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs,
7299 uint64 ttl_duration, const std::string &ttl_column) const {
7300 DBUG_ENTER_FUNC();
7301
7302 std::shared_ptr<Rdb_key_def> *const old_key_descr =
7303 old_tbl_def_arg->m_key_descr_arr;
7304 std::shared_ptr<Rdb_key_def> *const new_key_descr =
7305 tbl_def_arg->m_key_descr_arr;
7306 const std::unordered_map<std::string, uint> old_key_pos =
7307 get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
7308 old_tbl_def_arg);
7309
7310 uint i;
7311 for (i = 0; i < tbl_def_arg->m_key_count; i++) {
7312 const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));
7313
7314 if (it != old_key_pos.end()) {
7315 /*
7316 Found matching index in old table definition, so copy it over to the
7317 new one created.
7318 */
7319 const Rdb_key_def &okd = *old_key_descr[it->second];
7320
7321 const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
7322 struct Rdb_index_info index_info;
7323 if (!dict_manager.get_index_info(gl_index_id, &index_info)) {
7324 // NO_LINT_DEBUG
7325 sql_print_error(
7326 "RocksDB: Could not get index information "
7327 "for Index Number (%u,%u), table %s",
7328 gl_index_id.cf_id, gl_index_id.index_id,
7329 old_tbl_def_arg->full_tablename().c_str());
7330 DBUG_RETURN(HA_EXIT_FAILURE);
7331 }
7332
7333 uint32 ttl_rec_offset =
7334 Rdb_key_def::has_index_flag(index_info.m_index_flags,
7335 Rdb_key_def::TTL_FLAG)
7336 ? Rdb_key_def::calculate_index_flag_offset(
7337 index_info.m_index_flags, Rdb_key_def::TTL_FLAG)
7338 : UINT_MAX;
7339
7340 /*
7341 We can't use the copy constructor because we need to update the
7342 keynr within the pack_info for each field and the keyno of the keydef
7343 itself.
7344 */
7345 new_key_descr[i] = std::make_shared<Rdb_key_def>(
7346 okd.get_index_number(), i, okd.get_cf(),
7347 index_info.m_index_dict_version, index_info.m_index_type,
7348 index_info.m_kv_version, okd.m_is_reverse_cf,
7349 okd.m_is_per_partition_cf, okd.m_name.c_str(),
7350 dict_manager.get_stats(gl_index_id), index_info.m_index_flags,
7351 ttl_rec_offset, index_info.m_ttl_duration);
7352 } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
7353 cfs[i], ttl_duration, ttl_column)) {
7354 DBUG_RETURN(HA_EXIT_FAILURE);
7355 }
7356
7357 DBUG_ASSERT(new_key_descr[i] != nullptr);
7358 new_key_descr[i]->setup(table_arg, tbl_def_arg);
7359 }
7360
7361 DBUG_RETURN(HA_EXIT_SUCCESS);
7362 }
7363
/*
  Build a map from key name to its position in old_tbl_def_arg's key array,
  containing only the old keys that can be reused as-is by an inplace alter.
  Keys whose definition differs between old and new tables (different key
  parts, algorithm, flags or comment) are deliberately left out so that
  create_inplace_key_defs() builds fresh definitions for them.
*/
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  // Index the new table's key names so old keys can be matched by name.
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    // A hidden PK is always carried over; it has no user-visible definition
    // that could have changed.
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name.str);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    // compare_keys() returns non-zero when the keys differ; a difference is
    // tolerated only for the unique -> non-unique transition.
    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    old_key_pos[old_key->name.str] = i;
  }

  DBUG_RETURN(old_key_pos);
}
7429
7430 /* Check to see if two keys are identical. */
7431 int ha_rocksdb::compare_keys(const KEY *const old_key,
7432 const KEY *const new_key) const {
7433 DBUG_ENTER_FUNC();
7434
7435 /* Check index name. */
7436 if (strcmp(old_key->name.str, new_key->name.str) != 0) {
7437 DBUG_RETURN(HA_EXIT_FAILURE);
7438 }
7439
7440 /* If index algorithms are different then keys are different. */
7441 if (old_key->algorithm != new_key->algorithm) {
7442 DBUG_RETURN(HA_EXIT_FAILURE);
7443 }
7444
7445 /* Check that the key is identical between old and new tables. */
7446 if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) {
7447 DBUG_RETURN(HA_EXIT_FAILURE);
7448 }
7449
7450 /* Check index comment. (for column family changes) */
7451 std::string old_comment(old_key->comment.str, old_key->comment.length);
7452 std::string new_comment(new_key->comment.str, new_key->comment.length);
7453 if (old_comment.compare(new_comment) != 0) {
7454 DBUG_RETURN(HA_EXIT_FAILURE);
7455 }
7456
7457 DBUG_RETURN(HA_EXIT_SUCCESS);
7458 }
7459
7460 /* Check two keys to ensure that key parts within keys match */
7461 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
7462 const KEY *const new_key) const {
7463 DBUG_ENTER_FUNC();
7464
7465 /* Skip if key parts do not match, as it is a different key */
7466 if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
7467 DBUG_RETURN(HA_EXIT_FAILURE);
7468 }
7469
7470 /* Check to see that key parts themselves match */
7471 for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
7472 if (strcmp(old_key->key_part[i].field->field_name.str,
7473 new_key->key_part[i].field->field_name.str) != 0) {
7474 DBUG_RETURN(HA_EXIT_FAILURE);
7475 }
7476
7477 /* Check if prefix index key part length has changed */
7478 if (old_key->key_part[i].length != new_key->key_part[i].length) {
7479 DBUG_RETURN(HA_EXIT_FAILURE);
7480 }
7481 }
7482
7483 DBUG_RETURN(HA_EXIT_SUCCESS);
7484 }
7485
7486 /*
7487 Create key definition needed for storing data in rocksdb.
7488 This can be called either during CREATE table or doing ADD index operations.
7489
7490 @param in
7491 table_arg Table with definition
7492 i Position of index being created inside table_arg->key_info
7493 tbl_def_arg Table def structure being populated
7494 cf_info Struct which contains column family information
7495
7496 @param out
7497 new_key_def Newly created index definition.
7498
7499 @return
7500 0 - Ok
7501 other - error, either given table ddl is not supported by rocksdb or OOM.
7502 */
7503 int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint i,
7504 const Rdb_tbl_def *const tbl_def_arg,
7505 std::shared_ptr<Rdb_key_def> *const new_key_def,
7506 const struct key_def_cf_info &cf_info,
7507 uint64 ttl_duration,
7508 const std::string &ttl_column) const {
7509 DBUG_ENTER_FUNC();
7510
7511 DBUG_ASSERT(*new_key_def == nullptr);
7512
7513 const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
7514 const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
7515 uchar index_type;
7516 uint16_t kv_version;
7517
7518 if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
7519 index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
7520 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7521 } else if (i == table_arg->s->primary_key) {
7522 index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
7523 uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7524 kv_version = pk_latest_version;
7525 } else {
7526 index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
7527 uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
7528 kv_version = sk_latest_version;
7529 }
7530
7531 // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as
7532 // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a
7533 // primary key or secondary key.
7534 DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT", {
7535 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1;
7536 });
7537
7538 DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", {
7539 if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) {
7540 kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2;
7541 }
7542 });
7543
7544 uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0);
7545
7546 uint32 ttl_rec_offset =
7547 Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG)
7548 ? Rdb_key_def::calculate_index_flag_offset(index_flags,
7549 Rdb_key_def::TTL_FLAG)
7550 : UINT_MAX;
7551
7552 const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
7553 *new_key_def = std::make_shared<Rdb_key_def>(
7554 index_id, i, cf_info.cf_handle, index_dict_version, index_type,
7555 kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name,
7556 Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration);
7557
7558 if (!ttl_column.empty()) {
7559 (*new_key_def)->m_ttl_column = ttl_column;
7560 }
7561 // initialize key_def
7562 (*new_key_def)->setup(table_arg, tbl_def_arg);
7563 DBUG_RETURN(HA_EXIT_SUCCESS);
7564 }
7565
7566 int rdb_normalize_tablename(const std::string &tablename,
7567 std::string *const strbuf) {
7568 if (tablename.size() < 2 || tablename[0] != '.' ||
7569 (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) {
7570 DBUG_ASSERT(0); // We were not passed table name?
7571 return HA_ERR_ROCKSDB_INVALID_TABLE;
7572 }
7573
7574 size_t pos = tablename.find_first_of(FN_LIBCHAR, 2);
7575 if (pos == std::string::npos) {
7576 pos = tablename.find_first_of(FN_LIBCHAR2, 2);
7577 }
7578
7579 if (pos == std::string::npos) {
7580 DBUG_ASSERT(0); // We were not passed table name?
7581 return HA_ERR_ROCKSDB_INVALID_TABLE;
7582 }
7583
7584 *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
7585
7586 return HA_EXIT_SUCCESS;
7587 }
7588
7589 /*
7590 Check to see if the user's original statement includes foreign key
7591 references
7592 */
7593 bool ha_rocksdb::contains_foreign_key(THD *const thd) {
7594 bool success;
7595 const char *str = thd_query_string(thd)->str;
7596
7597 DBUG_ASSERT(str != nullptr);
7598
7599 while (*str != '\0') {
7600 // Scan from our current pos looking for 'FOREIGN'
7601 str = rdb_find_in_string(str, "FOREIGN", &success);
7602 if (!success) {
7603 return false;
7604 }
7605
7606 // Skip past the found "FOREIGN'
7607 str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
7608 DBUG_ASSERT(success);
7609
7610 if (!my_isspace(&my_charset_bin, *str)) {
7611 return false;
7612 }
7613
7614 // See if the next token is 'KEY'
7615 str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
7616 if (!success) {
7617 continue;
7618 }
7619
7620 // See if the next token is '('
7621 str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7622 if (!success) {
7623 // There is an optional index id after 'FOREIGN KEY', skip it
7624 str = rdb_skip_id(&my_charset_bin, str);
7625
7626 // Now check for '(' again
7627 str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7628 }
7629
7630 // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
7631 // a foreign key clause.
7632 return success;
7633 }
7634
7635 // We never found a valid foreign key clause
7636 return false;
7637 }
7638
7639 /**
7640 @brief
7641 splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
7642 the <dbname>, <tablename> and <part_no> components.
7643
7644 @param dbbuf returns database name/table_schema
7645 @param tablebuf returns tablename
7646 @param partitionbuf returns partition suffix if there is one
7647 @return HA_EXIT_SUCCESS on success, non-zero on failure to split
7648 */
7649 int rdb_split_normalized_tablename(const std::string &fullname,
7650 std::string *const db,
7651 std::string *const table,
7652 std::string *const partition) {
7653 DBUG_ASSERT(!fullname.empty());
7654
7655 #define RDB_PARTITION_STR "#P#"
7656
7657 /* Normalize returns dbname.tablename. */
7658 size_t dotpos = fullname.find('.');
7659
7660 /* Invalid table name? */
7661 if (dotpos == std::string::npos) {
7662 return HA_ERR_ROCKSDB_INVALID_TABLE;
7663 }
7664
7665 // Table must have a database name associated with it.
7666 DBUG_ASSERT(dotpos > 0);
7667
7668 if (db != nullptr) {
7669 *db = fullname.substr(0, dotpos);
7670 }
7671
7672 dotpos++;
7673
7674 const size_t partpos =
7675 fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
7676
7677 if (partpos != std::string::npos) {
7678 DBUG_ASSERT(partpos >= dotpos);
7679
7680 if (table != nullptr) {
7681 *table = fullname.substr(dotpos, partpos - dotpos);
7682 }
7683
7684 if (partition != nullptr) {
7685 *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
7686 }
7687 } else if (table != nullptr) {
7688 *table = fullname.substr(dotpos);
7689 }
7690
7691 return HA_EXIT_SUCCESS;
7692 }
7693
7694 /*
7695 Create a table's Rdb_tbl_def and its Rdb_key_defs and store table information
7696 into MyRocks Data Dictionary
7697 The method is called during create table/partition, truncate table/partition
7698
7699 @param table_name IN table's name formated as
7700 'dbname.tablename'
7701 @param table_arg IN sql table
7702 @param auto_increment_value IN specified table's auto increment value
7703
7704 @return
7705 HA_EXIT_SUCCESS OK
7706 other HA_ERR error code (can be SE-specific)
7707 */
7708 int ha_rocksdb::create_table(const std::string &table_name,
7709 const TABLE *table_arg,
7710 ulonglong auto_increment_value) {
7711 DBUG_ENTER_FUNC();
7712
7713
7714
7715 int err;
7716
7717 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
7718 rocksdb::WriteBatch *const batch = wb.get();
7719
7720 /* Create table/key descriptions and put them into the data dictionary */
7721 m_tbl_def = new Rdb_tbl_def(table_name);
7722
7723 uint n_keys = table_arg->s->keys;
7724
7725 /*
7726 If no primary key found, create a hidden PK and place it inside table
7727 definition
7728 */
7729 if (has_hidden_pk(table_arg)) {
7730 n_keys += 1;
7731 // reset hidden pk id
7732 // the starting valid value for hidden pk is 1
7733 m_tbl_def->m_hidden_pk_val = 1;
7734 }
7735
7736 m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
7737 m_tbl_def->m_key_count = n_keys;
7738 m_tbl_def->m_key_descr_arr = m_key_descr_arr;
7739
7740 err = create_key_defs(table_arg, m_tbl_def);
7741 if (err != HA_EXIT_SUCCESS) {
7742 goto error;
7743 }
7744
7745 m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
7746
7747 if (auto_increment_value) {
7748 bool autoinc_upgrade_test = false;
7749 m_tbl_def->m_auto_incr_val = auto_increment_value;
7750 DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;);
7751 if (!autoinc_upgrade_test) {
7752 auto s = dict_manager.put_auto_incr_val(
7753 batch, m_tbl_def->get_autoincr_gl_index_id(),
7754 m_tbl_def->m_auto_incr_val);
7755 if (!s.ok()) {
7756 goto error;
7757 }
7758 }
7759 }
7760
7761 dict_manager.lock();
7762 err = ddl_manager.put_and_write(m_tbl_def, batch);
7763 if (err != HA_EXIT_SUCCESS) {
7764 dict_manager.unlock();
7765 goto error;
7766 }
7767
7768 err = dict_manager.commit(batch);
7769 if (err != HA_EXIT_SUCCESS) {
7770 dict_manager.unlock();
7771 goto error;
7772 }
7773
7774 dict_manager.unlock();
7775
7776 DBUG_RETURN(HA_EXIT_SUCCESS);
7777
7778 error:
7779 /* Delete what we have allocated so far */
7780 delete m_tbl_def;
7781 m_tbl_def = nullptr;
7782 m_key_descr_arr = nullptr;
7783
7784 DBUG_RETURN(err);
7785 }
7786
7787 /**
7788 @brief
7789 create() is called to create a table. The variable name will have the name
7790 of the table.
7791
7792 @details
7793 When create() is called you do not need to worry about
7794 opening the table. Also, the .frm file will have already been
7795 created so adjusting create_info is not necessary. You can overwrite
7796 the .frm file at this point if you wish to change the table
7797 definition, but there are no methods currently provided for doing
7798 so.
7799
7800 Called from handle.cc by ha_create_table().
7801
7802 @return
7803 HA_EXIT_SUCCESS OK
7804 other HA_ERR error code (can be SE-specific)
7805
7806 @see
7807 ha_create_table() in handle.cc
7808 */
7809
7810 int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
7811 HA_CREATE_INFO *const create_info) {
7812 DBUG_ENTER_FUNC();
7813
7814 DBUG_ASSERT(table_arg != nullptr);
7815 DBUG_ASSERT(create_info != nullptr);
7816
7817 if (create_info->data_file_name) {
7818 // DATA DIRECTORY is used to create tables under a specific location
7819 // outside the MySQL data directory. We don't support this for MyRocks.
7820 // The `rocksdb_datadir` setting should be used to configure RocksDB data
7821 // directory.
7822 DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED);
7823 }
7824
7825 if (create_info->index_file_name) {
7826 // Similar check for INDEX DIRECTORY as well.
7827 DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED);
7828 }
7829
7830 int err;
7831 /*
7832 Construct dbname.tablename ourselves, because parititioning
7833 passes strings like "./test/t14#P#p0" for individual partitions,
7834 while table_arg->s->table_name has none of that.
7835 */
7836 std::string str;
7837 err = rdb_normalize_tablename(name, &str);
7838 if (err != HA_EXIT_SUCCESS) {
7839 DBUG_RETURN(err);
7840 }
7841
7842 // FOREIGN KEY isn't supported yet
7843 THD *const thd = my_core::thd_get_current_thd();
7844 if (contains_foreign_key(thd)) {
7845 my_error(ER_NOT_SUPPORTED_YET, MYF(0),
7846 "FOREIGN KEY for the RocksDB storage engine");
7847 DBUG_RETURN(HA_ERR_UNSUPPORTED);
7848 }
7849
7850 // Check whether Data Dictionary contain information
7851 Rdb_tbl_def *tbl = ddl_manager.find(str);
7852 if (tbl != nullptr) {
7853 if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
7854 err = delete_table(tbl);
7855 if (err != HA_EXIT_SUCCESS) {
7856 DBUG_RETURN(err);
7857 }
7858 } else {
7859 my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name);
7860 DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
7861 }
7862 }
7863
7864 // The below adds/clears hooks in RocksDB sync points. There's no reason for
7865 // this code to be in ::create() but it needs to be somewhere where it is
7866 // away from any tight loops and where one can invoke it from mtr:
7867 DBUG_EXECUTE_IF("rocksdb_enable_delay_commits",
7868 {
7869 auto syncpoint= rocksdb::SyncPoint::GetInstance();
7870 syncpoint->SetCallBack("DBImpl::WriteImpl:BeforeLeaderEnters",
7871 [&](void* /*arg*/) {my_sleep(500);} );
7872 syncpoint->EnableProcessing();
7873 push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
7874 "enable_delay_commits_mode ON");
7875
7876 });
7877 DBUG_EXECUTE_IF("rocksdb_disable_delay_commits",
7878 {
7879 auto syncpoint= rocksdb::SyncPoint::GetInstance();
7880 syncpoint->ClearCallBack("DBImpl::WriteImpl:BeforeLeaderEnters");
7881 syncpoint->DisableProcessing();
7882 push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
7883 "enable_delay_commits_mode OFF");
7884 });
7885
7886 DBUG_RETURN(create_table(str, table_arg, create_info->auto_increment_value));
7887 }
7888
7889 /**
7890 @note
7891 This function is used only when the table has not yet been opened, and
7892 keyread_allowed bitmap doesn't have the correct values yet.
7893
7894 See comment in ha_rocksdb::index_flags() for details.
7895 */
7896
7897 bool ha_rocksdb::check_keyread_allowed(uint inx, uint part,
7898 bool all_parts) const {
7899 bool res = true;
7900 KEY *const key_info = &table_share->key_info[inx];
7901
7902 Rdb_field_packing dummy1;
7903 res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
7904 key_info->key_part[part].length);
7905
7906 if (res && all_parts) {
7907 for (uint i = 0; i < part; i++) {
7908 Field *field;
7909 if ((field = key_info->key_part[i].field)) {
7910 Rdb_field_packing dummy;
7911 if (!dummy.setup(nullptr, field, inx, i,
7912 key_info->key_part[i].length)) {
7913 /* Cannot do index-only reads for this column */
7914 res = false;
7915 break;
7916 }
7917 }
7918 }
7919 }
7920
7921 const uint pk = table_share->primary_key;
7922 if (inx == pk && all_parts &&
7923 part + 1 == table_share->key_info[pk].user_defined_key_parts) {
7924 m_pk_can_be_decoded = res;
7925 }
7926
7927 return res;
7928 }
7929
/*
  Position 'iter' on the first record whose index tuple equals key_slice
  (key_slice may be a key prefix), skipping records hidden by TTL.

  @return HA_EXIT_SUCCESS when positioned on a matching, visible record;
          HA_ERR_KEY_NOT_FOUND when no such record exists;
          HA_ERR_QUERY_INTERRUPTED when the statement was killed.
*/
int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
                               rocksdb::Iterator *const iter,
                               const bool /* unused */,
                               const rocksdb::Slice &key_slice,
                               const int64_t ttl_filter_ts) {
  THD *thd = ha_thd();
  /*
    We are looking for the first record such that
    index_tuple= lookup_tuple.
    lookup_tuple may be a prefix of the index.
  */
  rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);

  // Walk forward while records still carry the lookup prefix.
  while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
    if (thd && thd->killed) {
      return HA_ERR_QUERY_INTERRUPTED;
    }
    /*
      If TTL is enabled we need to check if the given key has already expired
      from the POV of the current transaction. If it has, try going to the next
      key.
    */
    if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
      rocksdb_smart_next(kd.m_is_reverse_cf, iter);
      continue;
    }

    return HA_EXIT_SUCCESS;
  }

  /*
    Got a record that is not equal to the lookup value, or even a record
    from another table.index.
  */
  return HA_ERR_KEY_NOT_FOUND;
}
7966
/*
  Position m_scan_it on the record with the biggest key strictly smaller
  than key_slice, skipping exact matches (when full_key_match) and records
  hidden by TTL.

  @return HA_EXIT_SUCCESS when positioned on a suitable record;
          HA_ERR_KEY_NOT_FOUND when no such record exists;
          HA_ERR_QUERY_INTERRUPTED when the statement was killed.
*/
int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
                                const bool full_key_match,
                                const rocksdb::Slice &key_slice,
                                const int64_t ttl_filter_ts) {
  THD *thd = ha_thd();
  /*
    We are looking for record with the biggest t.key such that
    t.key < lookup_tuple.
  */
  rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);

  while (is_valid(m_scan_it)) {
    if (thd && thd->killed) {
      return HA_ERR_QUERY_INTERRUPTED;
    }
    /*
      We are using full key and we've hit an exact match, or...

      If TTL is enabled we need to check if the given key has already expired
      from the POV of the current transaction. If it has, try going to the next
      key.

      Either condition means the current record does not qualify: step to the
      previous one (in lookup order) and re-check.
    */
    if ((full_key_match &&
         kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
        (kd.has_ttl() &&
         should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
      rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
      continue;
    }

    return HA_EXIT_SUCCESS;
  }

  return HA_ERR_KEY_NOT_FOUND;
}
8002
8003 int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
8004 const rocksdb::Slice &key_slice,
8005 const int64_t ttl_filter_ts) {
8006 THD *thd = ha_thd();
8007 /*
8008 We are looking for the first record such that
8009
8010 index_tuple $GT lookup_tuple
8011
8012 with HA_READ_AFTER_KEY, $GT = '>',
8013 with HA_READ_KEY_OR_NEXT, $GT = '>='
8014 */
8015 rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
8016
8017 /*
8018 If TTL is enabled we need to check if the given key has already expired
8019 from the POV of the current transaction. If it has, try going to the next
8020 key.
8021 */
8022 while (is_valid(m_scan_it) && kd.has_ttl() &&
8023 should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
8024 if (thd && thd->killed) {
8025 return HA_ERR_QUERY_INTERRUPTED;
8026 }
8027 rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
8028 }
8029
8030 return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
8031 }
8032
/*
  Move m_scan_it to the exact record the SQL-layer search flag asks for.

  @param kd             key definition of the index being scanned
  @param find_flag      SQL-layer search mode (exact / before / after / ...)
  @param full_key_match true if the lookup tuple covers all key parts
  @param key            mysql-format lookup key; used to re-pack the lookup
                        prefix for the HA_READ_PREFIX_LAST check
  @param keypart_map    bitmap of key parts present in 'key'
  @param key_slice      the packed lookup tuple
  @param move_forward   OUT: direction in which subsequent next/prev calls
                        should move the iterator
  @param ttl_filter_ts  timestamp used to filter out expired TTL records

  @return HA_EXIT_SUCCESS, HA_ERR_KEY_NOT_FOUND, HA_ERR_UNSUPPORTED, or
          another HA_ERR code
*/
int ha_rocksdb::position_to_correct_key(
    const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
    const bool full_key_match, const uchar *const key,
    const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
    bool *const move_forward, const int64_t ttl_filter_ts) {
  int rc = 0;

  /* By default, subsequent reads move the iterator forward. */
  *move_forward = true;

  switch (find_flag) {
    case HA_READ_KEY_EXACT:
      rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice,
                          ttl_filter_ts);
      break;
    case HA_READ_BEFORE_KEY:
      *move_forward = false;
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_AFTER_KEY:
    case HA_READ_KEY_OR_NEXT:
      rc = read_after_key(kd, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_KEY_OR_PREV:
    case HA_READ_PREFIX:
      /* This flag is not used by the SQL layer, so we don't support it yet. */
      rc = HA_ERR_UNSUPPORTED;
      break;
    case HA_READ_PREFIX_LAST:
    case HA_READ_PREFIX_LAST_OR_PREV:
      *move_forward = false;
      /*
        Find the last record with the specified index prefix lookup.
        - HA_READ_PREFIX_LAST requires that the record has the
          prefix=lookup (if there are no such records,
          HA_ERR_KEY_NOT_FOUND should be returned).
        - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
          records with prefix=lookup, we should return the last record
          before that.
      */
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0) {
        const rocksdb::Slice &rkey = m_scan_it->key();
        if (!kd.covers_key(rkey)) {
          /* The record we've got is not from this index */
          rc = HA_ERR_KEY_NOT_FOUND;
        } else if (find_flag == HA_READ_PREFIX_LAST) {
          /* Re-pack the original lookup prefix so we can compare with it. */
          uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_sk_packed_tuple, key, keypart_map);
          rocksdb::Slice lookup_tuple(
              reinterpret_cast<char *>(m_sk_packed_tuple), size);

          // We need to compare the key we've got with the original search
          // prefix.
          if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
            rc = HA_ERR_KEY_NOT_FOUND;
          }
        }
      }
      break;
    default:
      DBUG_ASSERT(0);
      break;
  }

  return rc;
}
8107
8108 int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
8109 const enum ha_rkey_function &find_flag,
8110 const rocksdb::Slice &slice,
8111 const int bytes_changed_by_succ,
8112 const key_range *const end_key,
8113 uint *const end_key_packed_size) {
8114 if (find_flag == HA_READ_KEY_EXACT) return slice.size();
8115
8116 if (find_flag == HA_READ_PREFIX_LAST) {
8117 /*
8118 We have made the kd.successor(m_sk_packed_tuple) call above.
8119
8120 The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
8121 */
8122 return slice.size() - bytes_changed_by_succ;
8123 }
8124
8125 if (end_key) {
8126 *end_key_packed_size =
8127 kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
8128 end_key->key, end_key->keypart_map);
8129
8130 /*
8131 Calculating length of the equal conditions here. 4 byte index id is
8132 included.
8133 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8134 WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
8135 WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
8136 Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
8137 WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
8138 */
8139 rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
8140 *end_key_packed_size);
8141 return slice.difference_offset(end_slice);
8142 }
8143
8144 /*
8145 On range scan without any end key condition, there is no
8146 eq cond, and eq cond length is the same as index_id size (4 bytes).
8147 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
8148 WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
8149 */
8150 return Rdb_key_def::INDEX_NUMBER_SIZE;
8151 }
8152
8153 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
8154 int rc;
8155 const rocksdb::Slice &rkey = m_scan_it->key();
8156 const uint pk_size = rkey.size();
8157 const char *pk_data = rkey.data();
8158
8159 memcpy(m_pk_packed_tuple, pk_data, pk_size);
8160 m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
8161
8162 if (m_lock_rows != RDB_LOCK_NONE) {
8163 /* We need to put a lock and re-read */
8164 rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
8165 } else {
8166 /* Unpack from the row we've read */
8167 const rocksdb::Slice &value = m_scan_it->value();
8168 rc = convert_record_from_storage_format(&rkey, &value, buf);
8169 }
8170
8171 return rc;
8172 }
8173
/*
  Read the row for the secondary-index position m_scan_it currently points
  to, into 'buf'.  If the index entry covers all columns the query needs
  (and no row lock is required), the row is unpacked directly from the
  index entry; otherwise the matching index record is located (honouring a
  pushed index condition, if any) and the full row is fetched by rowid.

  @param buf           OUT: the mysql-format record
  @param kd            key definition of the secondary index
  @param move_forward  scan direction used when skipping non-matching
                       records for ICP

  @return 0 on success, or an HA_ERR error code
*/
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  int rc = 0;
  uint pk_size;

  /* Get the key columns and primary key value */
  const rocksdb::Slice &rkey = m_scan_it->key();
  const rocksdb::Slice &value = m_scan_it->value();

#ifndef DBUG_OFF
  /* Save m_keyread_only so the debug override below can be undone. */
  bool save_keyread_only = m_keyread_only;
#endif
  DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });

  /* Covered if the index alone can satisfy the read of the needed columns. */
  bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) ||
                        kd.covers_lookup(&value, &m_lookup_bitmap);

#ifndef DBUG_OFF
  m_keyread_only = save_keyread_only;
#endif

  if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
    /* Index-only read: extract the rowid and unpack from the index entry. */
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    if (pk_size == RDB_INVALID_KEY_LEN) {
      rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
    } else {
      rc = kd.unpack_record(table, buf, &rkey, &value,
                            m_converter->get_verify_row_debug_checksums());
      global_stats.covered_secondary_key_lookups.inc();
    }
  } else {
    /* In a reverse column family the physical scan direction is flipped. */
    if (kd.m_is_reverse_cf) move_forward = !move_forward;

    /* Skip records that do not satisfy the pushed index condition (if any). */
    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
      } else {
        /* Fetch the full row from the primary key. */
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    /* Remember the rowid for position()/rnd_pos(). */
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
8229
8230 /**
8231 @note
8232 The problem with this function is that SQL layer calls it, when
8233 - the table has not been yet opened (no ::open() call done)
8234 - this->table_share already exists, but it is in the process of being
8235 filled, so some of fields are still NULL.
8236 - In particular, table_share->key_info[inx].key_part[] is filled only up
8237 to part #part. Subsequent key parts are not yet filled.
8238
8239 To complicate things further, SQL layer will call index_flags() with
8240 all_parts=TRUE. Essentially, we're asked to provide flags for reading
8241 keyparts whose datatype is not yet known.
8242
  We work around this problem by using check_keyread_allowed(), which uses
  the table_share object and is careful not to step on uninitialized data.
8245
8246 When we get a call with all_parts=TRUE, we try to analyze all parts but
8247 ignore those that have key_part->field==nullptr (these are not initialized
8248 yet).
8249 */
8250
8251 ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
8252 DBUG_ENTER_FUNC();
8253
8254 ulong base_flags = HA_READ_NEXT | // doesn't seem to be used
8255 HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
8256
8257 if (check_keyread_allowed(inx, part, all_parts)) {
8258 base_flags |= HA_KEYREAD_ONLY;
8259 }
8260
8261 if (inx == table_share->primary_key) {
8262 /*
8263 Index-only reads on primary key are the same as table scan for us. Still,
8264 we need to explicitly "allow" them, otherwise SQL layer will miss some
8265 plans.
8266 */
8267 base_flags |= HA_KEYREAD_ONLY | HA_CLUSTERED_INDEX;
8268 } else {
8269 /*
8270 We can Index Condition Pushdown any key except the primary. With primary
8271 key, we get (pk, record) pair immediately, there is no place to put the
8272 ICP check.
8273 */
8274 base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
8275 }
8276
8277 DBUG_RETURN(base_flags);
8278 }
8279
8280 /**
8281 @brief
8282 Read next index tuple through the secondary index.
8283
8284 @details
8285 m_scan_it points at the index key-value pair that we should read the (pk,row)
8286 pair for.
8287 */
int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
  DBUG_ASSERT(table != nullptr);
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  /* Use STATUS_NOT_FOUND when record not found or some error occurred */
  table->status = STATUS_NOT_FOUND;

  if (is_valid(m_scan_it)) {
    rocksdb::Slice key = m_scan_it->key();

    /* Check if we've ran out of records of this index */
    if (m_key_descr_arr[keyno]->covers_key(key)) {
      int rc = 0;

      // TODO: We could here check if we have ran out of range we're scanning
      /* Extract the rowid (packed PK) that is embedded in the index entry. */
      const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
          table, *m_pk_descr, &key, m_pk_packed_tuple);
      if (size == RDB_INVALID_KEY_LEN) {
        return HA_ERR_ROCKSDB_CORRUPT_DATA;
      }

      /* Remember the rowid for position()/rnd_pos(). */
      m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
                         &my_charset_bin);

      rocksdb::Slice value = m_scan_it->value();
      /*
        Index-only read is possible when the index entry has all the columns
        the query needs and we are not doing a locking read; otherwise fetch
        the full row by rowid.
      */
      bool covered_lookup =
          (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
          m_key_descr_arr[keyno]->covers_lookup(&value, &m_lookup_bitmap);
      if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
        rc = m_key_descr_arr[keyno]->unpack_record(
            table, buf, &key, &value,
            m_converter->get_verify_row_debug_checksums());
        global_stats.covered_secondary_key_lookups.inc();
      } else {
        DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
      }

      if (!rc) {
        table->status = 0;
#ifdef MARIAROCKS_NOT_YET
        stats.rows_read++;
        stats.rows_index_next++;
#endif
        update_row_stats(ROWS_READ);
      }
      return rc;
    }
  }
  return HA_ERR_END_OF_FILE;
}
8340
8341 bool ha_rocksdb::is_using_full_key(key_part_map keypart_map,
8342 uint actual_key_parts)
8343 {
8344 return (keypart_map == HA_WHOLE_KEY) ||
8345 (keypart_map == ((key_part_map(1) << actual_key_parts)
8346 - 1));
8347 }
8348
8349 /*
8350 ha_rocksdb::read_range_first overrides handler::read_range_first.
8351 The only difference from handler::read_range_first is that
8352 ha_rocksdb::read_range_first passes end_key to
8353 ha_rocksdb::index_read_map_impl function.
8354
8355 @return
8356 HA_EXIT_SUCCESS OK
8357 other HA_ERR error code (can be SE-specific)
8358 */
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  int result;

  eq_range = eq_range_arg;

  /* MariaDB: Pass RANGE_SCAN_ASC when we support ICP on backwards scans */
  set_end_range(end_key);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key) {
    // Read first record
    result = ha_index_first(table->record[0]);
  } else {
#ifdef MARIAROCKS_NOT_YET
    if (is_using_prohibited_gap_locks(
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }
#endif
    increment_statistics(&SSV::ha_read_key_count);

    /*
      Unlike handler::read_range_first(), pass end_key down to
      index_read_map_impl() so it can be used for bloom filter decisions.
    */
    MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
      result =
          index_read_map_impl(table->record[0], start_key->key,
                              start_key->keypart_map, start_key->flag, end_key);
    })
  }
  if (result) {
    /* "key not found" on the first read of a range means "empty range". */
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
  }

  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
8407
8408 int ha_rocksdb::prepare_index_scan()
8409 {
8410 range_key_part= table->key_info[active_index].key_part;
8411 set_end_range(NULL);
8412 return 0;
8413 }
8414
8415
8416 int ha_rocksdb::prepare_range_scan(const key_range *start_key,
8417 const key_range *end_key)
8418 {
8419 range_key_part= table->key_info[active_index].key_part;
8420
8421 if (start_key)
8422 {
8423 m_save_start_range= *start_key;
8424 m_start_range= &m_save_start_range;
8425 }
8426 else
8427 m_start_range= NULL;
8428
8429 set_end_range(end_key);
8430 return 0;
8431 }
8432
8433
8434 /**
8435 @return
8436 HA_EXIT_SUCCESS OK
8437 other HA_ERR error code (can be SE-specific)
8438 */
8439
8440 int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
8441 key_part_map keypart_map,
8442 enum ha_rkey_function find_flag) {
8443 DBUG_ENTER_FUNC();
8444
8445 DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
8446 }
8447
8448 /*
8449 See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
8450 index navigation commands are converted into RocksDB lookup commands.
8451
  This function takes end_key as an argument, and it is set on range scans.
  MyRocks needs to decide whether the prefix bloom filter can be used, and
  for that it must compute the equal-condition length. On equality lookups
  (find_flag == HA_READ_KEY_EXACT), the equal-condition length is simply
  rocksdb::Slice.size() of the start key. On range scans, it is the length
  of the common prefix of the start_key and end_key rocksdb::Slice values.
8460
8461 @return
8462 HA_EXIT_SUCCESS OK
8463 other HA_ERR error code (can be SE-specific)
8464 */
int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map,
                                    enum ha_rkey_function find_flag,
                                    const key_range *end_key) {
  DBUG_ENTER_FUNC();

  /* Debug hook: spin forever so a row read can be observed externally. */
  DBUG_EXECUTE_IF("myrocks_busy_loop_on_row_read", int debug_i = 0;
                  while (1) { debug_i++; });

  int rc = 0;

  THD *thd = ha_thd();
  DEBUG_SYNC(thd, "rocksdb.check_flags_rmi");
  if (thd && thd->killed) {
    rc = HA_ERR_QUERY_INTERRUPTED;
    DBUG_RETURN(rc);
  }

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  const uint actual_key_parts = kd.get_key_parts();
  bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);

  /* If the caller didn't pass an end key, use the one set by set_end_range(). */
  if (!end_key) end_key = end_range;

  /* By default, we don't need the retrieved records to match the prefix */
  m_sk_match_prefix = nullptr;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
      using_full_key) {
    /*
      Equality lookup over primary key, using full tuple.
      This is a special case, use DB::Get.
    */
    const uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_pk_packed_tuple, key, keypart_map);
    /* For qualifying single-table DELETEs, skip the read entirely. */
    bool skip_lookup = is_blind_delete_enabled();

    rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, skip_lookup, false);

    if (!rc && !skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
      stats.rows_read++;
      stats.rows_index_first++;
#endif
      update_row_stats(ROWS_READ);
    }
    DBUG_RETURN(rc);
  }

  /*
    Unique secondary index performs lookups without the extended key fields
  */
  uint packed_size;
  if (active_index != table->s->primary_key &&
      table->key_info[active_index].flags & HA_NOSAME &&
      find_flag == HA_READ_KEY_EXACT && using_full_key) {
    key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
                                                  .user_defined_key_parts) -
                           1;
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, tmp_map);
    if (table->key_info[active_index].user_defined_key_parts !=
        kd.get_key_parts()) {
      using_full_key = false;
    }
  } else {
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, keypart_map);
  }

  if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
      (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
    /*
      We are doing a point index lookup, and ICP is enabled. It is possible
      that this call will be followed by ha_rocksdb->index_next_same() call.

      Do what InnoDB does: save the lookup tuple now. We will need it in
      index_next_same/find_icp_matching_index_rec in order to stop scanning
      as soon as index record doesn't match the lookup tuple.

      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
    m_sk_match_prefix = m_sk_match_prefix_buf;
    m_sk_match_length = packed_size;
    memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
  }

  int bytes_changed_by_succ = 0;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
    /* See below */
    bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
  }

  rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
                       packed_size);

  uint end_key_packed_size = 0;
  /*
    In MariaDB, the end_key is always the bigger end of the range.
    If we are doing a reverse-ordered scan (that is, walking from the bigger
    key values to smaller), we should use the smaller end of range as end_key.
  */
  const key_range *cur_end_key= end_key;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_BEFORE_KEY)
  {
    cur_end_key= m_start_range;
  }

  const uint eq_cond_len =
      calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, cur_end_key,
                       &end_key_packed_size);

  bool use_all_keys = false;
  if (find_flag == HA_READ_KEY_EXACT &&
      my_count_bits(keypart_map) == kd.get_key_parts()) {
    use_all_keys = true;
  }

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan");
    if (thd && thd->killed) {
      rc = HA_ERR_QUERY_INTERRUPTED;
      break;
    }
    /*
      This will open the iterator and position it at a record that's equal or
      greater than the lookup tuple.
    */
    setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);

    /*
      Once we are positioned on from above, move to the position we really
      want: See storage/rocksdb/rocksdb-range-access.txt
    */
    bool move_forward;
    rc =
        position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
                                slice, &move_forward, tx->m_snapshot_timestamp);

    if (rc) {
      break;
    }

    m_skip_scan_it_next_call = false;

    /*
      Now get the data for the row into 'buf'. If we were using a primary key
      then we have all the rows we need. For a secondary key we now need to
      lookup the primary key.
    */
    if (active_index == table->s->primary_key) {
      rc = read_row_from_primary_key(buf);
    } else {
      rc = read_row_from_secondary_key(buf, kd, move_forward);
    }

    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* Exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (rc) {
    /*
      This status is returned on any error
      the only possible error condition is record-not-found
    */
    table->status = STATUS_NOT_FOUND;
  } else {
    table->status = 0;
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_first++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
8657
8658 /*
8659 @brief
8660 Scan the secondary index until we find an index record that satisfies ICP
8661
8662 @param move_forward TRUE <=> move m_scan_it forward
8663 FALSE <=> move m_scan_it backward
8664 @param buf Record buffer (must be the same buffer that
8665 pushed index condition points to, in practice
8666 it is table->record[0])
8667
8668 @detail
8669 Move the current iterator m_scan_it until we get an index tuple that
8670 satisfies the pushed Index Condition.
8671 (if there is no pushed index condition, return right away)
8672
8673 @return
8674 0 - Index tuple satisfies ICP, can do index read.
8675 other - error code
8676 */
8677
int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward,
                                            uchar *const buf) {
  /* If there is no pushed index condition for this index, nothing to do. */
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];
    THD *thd = ha_thd();

    while (1) {
      /* Skip records that have expired (TTL) before evaluating anything. */
      int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        return rc;
      }

      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }

      /* Iterator exhausted: no more records at all. */
      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      /* We've walked past the end of this index. */
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      /*
        For point lookups, index_read_map_impl() saved the lookup prefix in
        m_sk_match_prefix; stop as soon as a record no longer matches it.
      */
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      /* Unpack the index columns into 'buf' so the pushed condition can be
         evaluated against them. */
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_converter->get_verify_row_debug_checksums());
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const check_result_t icp_status= handler_index_cond_check(this);
      if (icp_status == CHECK_NEG) {
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      }
      else if (icp_status == CHECK_OUT_OF_RANGE ||
               icp_status == CHECK_ABORTED_BY_USER) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      else /* icp_status == CHECK_POS */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
8742
8743 /**
8744 @return
8745 HA_EXIT_SUCCESS OK
8746 other HA_ERR error code (can be SE-specific)
8747 */
8748
8749 /**
8750 @return
8751 HA_ADMIN_OK OK
8752 other HA_ADMIN error code
8753 */
int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(check_opt != nullptr);

  const uint pk = pk_index(table, m_tbl_def);
  String rowkey_copy;
  String sec_key_copy;
  const char *const table_name = table->s->table_name.str;

  // Only when debugging: don't use snapshot when reading
  // Rdb_transaction *tx= get_or_create_tx(table->in_use);
  // tx->snapshot= nullptr;

  /* Force row checksum verification for the duration of the check. */
  bool save_verify_row_debug_checksums =
      m_converter->get_verify_row_debug_checksums();
  m_converter->set_verify_row_debug_checksums(true);
  /* For each secondary index, check that we can get a PK value from it */
  // NO_LINT_DEBUG
  sql_print_verbose_info("CHECKTABLE %s: Checking table %s", table_name,
                         table_name);
  ha_rows UNINIT_VAR(row_checksums_at_start);  // set/used iff first_index==true
  ha_rows row_checksums = ha_rows(-1);
  bool first_index = true;

  for (uint keyno = 0; keyno < table->s->keys; keyno++) {
    if (keyno != pk) {
      extra(HA_EXTRA_KEYREAD);
      ha_index_init(keyno, true);
      ha_rows rows = 0;
      ha_rows checksums = 0;
      if (first_index) {
        row_checksums_at_start = m_converter->get_row_checksums_checked();
      }
      int res;
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: Checking index %s", table_name,
                             table->key_info[keyno].name.str);
      /* Walk every entry of this secondary index. */
      while (1) {
        if (!rows) {
          res = index_first(table->record[0]);
        } else {
          res = index_next(table->record[0]);
        }

        if (res == HA_ERR_END_OF_FILE) break;
        if (res) {
          // error
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
                          table_name, rows, res);
          goto error;
        }
        /* Copy the SK and rowid: buffers are reused by the reads below. */
        rocksdb::Slice key = m_scan_it->key();
        sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
        rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
                         &my_charset_bin);

        if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
                m_scan_it->value())) {
          checksums++;
        }

        /* The rowid taken from the SK must resolve to an existing PK row. */
        if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
                                    rowkey_copy.length()))) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s: .. row %lld: "
              "failed to fetch row by rowid",
              table_name, rows);
          goto error;
        }

        longlong hidden_pk_id = 0;
        if (has_hidden_pk(table) &&
            read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
          goto error;
        }

        /* Check if we get the same PK value */
        uint packed_size = m_pk_descr->pack_record(
            table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
            false, hidden_pk_id);
        if (packed_size != rowkey_copy.length() ||
            memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
                          table_name, rows);
          goto print_and_error;
        }

        /* Check if we get the same secondary key value */
        packed_size = m_key_descr_arr[keyno]->pack_record(
            table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
            &m_sk_tails, false, hidden_pk_id);
        if (packed_size != sec_key_copy.length() ||
            memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s: .. row %lld: "
              "secondary index value mismatch",
              table_name, rows);
          goto print_and_error;
        }
        rows++;
        continue;

        /* Dump the offending key/record/index entry in hex, then bail out. */
      print_and_error : {
        std::string buf;
        buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: rowkey: %s", table_name, buf.c_str());

        buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: record: %s", table_name, buf.c_str());

        buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: index: %s", table_name, buf.c_str());

        goto error;
      }
      }
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: ... %lld index entries checked "
                             "(%lld had checksums)",
                             table_name, rows, checksums);

      if (first_index) {
        row_checksums =
            m_converter->get_row_checksums_checked() - row_checksums_at_start;
        first_index = false;
      }
      ha_index_end();
    }
  }
  if (row_checksums != ha_rows(-1)) {
    // NO_LINT_DEBUG
    sql_print_verbose_info("CHECKTABLE %s: %lld table records had checksums",
                           table_name, row_checksums);
  }
  extra(HA_EXTRA_NO_KEYREAD);

  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  /*
    TODO(sergiy): we should check also for PK records that are missing in
    the secondary indexes.
    For that, need to walk through the PK and check that every PK record has a
    proper counterpart in each secondary index.
  */
  DBUG_RETURN(HA_ADMIN_OK);
error:
  /* Restore checksum-verification mode and index state before reporting. */
  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  ha_index_or_rnd_end();
  extra(HA_EXTRA_NO_KEYREAD);

  DBUG_RETURN(HA_ADMIN_CORRUPT);
}
8917
/*
  Write 'len' bytes of 'str' to 'out' as a double-quoted human-readable
  string: printable ASCII characters are emitted verbatim, any other byte
  is emitted as a backslash followed by its decimal value (e.g. "\10").
*/
static void dbug_dump_str(FILE *const out, const char *const str, int len) {
  fprintf(out, "\"");
  for (int i = 0; i < len; i++) {
    /*
      Go through unsigned char: plain 'char' may be signed, in which case
      bytes >= 0x80 would compare as negative and print as "\-NN".
    */
    const unsigned char c = static_cast<unsigned char>(str[i]);
    if (c > 32 && c < 127) {
      fprintf(out, "%c", c);
    } else {
      fprintf(out, "\\%u", c);
    }
  }
  fprintf(out, "\"");
}
8929
8930 /*
8931 Debugging help: dump the whole database into a human-readable file.
8932 Usage:
8933 dbug_dump_database(rdb);
8934 */
8935
8936 void dbug_dump_database(rocksdb::DB *const db) {
8937 FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
8938 if (!out) return;
8939
8940 rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
8941 for (it->SeekToFirst(); it->Valid(); it->Next()) {
8942 rocksdb::Slice key = it->key();
8943 rocksdb::Slice val = it->value();
8944 dbug_dump_str(out, key.data(), key.size());
8945 fprintf(out, " -> ");
8946 dbug_dump_str(out, val.data(), val.size());
8947 fprintf(out, "\n");
8948 }
8949
8950 delete it;
8951 fclose(out);
8952 }
8953
8954 rocksdb::Status ha_rocksdb::get_for_update(
8955 Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family,
8956 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
8957 DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
8958 const bool exclusive = m_lock_rows != RDB_LOCK_READ;
8959
8960 const bool do_validate =
8961 my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED;
8962 rocksdb::Status s =
8963 tx->get_for_update(column_family, key, value, exclusive, do_validate);
8964
8965 #ifndef DBUG_OFF
8966 ++rocksdb_num_get_for_update_calls;
8967 #endif
8968 return s;
8969 }
8970
8971 bool ha_rocksdb::is_blind_delete_enabled() {
8972 THD *thd = ha_thd();
8973 /*
8974 Note: in MariaDB, thd->lex->table_count is only set for multi-table DELETE,
8975 not for single-table DELETE. So we check thd->lex->query_tables instead.
8976 */
8977 return (THDVAR(thd, blind_delete_primary_key) &&
8978 thd->lex->sql_command == SQLCOM_DELETE &&
8979 thd->lex->query_tables && !thd->lex->query_tables->next_global &&
8980 table->s->keys == 1 &&
8981 !has_hidden_pk(table) && !thd->rgi_slave);
8982 }
8983
8984 /*
8985 Given a rowid (i.e. packed PK) as a parameter, get the record.
8986
8987 @return
8988 HA_EXIT_SUCCESS OK
8989 other HA_ERR error code (can be SE-specific)
8990 */
8991
int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size, const bool skip_lookup,
                                 const bool skip_ttl_check) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table != nullptr);

  int rc;

  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] =
        "now signal Reached "
        "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);

  bool found;
  rocksdb::Status s;

  /* Pretend row found without looking up */
  if (skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_deleted_blind++;
#endif
    update_row_stats(ROWS_DELETED_BLIND);
    // Record the key as the "current row" so a following delete works.
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    table->status = 0;
    DBUG_RETURN(0);
  }

  if (m_lock_rows == RDB_LOCK_NONE) {
    // Plain (non-locking) read: use or create the transaction snapshot.
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else if (m_insert_with_update && m_dup_pk_found) {
    DBUG_ASSERT(m_pk_descr->get_keyno() == m_dupp_errkey);
    DBUG_ASSERT(m_dup_pk_retrieved_record.length() ==
                m_retrieved_record.size());
    DBUG_ASSERT(memcmp(m_dup_pk_retrieved_record.ptr(),
                       m_retrieved_record.data(),
                       m_retrieved_record.size()) == 0);

    // do nothing - we already have the result in m_retrieved_record and
    // already taken the lock
    s = rocksdb::Status::OK();
  } else {
    // Locking read: take the row lock together with the lookup.
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
                  dbug_change_status_to_corrupted(&s););

  // Any status other than OK / NotFound is a real error.
  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    /* If we found the record, but it's expired, pretend we didn't find it. */
    if (!skip_ttl_check && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            tx->m_snapshot_timestamp)) {
      DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
    }

    // Remember the key and unpack the value into the MySQL row buffer.
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);

    if (!rc) {
      table->status = 0;
    }
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
9083
9084 /**
9085 @return
9086 HA_EXIT_SUCCESS OK
9087 other HA_ERR error code (can be SE-specific)
9088 */
9089 int ha_rocksdb::index_next(uchar *const buf) {
9090 DBUG_ENTER_FUNC();
9091
9092 bool moves_forward = true;
9093 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9094 moves_forward = false;
9095 }
9096
9097 int rc = index_next_with_direction(buf, moves_forward);
9098 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9099
9100 DBUG_RETURN(rc);
9101 }
9102
9103 /**
9104 @return
9105 HA_EXIT_SUCCESS OK
9106 other HA_ERR error code (can be SE-specific)
9107 */
9108 int ha_rocksdb::index_prev(uchar *const buf) {
9109 DBUG_ENTER_FUNC();
9110
9111 bool moves_forward = false;
9112 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9113 moves_forward = true;
9114 }
9115
9116 int rc = index_next_with_direction(buf, moves_forward);
9117 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9118
9119 DBUG_RETURN(rc);
9120 }
9121
int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  if (active_index == pk_index(table, m_tbl_def)) {
    // A PK scan is just a table scan over the PK column family.
    rc = rnd_next_with_direction(buf, move_forward);
  } else {
    THD *thd = ha_thd();
    for (;;) {
      DEBUG_SYNC(thd, "rocksdb.check_flags_inwd");
      if (thd && thd->killed) {
        rc = HA_ERR_QUERY_INTERRUPTED;
        break;
      }
      // The first call after a Seek() must consume the row the iterator
      // is already positioned on instead of advancing past it.
      if (m_skip_scan_it_next_call) {
        m_skip_scan_it_next_call = false;
      } else {
        if (move_forward) {
          m_scan_it->Next(); /* this call cannot fail */
        } else {
          m_scan_it->Prev();
        }
      }
      // Skip over TTL-expired records before interpreting the row.
      rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index],
                                        m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        break;
      }
      // Apply index-condition pushdown, then fetch the full row via the SK.
      rc = find_icp_matching_index_rec(move_forward, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
      // Loop again only if the record was invalidated under us;
      // otherwise return whatever result we got.
      if (!should_skip_invalidated_record(rc)) {
        break;
      }
    }
  }

  DBUG_RETURN(rc);
}
9161
9162 /**
9163 @return
9164 HA_EXIT_SUCCESS OK
9165 other HA_ERR error code (can be SE-specific)
9166 */
9167 int ha_rocksdb::index_first(uchar *const buf) {
9168 DBUG_ENTER_FUNC();
9169
9170 m_sk_match_prefix = nullptr;
9171 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9172 ? index_last_intern(buf)
9173 : index_first_intern(buf);
9174 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9175
9176 DBUG_RETURN(rc);
9177 }
9178
9179 /**
9180 @return
9181 HA_EXIT_SUCCESS OK
9182 other HA_ERR error code (can be SE-specific)
9183 */
9184 int ha_rocksdb::index_last(uchar *const buf) {
9185 DBUG_ENTER_FUNC();
9186
9187 m_sk_match_prefix = nullptr;
9188 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9189 ? index_first_intern(buf)
9190 : index_last_intern(buf);
9191 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9192
9193 DBUG_RETURN(rc);
9194 }
9195
9196 /*
9197 Start scanning from the "first" value.
9198
9199 The 'first' here means "the first from start of the key space".
9200 For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
9201
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose the index we are at has number n)
9204
9205 (n-1) - ...
9206 ( n ) <--- 1. (n) doesn't exist in the db but it would be here.
9207 ( n ) - aaa <--- 2. Seek("n") will put us here on the first index
9208 ( n ) - bbb record.
9209 ( n ) - cc
9210
9211 So, need to do: Seek(n);
9212
9213 A backward-ordered keyspace:
9214
9215 (n+1) - bbb
9216 (n+1) - aaa
9217 (n+1) <--- (n+1) doesn't exist in the db but would be here.
9218 ( n ) - ccc <--- 1. We need to be here.
9219 ( n ) - bbb
9220 ( n ) - aaa
9221 ( n )
9222
9223 So, need to: Seek(n+1);
9224
9225 */
9226
int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  // The packed-key buffer differs for PK vs. secondary-key scans.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // Build the key to Seek() from (see the function comment above: the bare
  // index number, or index number + 1 for reverse column families).
  int key_start_matching_bytes = kd.get_first_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
    m_scan_it->Seek(index_key);
    // Seek() already positioned us on the first record; the subsequent
    // index_next_with_direction() call must not advance past it.
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9281
9282 /**
9283 @details
9284 Start scanning from the "last" value
9285
9286 The 'last' here means "the last from start of the key space".
9287 For reverse-ordered key spaces, we will actually read the smallest value.
9288
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose we are at a key that has number n)
9291
9292 (n-1)-something
9293 ( n )-aaa
9294 ( n )-bbb
9295 ( n )-ccc <----------- Need to seek to here.
9296 (n+1) <---- Doesn't exist, but would be here.
9297 (n+1)-smth, or no value at all
9298
9299 RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's
9300 smaller". We can't seek to "(n)-ccc" directly, because we don't know what
9301 is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
9302 to "(n+1)", which is the least possible value that's greater than any value
9303 in index #n.
9304
9305 So, need to: it->SeekForPrev(n+1)
9306
9307 A backward-ordered keyspace:
9308
9309 (n+1)-something
9310 ( n ) - ccc
9311 ( n ) - bbb
9312 ( n ) - aaa <---------------- (*) Need to seek here.
9313 ( n ) <--- Doesn't exist, but would be here.
9314 (n-1)-smth, or no value at all
9315
9316 So, need to: it->SeekForPrev(n)
9317 */
9318
int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  // The packed-key buffer differs for PK vs. secondary-key scans.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // Build the key to SeekForPrev() from (see the function comment above:
  // index number + 1, or the bare index number for reverse column families).
  int key_end_matching_bytes = kd.get_last_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_end_matching_bytes);
    m_scan_it->SeekForPrev(index_key);
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      // The iterator is already on the last PK record; do not advance.
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      // Secondary index: apply index-condition pushdown, then read the row.
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
    }

    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9380
9381 void ha_rocksdb::unlock_row() {
9382 DBUG_ENTER_FUNC();
9383
9384 if (m_lock_rows != RDB_LOCK_NONE) {
9385 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9386 tx->release_lock(m_pk_descr->get_cf(),
9387 std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
9388 }
9389
9390 DBUG_VOID_RETURN;
9391 }
9392
9393 /*
9394 Returning true if SingleDelete can be used.
9395 - Secondary Indexes can always use SingleDelete.
9396 - If the index is PRIMARY KEY, and if all of the columns of the table
9397 are covered by the PRIMARY KEY, SingleDelete can be used.
9398 */
9399 bool ha_rocksdb::can_use_single_delete(const uint index) const {
9400 return (index != pk_index(table, m_tbl_def) ||
9401 (!has_hidden_pk(table) &&
9402 table->key_info[index].ext_key_parts == table->s->fields));
9403 }
9404
bool ha_rocksdb::skip_unique_check() const {
  /*
    We want to skip unique checks if:
    1) bulk_load is on
    2) this table is in the whitelist of tables to skip and the replication
       lag has reached a large enough value (see unique_check_lag_threshold
       and unique_check_lag_reset_threshold)
    3) the user set unique_checks option to 0, and the table does not have
       any secondary indexes. If the table has secondary keys, those might
       become inconsistent/corrupted
    4) We're using read-free replication
  */
  return THDVAR(table->in_use, bulk_load) ||
         (m_force_skip_unique_check && m_skip_unique_check) ||
         (my_core::thd_test_options(table->in_use,
                                    OPTION_RELAXED_UNIQUE_CHECKS) &&
          m_tbl_def->m_key_count == 1) ||
#ifdef MARIAROCKS_NOT_YET
         use_read_free_rpl();
#else
         FALSE;
#endif
}
9428
#ifdef MARIAROCKS_NOT_YET // MDEV-10975
// Force unique checks on or off regardless of session settings; consumed
// by skip_unique_check(). Not yet enabled in MariaDB (see MDEV-10975).
void ha_rocksdb::set_force_skip_unique_check(bool skip) {
  DBUG_ENTER_FUNC();

  m_force_skip_unique_check = skip;

  DBUG_VOID_RETURN;
}
#endif
9438
9439 bool ha_rocksdb::commit_in_the_middle() {
9440 return THDVAR(table->in_use, bulk_load) ||
9441 THDVAR(table->in_use, commit_in_the_middle);
9442 }
9443
9444 /*
9445 Executing bulk commit if it should.
9446 @retval true if bulk commit failed
9447 @retval false if bulk commit was skipped or succeeded
9448 */
9449 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
9450 return commit_in_the_middle() &&
9451 tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
9452 tx->flush_batch();
9453 }
9454
9455 /*
9456 If table was created without primary key, SQL layer represents the primary
9457 key number as MAX_INDEXES. Hence, this function returns true if the table
9458 does not contain a primary key. (In which case we generate a hidden
9459 'auto-incremented' pk.)
9460 */
bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
  // Thin wrapper: true when the table has no user-defined PRIMARY KEY, in
  // which case MyRocks maintains a hidden auto-incremented PK for it.
  return Rdb_key_def::table_has_hidden_pk(table);
}
9464
9465 /*
9466 Returns true if given index number is a hidden_pk.
9467 - This is used when a table is created with no primary key.
9468 */
9469 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
9470 const Rdb_tbl_def *const tbl_def_arg) {
9471 DBUG_ASSERT(table_arg->s != nullptr);
9472
9473 return (table_arg->s->primary_key == MAX_INDEXES &&
9474 index == tbl_def_arg->m_key_count - 1);
9475 }
9476
9477 /* Returns index of primary key */
9478 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
9479 const Rdb_tbl_def *const tbl_def_arg) {
9480 DBUG_ASSERT(table_arg->s != nullptr);
9481
9482 return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
9483 : table_arg->s->primary_key;
9484 }
9485
9486 /* Returns true if given index number is a primary key */
9487 bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
9488 const Rdb_tbl_def *const tbl_def_arg) {
9489 DBUG_ASSERT(table_arg->s != nullptr);
9490
9491 return index == table_arg->s->primary_key ||
9492 is_hidden_pk(index, table_arg, tbl_def_arg);
9493 }
9494
9495 uint ha_rocksdb::max_supported_key_part_length() const {
9496 DBUG_ENTER_FUNC();
9497 DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE
9498 : MAX_INDEX_COL_LEN_SMALL);
9499 }
9500
9501 const char *ha_rocksdb::get_key_name(const uint index,
9502 const TABLE *const table_arg,
9503 const Rdb_tbl_def *const tbl_def_arg) {
9504 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9505 return HIDDEN_PK_NAME;
9506 }
9507
9508 DBUG_ASSERT(table_arg->key_info != nullptr);
9509 DBUG_ASSERT(table_arg->key_info[index].name.str != nullptr);
9510
9511 return table_arg->key_info[index].name.str;
9512 }
9513
9514 const char *ha_rocksdb::get_key_comment(const uint index,
9515 const TABLE *const table_arg,
9516 const Rdb_tbl_def *const tbl_def_arg) {
9517 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9518 return nullptr;
9519 }
9520
9521 DBUG_ASSERT(table_arg->key_info != nullptr);
9522
9523 return table_arg->key_info[index].comment.str;
9524 }
9525
9526 const std::string ha_rocksdb::generate_cf_name(
9527 const uint index, const TABLE *const table_arg,
9528 const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found) {
9529 DBUG_ASSERT(table_arg != nullptr);
9530 DBUG_ASSERT(tbl_def_arg != nullptr);
9531 DBUG_ASSERT(per_part_match_found != nullptr);
9532
9533 // When creating CF-s the caller needs to know if there was a custom CF name
9534 // specified for a given paritition.
9535 *per_part_match_found = false;
9536
9537 // Index comment is used to define the column family name specification(s).
9538 // If there was no comment, we get an emptry string, and it means "use the
9539 // default column family".
9540 const char *const comment = get_key_comment(index, table_arg, tbl_def_arg);
9541
9542 // `get_key_comment` can return `nullptr`, that's why this.
9543 std::string key_comment = comment ? comment : "";
9544
9545 std::string cf_name = Rdb_key_def::parse_comment_for_qualifier(
9546 key_comment, table_arg, tbl_def_arg, per_part_match_found,
9547 RDB_CF_NAME_QUALIFIER);
9548
9549 if (IF_PARTITIONING(table_arg->part_info,nullptr) != nullptr && !*per_part_match_found) {
9550 // At this point we tried to search for a custom CF name for a partition,
9551 // but none was specified. Therefore default one will be used.
9552 return "";
9553 }
9554
9555 // If we didn't find any partitioned/non-partitioned qualifiers, return the
9556 // comment itself. NOTE: this currently handles returning the cf name
9557 // specified in the index comment in the case of no partitions, which doesn't
9558 // use any qualifiers at the moment. (aka its a special case)
9559 if (cf_name.empty() && !key_comment.empty()) {
9560 return key_comment;
9561 }
9562
9563 return cf_name;
9564 }
9565
const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) {
  DBUG_ASSERT(table_arg->s != nullptr);

  // The COMMENT clause from CREATE TABLE, as stored in the table share.
  return table_arg->s->comment.str;
}
9571
9572 /**
9573 Write a new row
9574
9575 @param[in] buf new row data to write
9576 @return
9577 HA_EXIT_SUCCESS OK
9578 other HA_ERR error code (can be SE-specific)
9579 */
int ha_rocksdb::write_row(const uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(buf == table->record[0]);
  DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);

  /*
    Note: "buf == table->record[0]" is copied from innodb. I am not aware of
    any use cases where this condition is not true.
  */
  // Generate the next auto-increment value before packing the row.
  if (table->next_number_field && buf == table->record[0]) {
    int err;
    if ((err = update_auto_increment())) {
      DBUG_RETURN(err);
    }
  }

  // clear cache at beginning of write for INSERT ON DUPLICATE
  // we may get multiple write->fail->read->update if there are multiple
  // values from INSERT
  m_dup_pk_found = false;

  // nullptr old data => this is an INSERT (no existing row image).
  const int rv = update_write_row(nullptr, buf, skip_unique_check());

  if (rv == 0) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_inserted++;
#endif
    update_row_stats(ROWS_INSERTED);
  }

  DBUG_RETURN(rv);
}
9614
9615 /**
9616 Constructing m_last_rowkey (MyRocks key expression) from
9617 before_update|delete image (MySQL row expression).
9618 m_last_rowkey is normally set during lookup phase, such as
9619 rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
9620 these read functions are skipped and update_rows(), delete_rows() are
9621 called without setting m_last_rowkey. This function sets m_last_rowkey
9622 for Read Free Replication.
9623 */
void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
#ifdef MARIAROCKS_NOT_YET
  // Only needed under read-free replication, where the usual read path
  // (which normally sets m_last_rowkey) is skipped; rebuild the packed PK
  // from the before-image instead.
  if (old_data && use_read_free_rpl()) {
    const int old_pk_size = m_pk_descr->pack_record(
        table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
                       &my_charset_bin);
  }
#endif
}
9634
9635 /**
9636 Collect update data for primary key
9637
9638 @param[in, out] row_info hold all data for update row, such as
9639 new row data/old row data
9640 @return
9641 HA_EXIT_SUCCESS OK
9642 other HA_ERR error code (can be SE-specific)
9643 */
int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
  int size;

  /*
    Get new row key for any insert, and any update where the pk is not hidden.
    Row key for updates with hidden pk is handled below.
  */
  if (!has_hidden_pk(table)) {
    row_info->hidden_pk_id = 0;

    row_info->new_pk_unpack_info = &m_pk_unpack_info;

    // Pack the user-defined PK from the new row image into
    // m_pk_packed_tuple (also filling the unpack info).
    size = m_pk_descr->pack_record(
        table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple,
        row_info->new_pk_unpack_info, false, 0, 0, nullptr);
  } else if (row_info->old_data == nullptr) {
    // INSERT with a hidden PK: allocate the next hidden PK value.
    row_info->hidden_pk_id = update_hidden_pk_val();
    size =
        m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
  } else {
    /*
      If hidden primary key, rowkey for new record will always be the same as
      before
    */
    size = row_info->old_pk_slice.size();
    memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
    int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
    if (err) {
      return err;
    }
  }

  // Expose the packed key to the caller as a slice over the tuple buffer.
  row_info->new_pk_slice =
      rocksdb::Slice((const char *)m_pk_packed_tuple, size);

  return HA_EXIT_SUCCESS;
}
9681
9682 /**
9683 Check the specified primary key value is unique and also lock the row
9684
9685 @param[in] key_id key index
9686 @param[in] row_info hold all data for update row, such as old row
9687 data and new row data
9688 @param[out] found whether the primary key exists before.
9689 @param[out] pk_changed whether primary key is changed
9690 @return
9691 HA_EXIT_SUCCESS OK
9692 other HA_ERR error code (can be SE-specific)
9693 */
int ha_rocksdb::check_and_lock_unique_pk(const uint key_id,
                                         const struct update_row_info &row_info,
                                         bool *const found) {
  DBUG_ASSERT(found != nullptr);

  // Caller only invokes this for INSERTs or PK-changing UPDATEs.
  DBUG_ASSERT(row_info.old_pk_slice.size() == 0 ||
              row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0);

  /* Ignore PK violations if this is a optimized 'replace into' */
#ifdef MARIAROCKS_NOT_YET
  const bool ignore_pk_unique_check = ha_thd()->lex->blind_replace_into;
#else
  const bool ignore_pk_unique_check= false;
#endif

  /*
    Perform a read to determine if a duplicate entry exists. For primary
    keys, a point lookup will be sufficient.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  /*
    To prevent race conditions like below, it is necessary to
    take a lock for a target row. get_for_update() holds a gap lock if
    target key does not exist, so below conditions should never
    happen.

    1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
       -> T2 Put(overwrite) -> T2 commit
    2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
       -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
  */
  const rocksdb::Status s =
      get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
                     ignore_pk_unique_check ? nullptr : &m_retrieved_record);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(
        table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
  }

  bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound();

  /*
    If the pk key has ttl, we may need to pretend the row wasn't
    found if it is already expired.
  */
  // Without a snapshot timestamp, fall back to the current wall clock.
  if (key_found && m_pk_descr->has_ttl() &&
      should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                          (row_info.tx->m_snapshot_timestamp
                               ? row_info.tx->m_snapshot_timestamp
                               : static_cast<int64_t>(std::time(nullptr))))) {
    key_found = false;
  }

  if (key_found && row_info.old_data == nullptr && m_insert_with_update) {
    // In INSERT ON DUPLICATE KEY UPDATE ... case, if the insert failed
    // due to a duplicate key, remember the last key and skip the check
    // next time
    m_dup_pk_found = true;

#ifndef DBUG_OFF
    // save it for sanity checking later
    m_dup_pk_retrieved_record.copy(m_retrieved_record.data(),
                                   m_retrieved_record.size(), &my_charset_bin);
#endif
  }

  *found = key_found;

  return HA_EXIT_SUCCESS;
}
9767
9768 /**
9769 Check the specified secondary key value is unique and also lock the row
9770
9771 @param[in] key_id key index
9772 @param[in] row_info hold all data for update row, such as old row
9773 data and new row data
9774 @param[out] found whether specified key value exists before.
9775 @return
9776 HA_EXIT_SUCCESS OK
9777 other HA_ERR error code (can be SE-specific)
9778 */
int ha_rocksdb::check_and_lock_sk(const uint key_id,
                                  const struct update_row_info &row_info,
                                  bool *const found) {
  DBUG_ASSERT(found != nullptr);
  *found = false;

  /*
    Can skip checking this key if none of the key fields have changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
    return HA_EXIT_SUCCESS;
  }

  KEY *key_info = nullptr;
  uint n_null_fields = 0;
  uint user_defined_key_parts = 1;

  key_info = &table->key_info[key_id];
  user_defined_key_parts = key_info->user_defined_key_parts;
  /*
    If there are no uniqueness requirements, there's no need to obtain a
    lock for this key.
  */
  if (!(key_info->flags & HA_NOSAME)) {
    return HA_EXIT_SUCCESS;
  }

  const Rdb_key_def &kd = *m_key_descr_arr[key_id];

  /*
    Calculate the new key for obtaining the lock

    For unique secondary indexes, the key used for locking does not
    include the extended fields.
  */
  int size =
      kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
                     nullptr, false, 0, user_defined_key_parts, &n_null_fields);
  if (n_null_fields > 0) {
    /*
      If any fields are marked as NULL this will never match another row as
      to NULL never matches anything else including another NULL.
    */
    return HA_EXIT_SUCCESS;
  }

  const rocksdb::Slice new_slice =
      rocksdb::Slice((const char *)m_sk_packed_tuple, size);

  /*
    Acquire lock on the old key in case of UPDATE
  */
  if (row_info.old_data != nullptr) {
    size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
                          m_sk_packed_tuple_old, nullptr, false, 0,
                          user_defined_key_parts);
    const rocksdb::Slice old_slice =
        rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);

    const rocksdb::Status s =
        get_for_update(row_info.tx, kd.get_cf(), old_slice, nullptr);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                           m_table_handler);
    }

    /*
      If the old and new keys are the same we're done since we've already taken
      the lock on the old key
    */
    if (!new_slice.compare(old_slice)) {
      return HA_EXIT_SUCCESS;
    }
  }

  /*
    Perform a read to determine if a duplicate entry exists - since this is
    a secondary indexes a range scan is needed.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());

  /*
    This iterator seems expensive since we need to allocate and free
    memory for each unique index.

    If this needs to be optimized, for keys without NULL fields, the
    extended primary key fields can be migrated to the value portion of the
    key. This enables using Get() instead of Seek() as in the primary key
    case.

    The bloom filter may need to be disabled for this lookup.
  */
  uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  rocksdb::Slice lower_bound_slice;
  rocksdb::Slice upper_bound_slice;

  // Fall back to a total-order seek when the bloom filter cannot be used
  // for this lookup; otherwise use the computed iteration bounds.
  const bool total_order_seek = !check_bloom_and_set_bounds(
      ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE,
      lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
  const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);

  // Lock the new key value (a gap lock if it does not exist yet).
  const rocksdb::Status s =
      get_for_update(row_info.tx, kd.get_cf(), new_slice, nullptr);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                         m_table_handler);
  }

  rocksdb::Iterator *const iter = row_info.tx->get_iterator(
      kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice,
      upper_bound_slice, true /* read current data */,
      false /* acquire snapshot */);
  /*
    Need to scan the transaction to see if there is a duplicate key.
    Also need to scan RocksDB and verify the key has not been deleted
    in the transaction.
  */
  iter->Seek(new_slice);
  *found = !read_key_exact(kd, iter, all_parts_used, new_slice,
                           row_info.tx->m_snapshot_timestamp);
  delete iter;

  return HA_EXIT_SUCCESS;
}
9908
9909 /**
  Enumerate all keys to check their uniqueness and lock them
9911
9912 @param[in] row_info hold all data for update row, such as old row
9913 data and new row data
9914 @param[out] pk_changed whether primary key is changed
9915 @return
9916 HA_EXIT_SUCCESS OK
9917 other HA_ERR error code (can be SE-specific)
9918 */
int ha_rocksdb::check_uniqueness_and_lock(
    const struct update_row_info &row_info, bool pk_changed) {
  /*
    Go through each index and determine if the index has uniqueness
    requirements. If it does, then try to obtain a row lock on the new values.
    Once all locks have been obtained, then perform the changes needed to
    update/insert the row.
  */
  for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
    bool found;
    int rc;

    if (is_pk(key_id, table, m_tbl_def)) {
      // An UPDATE that keeps the PK unchanged cannot introduce a PK
      // duplicate, and the row is already locked from the read phase.
      if (row_info.old_pk_slice.size() > 0 && !pk_changed) {
        found = false;
        rc = HA_EXIT_SUCCESS;
      } else {
        rc = check_and_lock_unique_pk(key_id, row_info, &found);
      }
    } else {
      rc = check_and_lock_sk(key_id, row_info, &found);
    }

    if (rc != HA_EXIT_SUCCESS) {
      return rc;
    }

    if (found) {
      /* There is a row with this key already, so error out. */
      errkey = key_id;
      m_dupp_errkey = errkey;

      return HA_ERR_FOUND_DUPP_KEY;
    }
  }

  return HA_EXIT_SUCCESS;
}
9957
9958 /**
9959 Check whether secondary key value is duplicate or not
9960
9961 @param[in] table_arg the table currently working on
  @param[in]  key_def   the key_def being checked
9963 @param[in] key secondary key storage data
  @param[out] sk_info   holds secondary key memcmp data (new/old)
9965 @return
9966 HA_EXIT_SUCCESS OK
9967 other HA_ERR error code (can be SE-specific)
9968 */
9969
9970 int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
9971 const Rdb_key_def &key_def,
9972 const rocksdb::Slice *key,
9973 struct unique_sk_buf_info *sk_info) {
9974 uint n_null_fields = 0;
9975 const rocksdb::Comparator *index_comp = key_def.get_cf()->GetComparator();
9976
9977 /* Get proper SK buffer. */
9978 uchar *sk_buf = sk_info->swap_and_get_sk_buf();
9979
9980 /* Get memcmp form of sk without extended pk tail */
9981 uint sk_memcmp_size =
9982 key_def.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
9983
9984 sk_info->sk_memcmp_key =
9985 rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
9986
9987 if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
9988 index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
9989 0) {
9990 return 1;
9991 }
9992
9993 sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
9994 return 0;
9995 }
9996
9997 int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
9998 const rocksdb::Slice &key,
9999 const rocksdb::Slice &value, bool sort) {
10000 DBUG_ENTER_FUNC();
10001 int res;
10002 THD *thd = ha_thd();
10003 if (thd && thd->killed) {
10004 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10005 }
10006
10007 rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
10008
10009 // In the case of unsorted inserts, m_sst_info allocated here is not
10010 // used to store the keys. It is still used to indicate when tables
10011 // are switched.
10012 if (m_sst_info == nullptr || m_sst_info->is_done()) {
10013 m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
10014 kd.get_name(), cf, *rocksdb_db_options,
10015 THDVAR(ha_thd(), trace_sst_api)));
10016 res = tx->start_bulk_load(this, m_sst_info);
10017 if (res != HA_EXIT_SUCCESS) {
10018 DBUG_RETURN(res);
10019 }
10020 }
10021 DBUG_ASSERT(m_sst_info);
10022
10023 if (sort) {
10024 Rdb_index_merge *key_merge;
10025 DBUG_ASSERT(cf != nullptr);
10026
10027 res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
10028 if (res == HA_EXIT_SUCCESS) {
10029 res = key_merge->add(key, value);
10030 }
10031 } else {
10032 res = m_sst_info->put(key, value);
10033 }
10034
10035 DBUG_RETURN(res);
10036 }
10037
/**
  Finish any ongoing bulk load: wrap up the current SST writer and ingest
  the produced files into their column family.

  @param[in] print_client_error  whether errors should be reported to the
                                 client
  @return
    HA_EXIT_SUCCESS           OK (including when there was nothing to do)
    HA_ERR_ROCKSDB_BULK_LOAD  ingestion of the SST files failed
    other                     error code from finishing the SST file
*/
int ha_rocksdb::finalize_bulk_load(bool print_client_error) {
  DBUG_ENTER_FUNC();

  int res = HA_EXIT_SUCCESS;

  /* Skip if there are no possible ongoing bulk loads */
  if (m_sst_info) {
    if (m_sst_info->is_done()) {
      // Nothing left to finish for this batch; just drop our reference.
      m_sst_info.reset();
      DBUG_RETURN(res);
    }

    Rdb_sst_info::Rdb_sst_commit_info commit_info;

    // Wrap up the current work in m_sst_info and get ready to commit
    // This transfer the responsibility of commit over to commit_info
    res = m_sst_info->finish(&commit_info, print_client_error);
    if (res == 0) {
      // Make sure we have work to do - under race condition we could lose
      // to another thread and end up with no work
      if (commit_info.has_work()) {
        rocksdb::IngestExternalFileOptions opts;
        // Move (not copy) the finished files into the DB directory.
        opts.move_files = true;
        opts.snapshot_consistency = false;
        opts.allow_global_seqno = false;
        opts.allow_blocking_flush = false;

        const rocksdb::Status s = rdb->IngestExternalFile(
            commit_info.get_cf(), commit_info.get_committed_files(), opts);
        if (!s.ok()) {
          if (print_client_error) {
            Rdb_sst_info::report_error_msg(s, nullptr);
          }
          res = HA_ERR_ROCKSDB_BULK_LOAD;
        } else {
          // Mark the list of SST files as committed, otherwise they'll get
          // cleaned up when commit_info destructs
          commit_info.commit();
        }
      }
    }
    m_sst_info.reset();
  }
  DBUG_RETURN(res);
}
10083
10084 /**
10085 Update an existing primary key record or write a new primary key record
10086
10087 @param[in] kd the primary key is being update/write
10088 @param[in] update_row_info hold all row data, such as old row data and
10089 new row data
10090 @param[in] pk_changed whether primary key is changed
10091 @return
10092 HA_EXIT_SUCCESS OK
10093 Other HA_ERR error code (can be SE-specific)
10094 */
10095 int ha_rocksdb::update_write_pk(const Rdb_key_def &kd,
10096 const struct update_row_info &row_info,
10097 bool pk_changed) {
10098 uint key_id = kd.get_keyno();
10099 bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
10100 ulonglong bytes_written = 0;
10101
10102 /*
10103 If the PK has changed, or if this PK uses single deletes and this is an
10104 update, the old key needs to be deleted. In the single delete case, it
10105 might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
10106 resulting in the first PUT(X) showing up.
10107 */
10108 if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
10109 can_use_single_delete(key_id)))) {
10110 const rocksdb::Status s = delete_or_singledelete(
10111 key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
10112 if (!s.ok()) {
10113 return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
10114 m_table_handler);
10115 } else {
10116 bytes_written = row_info.old_pk_slice.size();
10117 }
10118 }
10119
10120 if (table->found_next_number_field) {
10121 update_auto_incr_val_from_field();
10122 }
10123
10124 int rc = HA_EXIT_SUCCESS;
10125 rocksdb::Slice value_slice;
10126 /* Prepare the new record to be written into RocksDB */
10127 if ((rc = m_converter->encode_value_slice(
10128 m_pk_descr, row_info.new_pk_slice, row_info.new_pk_unpack_info,
10129 !row_info.old_pk_slice.empty(), should_store_row_debug_checksums(),
10130 m_ttl_bytes, &m_ttl_bytes_updated, &value_slice))) {
10131 return rc;
10132 }
10133
10134 const auto cf = m_pk_descr->get_cf();
10135 if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
10136 !hidden_pk) {
10137 /*
10138 Write the primary key directly to an SST file using an SstFileWriter
10139 */
10140 rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
10141 THDVAR(table->in_use, bulk_load_allow_unsorted));
10142 } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
10143 /*
10144 It is responsibility of the user to make sure that the data being
10145 inserted doesn't violate any unique keys.
10146 */
10147 row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
10148 value_slice);
10149 } else {
10150 const bool assume_tracked = can_assume_tracked(ha_thd());
10151 const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice,
10152 assume_tracked);
10153 if (!s.ok()) {
10154 if (s.IsBusy()) {
10155 errkey = table->s->primary_key;
10156 m_dupp_errkey = errkey;
10157 rc = HA_ERR_FOUND_DUPP_KEY;
10158 } else {
10159 rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
10160 m_tbl_def, m_table_handler);
10161 }
10162 }
10163 }
10164
10165 if (rc == HA_EXIT_SUCCESS) {
10166 row_info.tx->update_bytes_written(
10167 bytes_written + row_info.new_pk_slice.size() + value_slice.size());
10168 }
10169 return rc;
10170 }
10171
10172 /**
10173 update an existing secondary key record or write a new secondary key record
10174
10175 @param[in] table_arg Table we're working on
10176 @param[in] kd The secondary key being update/write
10177 @param[in] row_info data structure contains old row data and new row data
10178 @param[in] bulk_load_sk whether support bulk load. Currently it is only
10179 support for write
10180 @return
10181 HA_EXIT_SUCCESS OK
10182 Other HA_ERR error code (can be SE-specific)
10183 */
10184 int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
10185 const Rdb_key_def &kd,
10186 const struct update_row_info &row_info,
10187 const bool bulk_load_sk) {
10188 int new_packed_size;
10189 int old_packed_size;
10190 int rc = HA_EXIT_SUCCESS;
10191
10192 rocksdb::Slice new_key_slice;
10193 rocksdb::Slice new_value_slice;
10194 rocksdb::Slice old_key_slice;
10195
10196 const uint key_id = kd.get_keyno();
10197
10198 ulonglong bytes_written = 0;
10199
10200 /*
10201 Can skip updating this key if none of the key fields have changed and, if
10202 this table has TTL, the TTL timestamp has not changed.
10203 */
10204 if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
10205 (!kd.has_ttl() || !m_ttl_bytes_updated)) {
10206 return HA_EXIT_SUCCESS;
10207 }
10208
10209 bool store_row_debug_checksums = should_store_row_debug_checksums();
10210 new_packed_size =
10211 kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
10212 m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
10213 row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);
10214
10215 if (row_info.old_data != nullptr) {
10216 // The old value
10217 old_packed_size = kd.pack_record(
10218 table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
10219 &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
10220 nullptr, m_ttl_bytes);
10221
10222 /*
10223 Check if we are going to write the same value. This can happen when
10224 one does
10225 UPDATE tbl SET col='foo'
10226 and we are looking at the row that already has col='foo'.
10227
10228 We also need to compare the unpack info. Suppose, the collation is
10229 case-insensitive, and unpack info contains information about whether
10230 the letters were uppercase and lowercase. Then, both 'foo' and 'FOO'
10231 will have the same key value, but different data in unpack_info.
10232
10233 (note: anyone changing bytewise_compare should take this code into
10234 account)
10235 */
10236 if (old_packed_size == new_packed_size &&
10237 m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
10238 !(kd.has_ttl() && m_ttl_bytes_updated) &&
10239 memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
10240 0 &&
10241 memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
10242 m_sk_tails.get_current_pos()) == 0) {
10243 return HA_EXIT_SUCCESS;
10244 }
10245
10246 /*
10247 Deleting entries from secondary index should skip locking, but
10248 be visible to the transaction.
10249 (also note that DDL statements do not delete rows, so this is not a DDL
10250 statement)
10251 */
10252 old_key_slice = rocksdb::Slice(
10253 reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
10254
10255 row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10256 old_key_slice);
10257
10258 bytes_written = old_key_slice.size();
10259 }
10260
10261 new_key_slice = rocksdb::Slice(
10262 reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
10263 new_value_slice =
10264 rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
10265 m_sk_tails.get_current_pos());
10266
10267 if (bulk_load_sk && row_info.old_data == nullptr) {
10268 rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
10269 } else {
10270 row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
10271 new_value_slice);
10272 }
10273
10274 row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() +
10275 new_value_slice.size());
10276
10277 return rc;
10278 }
10279
10280 /**
10281 Update existing indexes(PK/SKs) or write new indexes(PK/SKs)
10282
10283 @param[in] row_info hold all row data, such as old key/new key
10284 @param[in] pk_changed whether primary key is changed
10285 @return
10286 HA_EXIT_SUCCESS OK
10287 Other HA_ERR error code (can be SE-specific)
10288 */
10289 int ha_rocksdb::update_write_indexes(const struct update_row_info &row_info,
10290 const bool pk_changed) {
10291 int rc;
10292 bool bulk_load_sk;
10293
10294 // The PK must be updated first to pull out the TTL value.
10295 rc = update_write_pk(*m_pk_descr, row_info, pk_changed);
10296 if (rc != HA_EXIT_SUCCESS) {
10297 return rc;
10298 }
10299
10300 // Update the remaining indexes. Allow bulk loading only if
10301 // allow_sk is enabled
10302 bulk_load_sk = rocksdb_enable_bulk_load_api &&
10303 THDVAR(table->in_use, bulk_load) &&
10304 THDVAR(table->in_use, bulk_load_allow_sk);
10305 for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
10306 if (is_pk(key_id, table, m_tbl_def)) {
10307 continue;
10308 }
10309
10310 rc = update_write_sk(table, *m_key_descr_arr[key_id], row_info,
10311 bulk_load_sk);
10312 if (rc != HA_EXIT_SUCCESS) {
10313 return rc;
10314 }
10315 }
10316
10317 return HA_EXIT_SUCCESS;
10318 }
10319
10320 /**
10321 Update an existing row or write a new row
10322
10323 @param[in] old_data nullptr for write, non-null for update
10324 @param[in] new_data non-null for write/update
10325 @param[in] skip_unique_check whether to check uniqueness
10326 @return
10327 HA_EXIT_SUCCESS OK
10328 Other HA_ERR error code (can be SE-specific)
10329 */
10330 int ha_rocksdb::update_write_row(const uchar *const old_data,
10331 const uchar *const new_data,
10332 const bool skip_unique_check) {
10333 DBUG_ENTER_FUNC();
10334
10335 THD *thd = ha_thd();
10336 if (thd && thd->killed) {
10337 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10338 }
10339
10340 bool pk_changed = false;
10341 struct update_row_info row_info;
10342
10343 row_info.old_data = old_data;
10344 row_info.new_data = new_data;
10345 row_info.skip_unique_check = skip_unique_check;
10346 row_info.new_pk_unpack_info = nullptr;
10347 set_last_rowkey(old_data);
10348
10349 row_info.tx = get_or_create_tx(table->in_use);
10350
10351 if (old_data != nullptr) {
10352 row_info.old_pk_slice =
10353 rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10354
10355 /* Determine which indexes need updating. */
10356 calc_updated_indexes();
10357 }
10358
10359 /*
10360 Get the new row key into row_info.new_pk_slice
10361 */
10362 int rc = get_pk_for_update(&row_info);
10363 if (rc != HA_EXIT_SUCCESS) {
10364 DBUG_RETURN(rc);
10365 }
10366
10367 /*
10368 For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
10369 always require locking.
10370 */
10371 if (row_info.old_pk_slice.size() > 0) {
10372 pk_changed = row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0;
10373 }
10374
10375 if (!skip_unique_check) {
10376 /*
10377 Check to see if we are going to have failures because of unique
10378 keys. Also lock the appropriate key values.
10379 */
10380 rc = check_uniqueness_and_lock(row_info, pk_changed);
10381 if (rc != HA_EXIT_SUCCESS) {
10382 DBUG_RETURN(rc);
10383 }
10384 }
10385
10386 DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");
10387
10388 /*
10389 At this point, all locks have been obtained, and all checks for duplicate
10390 keys have been performed. No further errors can be allowed to occur from
10391 here because updates to the transaction will be made and those updates
10392 cannot be easily removed without rolling back the entire transaction.
10393 */
10394 rc = update_write_indexes(row_info, pk_changed);
10395 if (rc != HA_EXIT_SUCCESS) {
10396 DBUG_RETURN(rc);
10397 }
10398
10399 if (old_data != nullptr) {
10400 row_info.tx->incr_update_count();
10401 } else {
10402 row_info.tx->incr_insert_count();
10403 }
10404
10405 row_info.tx->log_table_write_op(m_tbl_def);
10406
10407 if (do_bulk_commit(row_info.tx)) {
10408 DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10409 }
10410
10411 DBUG_RETURN(HA_EXIT_SUCCESS);
10412 }
10413
10414 /*
10415 Setting iterator upper/lower bounds for Seek/SeekForPrev.
10416 This makes RocksDB to avoid scanning tombstones outside of
10417 the given key ranges, when prefix_same_as_start=true was not passed
10418 (when prefix bloom filter can not be used).
10419 Inversing upper/lower bound is necessary on reverse order CF.
10420 This covers HA_READ_PREFIX_LAST* case as well. For example,
10421 if given query eq condition was 12 bytes and condition was
10422 0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST,
10423 eq_cond_len was 11 (see calc_eq_cond_len() for details).
10424 If the index was reverse order, upper bound would be
10425 0x0000b3eb003f65c5e78857, and lower bound would be
10426 0x0000b3eb003f65c5e78859. These cover given eq condition range.
10427
10428 @param lower_bound_buf IN Buffer for lower bound
10429 @param upper_bound_buf IN Buffer for upper bound
10430
10431 @param outer_u
10432 */
10433 void ha_rocksdb::setup_iterator_bounds(
10434 const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len,
10435 uchar *const lower_bound, uchar *const upper_bound,
10436 rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
10437 // If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be
10438 // able to get better bounds just by using index id directly.
10439 if (eq_cond.size() <= Rdb_key_def::INDEX_NUMBER_SIZE) {
10440 DBUG_ASSERT(bound_len == Rdb_key_def::INDEX_NUMBER_SIZE);
10441 uint size;
10442 kd.get_infimum_key(lower_bound, &size);
10443 DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10444 kd.get_supremum_key(upper_bound, &size);
10445 DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10446 } else {
10447 DBUG_ASSERT(bound_len <= eq_cond.size());
10448 memcpy(upper_bound, eq_cond.data(), bound_len);
10449 kd.successor(upper_bound, bound_len);
10450 memcpy(lower_bound, eq_cond.data(), bound_len);
10451 kd.predecessor(lower_bound, bound_len);
10452 }
10453
10454 if (kd.m_is_reverse_cf) {
10455 *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10456 *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10457 } else {
10458 *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10459 *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10460 }
10461 }
10462
10463 /*
10464 Open a cursor
10465 */
10466
10467 void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
10468 rocksdb::Slice *const slice,
10469 const bool use_all_keys,
10470 const uint eq_cond_len) {
10471 DBUG_ASSERT(slice->size() >= eq_cond_len);
10472
10473 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10474
10475 bool skip_bloom = true;
10476
10477 const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
10478 // The size of m_scan_it_lower_bound (and upper) is technically
10479 // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers. Rather
10480 // than recalculating that number, we pass in the max of eq_cond_len and
10481 // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
10482 // max_packed_sk_len, hence ensuring no buffer overrun.
10483 //
10484 // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
10485 // used.
10486 if (check_bloom_and_set_bounds(
10487 ha_thd(), kd, eq_cond, use_all_keys,
10488 std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_NUMBER_SIZE),
10489 m_scan_it_lower_bound, m_scan_it_upper_bound,
10490 &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
10491 skip_bloom = false;
10492 }
10493
10494 /*
10495 In some cases, setup_scan_iterator() is called multiple times from
10496 the same query but bloom filter can not always be used.
10497 Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
10498 (id1, id2).
10499 select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
10500 '100');
10501 In this case, setup_scan_iterator() is called twice, the first time is for
10502 (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
10503 '100').
10504 If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
10505 for the
10506 first condition but not for the second condition.
10507 If bloom filter condition is changed, currently it is necessary to destroy
10508 and
10509 re-create Iterator.
10510 */
10511 if (m_scan_it_skips_bloom != skip_bloom) {
10512 release_scan_iterator();
10513 }
10514
10515 /*
10516 SQL layer can call rnd_init() multiple times in a row.
10517 In that case, re-use the iterator, but re-position it at the table start.
10518 */
10519 if (!m_scan_it) {
10520 const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
10521 if (commit_in_the_middle()) {
10522 DBUG_ASSERT(m_scan_it_snapshot == nullptr);
10523 m_scan_it_snapshot = rdb->GetSnapshot();
10524
10525 auto read_opts = rocksdb::ReadOptions();
10526 // TODO(mung): set based on WHERE conditions
10527 read_opts.total_order_seek = true;
10528 read_opts.snapshot = m_scan_it_snapshot;
10529 m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
10530 } else {
10531 m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
10532 m_scan_it_lower_bound_slice,
10533 m_scan_it_upper_bound_slice);
10534 }
10535 m_scan_it_skips_bloom = skip_bloom;
10536 }
10537 }
10538
10539 void ha_rocksdb::release_scan_iterator() {
10540 delete m_scan_it;
10541 m_scan_it = nullptr;
10542
10543 if (m_scan_it_snapshot) {
10544 rdb->ReleaseSnapshot(m_scan_it_snapshot);
10545 m_scan_it_snapshot = nullptr;
10546 }
10547 }
10548
10549 void ha_rocksdb::setup_iterator_for_rnd_scan() {
10550 uint key_size;
10551
10552 int key_start_matching_bytes =
10553 m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
10554
10555 rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
10556
10557 setup_scan_iterator(*m_pk_descr, &table_key, false, key_start_matching_bytes);
10558 m_scan_it->Seek(table_key);
10559 m_skip_scan_it_next_call = true;
10560 }
10561
10562 /**
10563 @return
10564 HA_EXIT_SUCCESS OK
10565 other HA_ERR error code (can be SE-specific)
10566 */
10567 int ha_rocksdb::rnd_init(bool scan) {
10568 DBUG_ENTER_FUNC();
10569
10570 THD *thd = ha_thd();
10571 if (thd && thd->killed) {
10572 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10573 }
10574
10575 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10576
10577 // when this table is being updated, decode all fields
10578 m_converter->setup_field_decoders(table->read_set,
10579 m_lock_rows == RDB_LOCK_WRITE);
10580
10581 if (scan) {
10582 m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
10583 setup_iterator_for_rnd_scan();
10584 } else {
10585 /* We don't need any preparations for rnd_pos() calls. */
10586 }
10587
10588 // If m_lock_rows is on then we will be doing a get_for_update when accessing
10589 // the index, so don't acquire the snapshot right away. Otherwise acquire
10590 // the snapshot immediately.
10591 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10592
10593 DBUG_RETURN(HA_EXIT_SUCCESS);
10594 }
10595
10596 /**
10597 @return
10598 HA_EXIT_SUCCESS OK
10599 other HA_ERR error code (can be SE-specific)
10600 */
10601 int ha_rocksdb::rnd_next(uchar *const buf) {
10602 DBUG_ENTER_FUNC();
10603
10604 int rc;
10605 for (;;) {
10606 rc = rnd_next_with_direction(buf, true);
10607 if (!should_recreate_snapshot(rc, m_rnd_scan_is_new_snapshot)) {
10608 break; /* exit the loop */
10609 }
10610 // release the snapshot and iterator and then regenerate them
10611 Rdb_transaction *tx = get_or_create_tx(table->in_use);
10612 tx->release_snapshot();
10613 release_scan_iterator();
10614 setup_iterator_for_rnd_scan();
10615 }
10616
10617 m_rnd_scan_is_new_snapshot = false;
10618
10619 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
10620
10621 DBUG_RETURN(rc);
10622 }
10623
10624 /*
10625 See also secondary_index_read().
10626 */
10627 int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
10628 DBUG_ENTER_FUNC();
10629
10630 int rc;
10631 THD *thd = ha_thd();
10632
10633 table->status = STATUS_NOT_FOUND;
10634 #ifdef MARIAROCKS_NOT_YET
10635 stats.rows_requested++;
10636 #endif
10637 if (!m_scan_it || !is_valid(m_scan_it)) {
10638 /*
10639 We can get here when SQL layer has called
10640
10641 h->index_init(PRIMARY);
10642 h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
10643
10644 In this case, we should return EOF.
10645 */
10646 DBUG_RETURN(HA_ERR_END_OF_FILE);
10647 }
10648
10649 for (;;) {
10650 DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd");
10651 if (thd && thd->killed) {
10652 rc = HA_ERR_QUERY_INTERRUPTED;
10653 break;
10654 }
10655
10656 if (m_skip_scan_it_next_call) {
10657 m_skip_scan_it_next_call = false;
10658 } else {
10659 if (move_forward) {
10660 m_scan_it->Next(); /* this call cannot fail */
10661 } else {
10662 m_scan_it->Prev(); /* this call cannot fail */
10663 }
10664 }
10665
10666 if (!is_valid(m_scan_it)) {
10667 rc = HA_ERR_END_OF_FILE;
10668 break;
10669 }
10670
10671 /* check if we're out of this table */
10672 const rocksdb::Slice key = m_scan_it->key();
10673 if (!m_pk_descr->covers_key(key)) {
10674 rc = HA_ERR_END_OF_FILE;
10675 break;
10676 }
10677
10678 if (m_lock_rows != RDB_LOCK_NONE) {
10679 /*
10680 Lock the row we've just read.
10681
10682 Now we call get_for_update which will 1) Take a lock and 2) Will fail
10683 if the row was deleted since the snapshot was taken.
10684 */
10685 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10686 DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
10687
10688 if (m_pk_descr->has_ttl() &&
10689 should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
10690 tx->m_snapshot_timestamp)) {
10691 continue;
10692 }
10693
10694 const rocksdb::Status s =
10695 get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
10696 if (s.IsNotFound() &&
10697 should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) {
10698 continue;
10699 }
10700
10701 if (!s.ok()) {
10702 DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
10703 m_tbl_def, m_table_handler));
10704 }
10705
10706 // If we called get_for_update() use the value from that call not from
10707 // the iterator as it may be stale since we don't have a snapshot
10708 // when m_lock_rows is not RDB_LOCK_NONE.
10709 m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
10710 rc = convert_record_from_storage_format(&key, buf);
10711 } else {
10712 // Use the value from the iterator
10713 rocksdb::Slice value = m_scan_it->value();
10714
10715 if (m_pk_descr->has_ttl() &&
10716 should_hide_ttl_rec(
10717 *m_pk_descr, value,
10718 get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
10719 continue;
10720 }
10721
10722 m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
10723 rc = convert_record_from_storage_format(&key, &value, buf);
10724 }
10725
10726 table->status = 0;
10727 break;
10728 }
10729
10730 if (!rc) {
10731 #ifdef MARIAROCKS_NOT_YET
10732 stats.rows_read++;
10733 stats.rows_index_next++;
10734 #endif
10735 update_row_stats(ROWS_READ);
10736 }
10737
10738 DBUG_RETURN(rc);
10739 }
10740
10741 int ha_rocksdb::rnd_end() {
10742 DBUG_ENTER_FUNC();
10743
10744 release_scan_iterator();
10745
10746 DBUG_RETURN(HA_EXIT_SUCCESS);
10747 }
10748
10749 /**
10750 @return
10751 HA_EXIT_SUCCESS OK
10752 other HA_ERR error code (can be SE-specific)
10753 */
10754 int ha_rocksdb::index_init(uint idx, bool sorted) {
10755 DBUG_ENTER_FUNC();
10756
10757 THD *thd = ha_thd();
10758 if (thd && thd->killed) {
10759 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10760 }
10761
10762 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10763 DBUG_ASSERT(tx != nullptr);
10764
10765 m_start_range= NULL;
10766 // when this table is being updated, decode all fields
10767 m_converter->setup_field_decoders(table->read_set,
10768 m_lock_rows == RDB_LOCK_WRITE);
10769
10770 if (!m_keyread_only) {
10771 m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap);
10772 }
10773
10774 // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
10775 // when accessing the index, so don't acquire the snapshot right away.
10776 // Otherwise acquire the snapshot immediately.
10777 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10778
10779 active_index = idx;
10780
10781 DBUG_RETURN(HA_EXIT_SUCCESS);
10782 }
10783
10784 /**
10785 @return
10786 HA_EXIT_SUCCESS OK
10787 */
10788 int ha_rocksdb::index_end() {
10789 DBUG_ENTER_FUNC();
10790
10791 release_scan_iterator();
10792
10793 bitmap_free(&m_lookup_bitmap);
10794
10795 active_index = MAX_KEY;
10796 in_range_check_pushed_down = FALSE;
10797 m_start_range= NULL;
10798
10799 DBUG_RETURN(HA_EXIT_SUCCESS);
10800 }
10801
10802 /**
10803 @return
10804 HA_EXIT_SUCCESS OK
10805 other HA_ERR error code (can be SE-specific)
10806 */
10807 int ha_rocksdb::truncate() {
10808 DBUG_ENTER_FUNC();
10809
10810 DBUG_ASSERT(m_tbl_def != nullptr);
10811
10812 // Save table name to use later
10813 std::string table_name = m_tbl_def->full_tablename();
10814
10815 // Delete current table
10816 int err = delete_table(m_tbl_def);
10817 if (err != HA_EXIT_SUCCESS) {
10818 DBUG_RETURN(err);
10819 }
10820
10821 // Reset auto_increment_value to 1 if auto-increment feature is enabled
10822 // By default, the starting valid value for auto_increment_value is 1
10823 DBUG_RETURN(create_table(
10824 table_name, table,
10825 table->found_next_number_field ? 1 : 0 /* auto_increment_value */));
10826 }
10827
10828 /*
10829 Delete the row we've last read. The row is also passed as parameter.
10830
10831 @detail
10832 The caller guarantees table buf points to the row that was just read.
10833 The row is either table->record[0] or table->record[1].
10834 (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)"
10835
10836 @return
10837 HA_EXIT_SUCCESS OK
10838 other HA_ERR error code (can be SE-specific)
10839 */
10840 int ha_rocksdb::delete_row(const uchar *const buf) {
10841 DBUG_ENTER_FUNC();
10842
10843 DBUG_ASSERT(buf != nullptr);
10844
10845 set_last_rowkey(buf);
10846
10847 rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10848 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10849 ulonglong bytes_written = 0;
10850
10851 const uint index = pk_index(table, m_tbl_def);
10852 rocksdb::Status s =
10853 delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
10854 if (!s.ok()) {
10855 DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
10856 m_table_handler));
10857 } else {
10858 bytes_written = key_slice.size();
10859 }
10860
10861 longlong hidden_pk_id = 0;
10862 if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) {
10863 int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
10864 if (err) {
10865 DBUG_RETURN(err);
10866 }
10867 }
10868
10869 // Delete the record for every secondary index
10870 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10871 if (!is_pk(i, table, m_tbl_def)) {
10872 int packed_size;
10873 const Rdb_key_def &kd = *m_key_descr_arr[i];
10874 packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
10875 nullptr, false, hidden_pk_id);
10876 rocksdb::Slice secondary_key_slice(
10877 reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
10878 /* Deleting on secondary key doesn't need any locks: */
10879 tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10880 secondary_key_slice);
10881 bytes_written += secondary_key_slice.size();
10882 }
10883 }
10884
10885 tx->incr_delete_count();
10886 tx->log_table_write_op(m_tbl_def);
10887
10888 if (do_bulk_commit(tx)) {
10889 DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10890 }
10891 #ifdef MARIAROCKS_NOT_YET
10892 stats.rows_deleted++;
10893 #endif
10894 update_row_stats(ROWS_DELETED);
10895 tx->update_bytes_written(bytes_written);
10896
10897 DBUG_RETURN(HA_EXIT_SUCCESS);
10898 }
10899
10900 rocksdb::Status ha_rocksdb::delete_or_singledelete(
10901 uint index, Rdb_transaction *const tx,
10902 rocksdb::ColumnFamilyHandle *const column_family,
10903 const rocksdb::Slice &key) {
10904 const bool assume_tracked = can_assume_tracked(ha_thd());
10905 if (can_use_single_delete(index)) {
10906 return tx->single_delete(column_family, key, assume_tracked);
10907 }
10908 return tx->delete_key(column_family, key, assume_tracked);
10909 }
10910
10911 void ha_rocksdb::update_stats(void) {
10912 DBUG_ENTER_FUNC();
10913
10914 stats.records = 0;
10915 stats.index_file_length = 0ul;
10916 stats.data_file_length = 0ul;
10917 stats.mean_rec_length = 0;
10918
10919 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10920 if (is_pk(i, table, m_tbl_def)) {
10921 stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
10922 stats.records = m_pk_descr->m_stats.m_rows;
10923 } else {
10924 stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
10925 }
10926 }
10927
10928 DBUG_VOID_RETURN;
10929 }
10930
10931 /**
10932 @return
10933 HA_EXIT_SUCCESS OK
10934 HA_EXIT_FAILURE Error
10935 */
int ha_rocksdb::info(uint flag) {
  DBUG_ENTER_FUNC();

  // The handler can be invoked before a table object is attached.
  if (!table) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (flag & HA_STATUS_VARIABLE) {
    /*
      Test only to simulate corrupted stats
    */
    DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
                    m_pk_descr->m_stats.m_actual_disk_size =
                        -m_pk_descr->m_stats.m_actual_disk_size;);

    update_stats();

    /*
      If any stats are negative due to bad cached stats, re-run analyze table
      and re-retrieve the stats.
    */
    if (static_cast<longlong>(stats.data_file_length) < 0 ||
        static_cast<longlong>(stats.index_file_length) < 0 ||
        static_cast<longlong>(stats.records) < 0) {
      if (calculate_stats_for_table()) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      update_stats();
    }

    // if number of records is hardcoded, we do not want to force computation
    // of memtable cardinalities
    if (stats.records == 0 || (rocksdb_force_compute_memtable_stats &&
                               rocksdb_debug_optimizer_n_rows == 0)) {
      // First, compute SST files stats
      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = get_range(pk_index(table, m_tbl_def), buf);
      uint64_t sz = 0;
      uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
      // recompute SST files stats only if records count is 0
      if (stats.records == 0) {
        // Estimate the row count by dividing the approximate on-disk size
        // by an assumed average key+value size; this is a rough estimate.
        rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz,
                                 include_flags);
        stats.records += sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
        stats.data_file_length += sz;
      }
      // Second, compute memtable stats. This call is expensive, so cache
      // values computed for some time.
      uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime;
      uint64_t time = (cachetime == 0) ? 0 : my_interval_timer() / 1000;
      if (cachetime == 0 ||
          time > m_table_handler->m_mtcache_last_update + cachetime) {
        uint64_t memtableCount;
        uint64_t memtableSize;

        // the stats below are calculated from skiplist wich is a probablistic
        // data structure, so the results vary between test runs
        // it also can return 0 for quite a large tables which means that
        // cardinality for memtable only indxes will be reported as 0
        rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r,
                                         &memtableCount, &memtableSize);

        // Atomically update all of these fields at the same time
        // The counter acts as an optimistic writer lock: only the thread
        // that raises it from 0 stores the new cached values; concurrent
        // updaters skip the store rather than race.
        if (cachetime > 0) {
          if (m_table_handler->m_mtcache_lock.fetch_add(
                  1, std::memory_order_acquire) == 0) {
            m_table_handler->m_mtcache_count = memtableCount;
            m_table_handler->m_mtcache_size = memtableSize;
            m_table_handler->m_mtcache_last_update = time;
          }
          m_table_handler->m_mtcache_lock.fetch_sub(1,
                                                    std::memory_order_release);
        }

        stats.records += memtableCount;
        stats.data_file_length += memtableSize;
      } else {
        // Cached data is still valid, so use it instead
        stats.records += m_table_handler->m_mtcache_count;
        stats.data_file_length += m_table_handler->m_mtcache_size;
      }

      // Do like InnoDB does. stats.records=0 confuses the optimizer
      if (stats.records == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) {
        stats.records++;
      }
    }

    // Debug override: pretend the table has exactly this many rows.
    if (rocksdb_debug_optimizer_n_rows > 0)
      stats.records = rocksdb_debug_optimizer_n_rows;

    if (stats.records != 0) {
      stats.mean_rec_length = stats.data_file_length / stats.records;
    }
  }

  if (flag & HA_STATUS_CONST) {
    // Fill in per-keypart cardinality (rec_per_key) for the optimizer.
    ref_length = m_pk_descr->max_storage_fmt_length();

    for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
      if (is_hidden_pk(i, table, m_tbl_def)) {
        continue;  // the hidden PK has no corresponding KEY object
      }
      KEY *const k = &table->key_info[i];
      for (uint j = 0; j < k->ext_key_parts; j++) {
        const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
        uint x;

        if (k_stats.m_distinct_keys_per_prefix.size() > j &&
            k_stats.m_distinct_keys_per_prefix[j] > 0) {
          // Average number of rows per distinct prefix of length j+1.
          x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j];
          /*
            If the number of rows is less than the number of prefixes (due to
            sampling), the average number of rows with the same prefix is 1.
          */
          if (x == 0) {
            x = 1;
          }
        } else {
          x = 0;  // no cardinality data collected for this prefix
        }
        if (x > stats.records) x = stats.records;
        if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
            rocksdb_debug_optimizer_n_rows > 0) {
          // Fake cardinality implementation. For example, (idx1, idx2, idx3)
          // index
          /*
            Make MariaRocks behave the same way as MyRocks does:
            1. SQL layer thinks that unique secondary indexes are not extended
               with PK columns (both in MySQL and MariaDB)
            2. MariaDB also thinks that indexes with partially-covered columns
               are not extended with PK columns. Use the same number of
               keyparts that MyRocks would use.
          */
          uint ext_key_parts2;
          if (k->flags & HA_NOSAME)
            ext_key_parts2= k->ext_key_parts;  // This is #1
          else
            ext_key_parts2= m_key_descr_arr[i]->get_key_parts();  // This is #2.

          // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1.
          // rec_per_key for the whole index is 1, and multiplied by 2^n if
          // n suffix columns of the index are not used.
          x = 1 << (ext_key_parts2 - j - 1);
        }
        k->rec_per_key[j] = x;
      }
    }

    stats.create_time = m_tbl_def->get_create_time();
  }

  if (flag & HA_STATUS_TIME) {
    stats.update_time = m_tbl_def->m_update_time;
  }

  if (flag & HA_STATUS_ERRKEY) {
    /*
      Currently we support only primary keys so we know which key had a
      uniqueness violation.
    */
    errkey = m_dupp_errkey;
    dup_ref = m_pk_tuple;  // TODO(?): this should store packed PK.
  }

  if (flag & HA_STATUS_AUTO) {
    stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11108
11109 void ha_rocksdb::position(const uchar *const record) {
11110 DBUG_ENTER_FUNC();
11111
11112 longlong hidden_pk_id = 0;
11113 if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
11114 DBUG_ASSERT(false); // should never reach here
11115 }
11116
11117 /*
11118 Get packed primary key value from the record.
11119
11120 (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
11121 handle most cases, but there is an exception to this: when slave applies
11122 RBR events, it fills the record and then calls position(); rnd_pos())
11123
11124 Also note that we *can* rely on PK column values being available. This is
11125 because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
11126 When that is true, table->prepare_for_position() adds PK columns into the
11127 read set (this may potentially disable index-only access when PK column
11128 cannot be restored from its mem-comparable form in the secondary indexes).
11129 */
11130 const uint packed_size = m_pk_descr->pack_record(
11131 table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);
11132
11133 /*
11134 It could be that mem-comparable form of PK occupies less than ref_length
11135 bytes. Fill the remainder with zeros.
11136 */
11137 if (ref_length > packed_size) {
11138 memset(ref + packed_size, 0, ref_length - packed_size);
11139 }
11140
11141 DBUG_VOID_RETURN;
11142 }
11143
11144 /**
11145 @return
11146 HA_EXIT_SUCCESS OK
11147 other HA_ERR error code (can be SE-specific)
11148 */
11149 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
11150 DBUG_ENTER_FUNC();
11151
11152 int rc;
11153 size_t len;
11154
11155 #ifdef MARIAROCKS_NOT_YET
11156 stats.rows_requested++;
11157 #endif
11158 len = m_pk_descr->key_length(table,
11159 rocksdb::Slice((const char *)pos, ref_length));
11160 if (len == size_t(-1)) {
11161 DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */
11162 }
11163
11164 rc = get_row_by_rowid(buf, pos, len);
11165
11166 if (!rc) {
11167 #ifdef MARIAROCKS_NOT_YET
11168 stats.rows_read++;
11169 #endif
11170 update_row_stats(ROWS_READ);
11171 }
11172
11173 DBUG_RETURN(rc);
11174 }
11175
11176 /*
11177 @brief
11178 Calculate (if needed) the bitmap of indexes that are modified by the
11179 current query
11180
11181 @detail
11182 The calculation done by checking index definitions against the
11183 table->write_set
11184 column bitmap.
11185 */
11186
11187 void ha_rocksdb::calc_updated_indexes() {
11188 if (!m_update_scope_is_valid) {
11189 m_update_scope_is_valid = true;
11190 m_update_scope.clear_all();
11191
11192 for (uint keynr = 0; keynr < table->s->keys; keynr++) {
11193 const Rdb_key_def &kd = *m_key_descr_arr[keynr];
11194 /* Walk over all key parts, including the "extended key" suffix */
11195 const uint key_parts = kd.get_key_parts();
11196 for (uint kp = 0; kp < key_parts; kp++) {
11197 if (has_hidden_pk(table) && kp + 1 == key_parts) break;
11198
11199 Field *const field = kd.get_table_field_for_part_no(table, kp);
11200 if (bitmap_is_set(table->write_set, field->field_index)) {
11201 m_update_scope.set_bit(keynr);
11202 break;
11203 }
11204 }
11205 }
11206 }
11207 }
11208
11209 /**
11210 Update an existing row
11211 @param[in] old_data nullptr for write, non-null for update
11212 @param[in] new_data non-null for write/update
11213 @return
11214 HA_EXIT_SUCCESS OK
11215 other HA_ERR error code (can be SE-specific)
11216 */
11217 int ha_rocksdb::update_row(const uchar *const old_data,
11218 const uchar *const new_data) {
11219 DBUG_ENTER_FUNC();
11220
11221 DBUG_ASSERT(old_data != nullptr);
11222 DBUG_ASSERT(new_data != nullptr);
11223 DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
11224 /*
11225 old_data points to record we're updating. It is the same as the record
11226 we've just read (for multi-table UPDATE, too, because SQL layer will make
11227 an rnd_pos() call to re-read the record before calling update_row())
11228 */
11229 DBUG_ASSERT(new_data == table->record[0]);
11230
11231 const int rv = update_write_row(old_data, new_data, skip_unique_check());
11232
11233 if (rv == 0) {
11234 #ifdef MARIAROCKS_NOT_YET
11235 stats.rows_updated++;
11236 #endif
11237 update_row_stats(ROWS_UPDATED);
11238 }
11239
11240 DBUG_RETURN(rv);
11241 }
11242
11243 /*
11244 MariaDB's temporary: MyRocks has this function in sql/handler.cc:
11245 */
11246
11247 bool can_hold_read_locks_on_select(THD *thd, thr_lock_type lock_type)
11248 {
11249 return (lock_type == TL_READ_WITH_SHARED_LOCKS
11250 || lock_type == TL_READ_NO_INSERT
11251 || (lock_type != TL_IGNORE
11252 && thd->lex->sql_command != SQLCOM_SELECT));
11253 }
11254
11255
11256 /* The following function was copied from ha_blackhole::store_lock: */
11257 THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
11258 enum thr_lock_type lock_type) {
11259 DBUG_ENTER_FUNC();
11260
11261 DBUG_ASSERT(thd != nullptr);
11262 DBUG_ASSERT(to != nullptr);
11263
11264 bool in_lock_tables = my_core::thd_in_lock_tables(thd);
11265
11266 /* First, make a decision about MyRocks's internal locking */
11267 if (lock_type >= TL_WRITE_ALLOW_WRITE) {
11268 m_lock_rows = RDB_LOCK_WRITE;
11269 } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
11270 m_lock_rows = RDB_LOCK_READ;
11271 } else if (lock_type != TL_IGNORE) {
11272 m_lock_rows = RDB_LOCK_NONE;
11273 if (THDVAR(thd, lock_scanned_rows)) {
11274 /*
11275 The following logic was copied directly from
11276 ha_innobase::store_lock_with_x_type() in
11277 storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
11278 locks in place on rows that are in a table that is not being updated.
11279 */
11280 const uint sql_command = my_core::thd_sql_command(thd);
11281 if ((lock_type == TL_READ && in_lock_tables) ||
11282 (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
11283 can_hold_read_locks_on_select(thd, lock_type)) {
11284 ulong tx_isolation = my_core::thd_tx_isolation(thd);
11285 if (sql_command != SQLCOM_CHECKSUM &&
11286 ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
11287 tx_isolation > ISO_READ_COMMITTED) ||
11288 tx_isolation == ISO_SERIALIZABLE ||
11289 (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
11290 (sql_command != SQLCOM_INSERT_SELECT &&
11291 sql_command != SQLCOM_REPLACE_SELECT &&
11292 sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
11293 sql_command != SQLCOM_CREATE_TABLE))) {
11294 m_lock_rows = RDB_LOCK_READ;
11295 }
11296 }
11297 }
11298 }
11299
11300 /* Then, tell the SQL layer what kind of locking it should use: */
11301 if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
11302 /*
11303 Here is where we get into the guts of a row level lock.
11304 If TL_UNLOCK is set
11305 If we are not doing a LOCK TABLE or DISCARD/IMPORT
11306 TABLESPACE, then allow multiple writers
11307 */
11308
11309 if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
11310 !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
11311 lock_type = TL_WRITE_ALLOW_WRITE;
11312 }
11313
11314 /*
11315 In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
11316 MySQL would use the lock TL_READ_NO_INSERT on t2, and that
11317 would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
11318 to t2. Convert the lock to a normal read lock to allow
11319 concurrent inserts to t2.
11320 */
11321
11322 if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
11323 lock_type = TL_READ;
11324 }
11325
11326 m_db_lock.type = lock_type;
11327 }
11328
11329 *to++ = &m_db_lock;
11330
11331 DBUG_RETURN(to);
11332 }
11333
11334 void ha_rocksdb::read_thd_vars(THD *const thd) {
11335 m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
11336 m_converter->set_verify_row_debug_checksums(
11337 THDVAR(thd, verify_row_debug_checksums));
11338 m_checksums_pct = THDVAR(thd, checksums_pct);
11339 }
11340
11341 ulonglong ha_rocksdb::table_flags() const
11342 {
11343 DBUG_ENTER_FUNC();
11344
11345 /*
11346 HA_BINLOG_STMT_CAPABLE
11347 Upstream: MyRocks advertises itself as it supports SBR, but has additional
11348 checks in ha_rocksdb::external_lock()/ start_stmt() which will return an
11349 error if one tries to run the statement.
11350 Exceptions: @@rocksdb_unsafe_for_binlog or we are an SQL slave thread.
11351
11352 MariaDB: Inform the upper layer we don't support SBR, so it switches to RBR
11353 if possible. The exceptions are the same as with the upstream.
11354
11355 HA_REC_NOT_IN_SEQ
11356 If we don't set it, filesort crashes, because it assumes rowids are
11357 1..8 byte numbers
11358 HA_PRIMARY_KEY_IN_READ_INDEX
11359 This flag is always set, even for tables that:
11360 - have no PK
11361 - have some (or all) of PK that can't be decoded from the secondary
11362 index.
11363 */
11364 THD *thd= ha_thd();
11365 DBUG_RETURN(HA_BINLOG_ROW_CAPABLE |
11366 ((thd && (THDVAR(thd, unsafe_for_binlog) ||thd->rgi_slave))?
11367 HA_BINLOG_STMT_CAPABLE : 0) |
11368 HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
11369 HA_PRIMARY_KEY_IN_READ_INDEX |
11370 HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
11371 HA_PARTIAL_COLUMN_READ |
11372 HA_TABLE_SCAN_ON_INDEX);
11373 }
11374
11375
11376
11377 /**
11378 @return
11379 HA_EXIT_SUCCESS OK
11380 other HA_ERR error code (cannot be SE-specific)
11381 */
int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);

  int res = HA_EXIT_SUCCESS;
#if 0
  // MariaDB uses a different way to implement this, see ha_rocksdb::table_flags

  int binlog_format = my_core::thd_binlog_format(thd);
  bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);

  /*
    A note about (*) below: In fb/mysql, LOCK TABLE myrocks_table WRITE
    - produces an error when @@binlog_format='ROW'.
    - does not produce an error with @@binlog_format='STATEMENT'

    We cannot easily provide this behavior in MariaDB, because LOCK ROWS
    changes @@binlog_format for the duration of the statement.

    Instead we will make the check in ha_rocksdb::start_stmt and will produce
    an error in the first DML statement done under LOCK TABLE

    (**) - The same happens for other "DDL-type" commands, so allow them here
    as well.
  */
  if (lock_type == F_WRLCK && !thd->rgi_slave && !unsafe_for_binlog &&
      binlog_format != BINLOG_FORMAT_ROW &&
      binlog_format != BINLOG_FORMAT_UNSPEC &&
      thd->lex->sql_command != SQLCOM_LOCK_TABLES &&  // (*)
      thd->lex->sql_command != SQLCOM_ANALYZE &&      // (**)
      thd->lex->sql_command != SQLCOM_OPTIMIZE &&     // (**)
      thd->lex->sql_command != SQLCOM_FLUSH &&        // (**)
      my_core::thd_binlog_filter_ok(thd)) {
    my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }
#endif

  if (lock_type == F_UNLCK) {
    // Statement (or LOCK TABLES scope) is ending for this table.
    Rdb_transaction *const tx = get_tx_from_thd(thd);

    if (tx) {
      tx->io_perf_end_and_record(&m_io_perf);
      tx->m_n_mysql_tables_in_use--;
      if (tx->m_n_mysql_tables_in_use == 0 &&
          !my_core::thd_test_options(thd,
                                     OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
        /*
          Do like InnoDB: when we get here, it's time to commit a
          single-statement transaction.

          If the statement involved multiple tables, this code will be executed
          for each of them, but that's ok because non-first tx->commit() calls
          will be no-ops.
        */
        if (tx->commit_or_rollback()) {
          res = HA_ERR_INTERNAL_ERROR;
        }
      }
    }
  } else {
    // Only READ COMMITTED and REPEATABLE READ are supported.
    if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
        my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
      my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0),
               tx_isolation_names[my_core::thd_tx_isolation(thd)]);
      DBUG_RETURN(HA_ERR_UNSUPPORTED);
    }
    /*
      It's nice to do the following on start of every statement. The problem
      is, handler->start_stmt() is not called for INSERTs.
      So, we put this code here.
    */
    Rdb_transaction *const tx = get_or_create_tx(thd);
    read_thd_vars(thd);

    // Force calc_updated_indexes() to recompute for the new statement.
    m_update_scope_is_valid = false;

    if (skip_unique_check()) {
      // Unique checks are disabled: ON DUPLICATE KEY / REPLACE semantics
      // cannot be honored, so reject such statements outright.
      if ((thd->lex->sql_command == SQLCOM_INSERT ||
           thd->lex->sql_command == SQLCOM_LOAD ||
           thd->lex->sql_command == SQLCOM_REPLACE) &&
          (thd->lex->duplicates == DUP_REPLACE ||
           thd->lex->duplicates == DUP_UPDATE)) {
        my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query());
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
    }

    if (lock_type == F_WRLCK) {
      if (tx->is_tx_read_only()) {
        my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }

#ifdef MARIADB_NOT_YET
      if (thd->get_explicit_snapshot()) {
        my_error(ER_UPDATES_WITH_EXPLICIT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
#endif

      /*
        SQL layer signals us to take a write lock. It does so when starting DML
        statement. We should put locks on the rows we're reading.

        Note: sometimes, external_lock() can be called without a prior
        ::store_lock call. That's why we need to set lock_* members here, too.
      */
      m_lock_rows = RDB_LOCK_WRITE;

      if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
          thd->lex->sql_command == SQLCOM_DROP_INDEX ||
          thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
        tx->m_ddl_transaction = true;
      }
    }
    tx->m_n_mysql_tables_in_use++;
    rocksdb_register_tx(rocksdb_hton, thd, tx);
    tx->io_perf_start(&m_io_perf);
  }

  DBUG_RETURN(res);
}
11506
11507 /**
11508 @note
11509 A quote from ha_innobase::start_stmt():
11510 <quote>
11511 MySQL calls this function at the start of each SQL statement inside LOCK
11512 TABLES. Inside LOCK TABLES the ::external_lock method does not work to
11513 mark SQL statement borders.
11514 </quote>
11515
11516 @return
11517 HA_EXIT_SUCCESS OK
11518 */
11519
int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);

  // Make sure a transaction exists for this session, refresh the cached
  // session variables and (re)register the transaction with the server so
  // it learns about this statement's start.
  Rdb_transaction *const tx = get_or_create_tx(thd);
  read_thd_vars(thd);
  rocksdb_register_tx(ht, thd, tx);
  tx->io_perf_start(&m_io_perf);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11532
11533 rocksdb::Range get_range(uint32_t i,
11534 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11535 int offset1, int offset2) {
11536 uchar *buf_begin = buf;
11537 uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
11538 rdb_netbuf_store_index(buf_begin, i + offset1);
11539 rdb_netbuf_store_index(buf_end, i + offset2);
11540
11541 return rocksdb::Range(
11542 rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
11543 rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
11544 }
11545
// Convenience overload: build the range from a key definition's index number.
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
11551
11552 rocksdb::Range get_range(const Rdb_key_def &kd,
11553 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
11554 if (kd.m_is_reverse_cf) {
11555 return myrocks::get_range(kd, buf, 1, 0);
11556 } else {
11557 return myrocks::get_range(kd, buf, 0, 1);
11558 }
11559 }
11560
// Range covering all keys of this handler's i-th index.
rocksdb::Range ha_rocksdb::get_range(
    const int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
11565
11566 /*
11567 This function is called with total_order_seek=true, but
11568 upper/lower bound setting is not necessary.
11569 Boundary set is useful when there is no matching key,
11570 but in drop_index_thread's case, it means index is marked as removed,
11571 so no further seek will happen for the index id.
11572 */
11573 static bool is_myrocks_index_empty(rocksdb::ColumnFamilyHandle *cfh,
11574 const bool is_reverse_cf,
11575 const rocksdb::ReadOptions &read_opts,
11576 const uint index_id) {
11577 bool index_removed = false;
11578 uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
11579 rdb_netbuf_store_uint32(key_buf, index_id);
11580 const rocksdb::Slice key =
11581 rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf));
11582 std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
11583 rocksdb_smart_seek(is_reverse_cf, it.get(), key);
11584 if (!it->Valid()) {
11585 index_removed = true;
11586 } else {
11587 if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE)) {
11588 // Key does not have same prefix
11589 index_removed = true;
11590 }
11591 }
11592 return index_removed;
11593 }
11594
11595 /*
11596 Drop index thread's main logic
11597 */
11598
void Rdb_drop_index_thread::run() {
  // Invariant: m_signal_mutex is held at the top of every loop iteration
  // and released only around the expensive RocksDB work below.
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);

  for (;;) {
    // The stop flag might be set by shutdown command
    // after drop_index_thread releases signal_mutex
    // (i.e. while executing expensive Seek()). To prevent drop_index_thread
    // from entering long cond_timedwait, checking if stop flag
    // is true or not is needed, with drop_index_interrupt_mutex held.
    if (m_stop) {
      break;
    }

    // Wake up once a day when idle, once a minute while there are
    // pending index drops to purge.
    timespec ts;
    int sec= dict_manager.is_drop_index_empty()
                 ? 24 * 60 * 60  // no filtering
                 : 60;           // filtering
    set_timespec(ts,sec);

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    std::unordered_set<GL_INDEX_ID> indices;
    dict_manager.get_ongoing_drop_indexes(&indices);
    if (!indices.empty()) {
      std::unordered_set<GL_INDEX_ID> finished;
      rocksdb::ReadOptions read_opts;
      read_opts.total_order_seek = true;  // disable bloom filter

      for (const auto d : indices) {
        uint32 cf_flags = 0;
        if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "RocksDB: Failed to get column family flags "
              "from cf id %u. MyRocks data dictionary may "
              "get corrupted.",
              d.cf_id);
          if (rocksdb_ignore_datadic_errors)
          {
            sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
                            "trying to continue");
            continue;
          }
          abort();
        }
        rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id);
        DBUG_ASSERT(cfh);
        const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;

        // Purge the dropped index's key range: first drop whole SST files
        // that fall entirely inside the range (cheap), then compact the
        // remainder away.
        uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
        rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
                                         is_reverse_cf ? 0 : 1);
        rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
                                                    &range.start, &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start,
                                   &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        // Only mark the drop finished once no keys remain for the index.
        if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) {
          finished.insert(d);
        }
      }

      if (!finished.empty()) {
        dict_manager.finish_drop_indexes(finished);
      }
    }
    // Re-acquire before looping back to the m_stop check / cond wait.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
}
11688
11689 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
11690 std::string str;
11691 if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) {
11692 // We were not passed table name?
11693 DBUG_ASSERT(0);
11694 return nullptr;
11695 }
11696
11697 return ddl_manager.find(str);
11698 }
11699
11700 /*
11701 Overload func for delete table ---it deletes table meta data in data
11702 dictionary immediately and delete real data in background thread(async)
11703
11704 @param tbl IN MyRocks table definition
11705
11706 @return
11707 HA_EXIT_SUCCESS OK
11708 other HA_ERR error code (can be SE-specific)
11709 */
11710 int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) {
11711 DBUG_ENTER_FUNC();
11712
11713 DBUG_ASSERT(tbl != nullptr);
11714 DBUG_ASSERT(m_tbl_def == nullptr || m_tbl_def == tbl);
11715
11716 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11717 rocksdb::WriteBatch *const batch = wb.get();
11718
11719 dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);
11720
11721 /*
11722 Remove the table entry in data dictionary (this will also remove it from
11723 the persistent data dictionary).
11724 */
11725 ddl_manager.remove(tbl, batch, true);
11726
11727 int err = dict_manager.commit(batch);
11728 if (err) {
11729 DBUG_RETURN(err);
11730 }
11731
11732 rdb_drop_idx_thread.signal();
11733 // avoid dangling pointer
11734 m_tbl_def = nullptr;
11735 DBUG_RETURN(HA_EXIT_SUCCESS);
11736 }
11737
11738 /*
11739 Note: the following function is called when the table is not open. That is,
11740 this->table==nullptr, pk_key_descr==nullptr, etc.
11741
11742 tablename points to line in form "./dbname/tablename".
11743
11744 @return
11745 HA_EXIT_SUCCESS OK
11746 other HA_ERR error code (can be SE-specific)
11747 */
11748
11749 int ha_rocksdb::delete_table(const char *const tablename) {
11750 DBUG_ENTER_FUNC();
11751
11752 DBUG_ASSERT(tablename != nullptr);
11753
11754 /* Find the table in the hash */
11755 Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
11756 if (!tbl) {
11757 DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
11758 }
11759
11760 DBUG_RETURN(delete_table(tbl));
11761 }
11762
// Delete every row of every index of 'tbl' directly through the base DB
// (bypassing the transaction), e.g. for TRUNCATE-style operations.
int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
  const rocksdb::WriteOptions wo =
      rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  opts.total_order_seek = true;  // scan all keys; bloom filters don't help here
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  ulonglong bytes_written = 0;

  uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  rocksdb::Slice lower_bound_slice;
  rocksdb::Slice upper_bound_slice;

  /*
    Remove all records in each index.
    (This is is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on rocksdb side)
  */
  for (uint i = 0; i < tbl->m_key_count; i++) {
    const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
    // The infimum key is the index-number prefix: the smallest possible
    // key belonging to this index.
    kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
    rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
    const rocksdb::Slice table_key(key_buf, key_len);
    // Constrain the iterator to this index's key range so the scan stops
    // at the index boundary instead of walking neighboring indexes.
    setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE,
                          lower_bound_buf, upper_bound_buf, &lower_bound_slice,
                          &upper_bound_slice);
    DBUG_ASSERT(key_len == Rdb_key_def::INDEX_NUMBER_SIZE);
    opts.iterate_lower_bound = &lower_bound_slice;
    opts.iterate_upper_bound = &upper_bound_slice;
    std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));

    it->Seek(table_key);
    while (it->Valid()) {
      const rocksdb::Slice key = it->key();
      if (!kd.covers_key(key)) {
        break;  // ran past this index's keys
      }

      // SingleDelete is cheaper but only valid for keys that are never
      // overwritten in place.
      rocksdb::Status s;
      if (can_use_single_delete(i)) {
        s = rdb->SingleDelete(wo, cf, key);
      } else {
        s = rdb->Delete(wo, cf, key);
      }

      if (!s.ok()) {
        return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                    m_table_handler);
      }
      bytes_written += key.size();
      it->Next();
    }
  }

  tx->update_bytes_written(bytes_written);

  return HA_EXIT_SUCCESS;
}
11825
11826 /**
11827 @return
11828 HA_EXIT_SUCCESS OK
11829 other HA_ERR error code (cannot be SE-specific)
11830 */
11831 int ha_rocksdb::rename_table(const char *const from, const char *const to) {
11832 DBUG_ENTER_FUNC();
11833
11834 std::string from_str;
11835 std::string to_str;
11836 std::string from_db;
11837 std::string to_db;
11838
11839 int rc = rdb_normalize_tablename(from, &from_str);
11840 if (rc != HA_EXIT_SUCCESS) {
11841 DBUG_RETURN(rc);
11842 }
11843
11844 rc = rdb_split_normalized_tablename(from_str, &from_db);
11845 if (rc != HA_EXIT_SUCCESS) {
11846 DBUG_RETURN(rc);
11847 }
11848
11849 rc = rdb_normalize_tablename(to, &to_str);
11850 if (rc != HA_EXIT_SUCCESS) {
11851 DBUG_RETURN(rc);
11852 }
11853
11854 rc = rdb_split_normalized_tablename(to_str, &to_db);
11855 if (rc != HA_EXIT_SUCCESS) {
11856 DBUG_RETURN(rc);
11857 }
11858
11859 // If the user changed the database part of the name then validate that the
11860 // 'to' database exists.
11861 if (from_db != to_db && !rdb_database_exists(to_db)) {
11862 // If we return a RocksDB specific error code here we get
11863 // "error: 206 - Unknown error 206". InnoDB gets
11864 // "error -1 - Unknown error -1" so let's match them.
11865 DBUG_RETURN(-1);
11866 }
11867
11868 DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";);
11869
11870 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11871 rocksdb::WriteBatch *const batch = wb.get();
11872 dict_manager.lock();
11873
11874 if (ddl_manager.rename(from_str, to_str, batch)) {
11875 rc = HA_ERR_NO_SUCH_TABLE;
11876 } else {
11877 rc = dict_manager.commit(batch);
11878 }
11879 dict_manager.unlock();
11880
11881 DBUG_RETURN(rc);
11882 }
11883
11884 /**
11885 check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
11886 if new and old definition are compatible
11887
11888 @details If there are no other explicit signs like changed number of
11889 fields this function will be called by compare_tables()
11890 (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm
11891 file.
11892
11893 */
11894
11895 bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
11896 uint table_changes) {
11897 DBUG_ENTER_FUNC();
11898
11899 DBUG_ASSERT(info != nullptr);
11900
11901 // this function is needed only for online alter-table
11902 DBUG_RETURN(COMPATIBLE_DATA_NO);
11903 }
11904
11905 /**
11906 @return
11907 HA_EXIT_SUCCESS OK
11908 */
11909 int ha_rocksdb::extra(enum ha_extra_function operation) {
11910 DBUG_ENTER_FUNC();
11911
11912 switch (operation) {
11913 case HA_EXTRA_KEYREAD:
11914 m_keyread_only = true;
11915 break;
11916 case HA_EXTRA_NO_KEYREAD:
11917 m_keyread_only = false;
11918 break;
11919 case HA_EXTRA_FLUSH:
11920 /*
11921 If the table has blobs, then they are part of m_retrieved_record.
11922 This call invalidates them.
11923 */
11924 m_retrieved_record.Reset();
11925 break;
11926 case HA_EXTRA_INSERT_WITH_UPDATE:
11927 // INSERT ON DUPLICATE KEY UPDATE
11928 if (rocksdb_enable_insert_with_update_caching) {
11929 m_insert_with_update = true;
11930 }
11931 break;
11932 case HA_EXTRA_NO_IGNORE_DUP_KEY:
11933 // PAIRED with HA_EXTRA_INSERT_WITH_UPDATE or HA_EXTRA_WRITE_CAN_REPLACE
11934 // that indicates the end of REPLACE / INSERT ON DUPLICATE KEY
11935 m_insert_with_update = false;
11936 break;
11937
11938 default:
11939 break;
11940 }
11941
11942 DBUG_RETURN(HA_EXIT_SUCCESS);
11943 }
11944
11945 /*
11946 Given a starting key and an ending key, estimate the number of rows that
11947 will exist between the two keys.
11948 */
ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
                                     key_range *const max_key) {
  DBUG_ENTER_FUNC();

  // Session-variable overrides (used for testing/debugging): when set,
  // return the fixed estimate without consulting RocksDB at all.
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  if (table->force_index) {
    const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  // Pack the lower bound into m_sk_packed_tuple, or use the index infimum
  // when no lower bound was supplied.
  uint size1 = 0;
  if (min_key) {
    size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                min_key->key, min_key->keypart_map);
    // These flags make the bound exclusive, so advance the packed key to its
    // immediate successor.
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd.get_infimum_key(m_sk_packed_tuple, &size1);
  }

  // Pack the upper bound into m_sk_packed_tuple_old, or use the supremum.
  uint size2 = 0;
  if (max_key) {
    size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple_old, size2);
    }
    // pad the upper key with FFFFs to make sure it is more than the lower
    if (size1 > size2) {
      memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
      size2 = size1;
    }
  } else {
    kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
  const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);

  // slice1 >= slice2 means no row will match
  if (slice1.compare(slice2) >= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  // In a reverse column family the physical key order is inverted, so the
  // range endpoints must be swapped.
  rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
                   kd.m_is_reverse_cf ? slice1 : slice2);

  // Fall back to rough defaults when no statistics have been collected yet.
  uint64_t sz = 0;
  auto disk_size = kd.m_stats.m_actual_disk_size;
  if (disk_size == 0) disk_size = kd.m_stats.m_data_size;
  auto rows = kd.m_stats.m_rows;
  if (rows == 0 || disk_size == 0) {
    rows = 1;
    disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }

  // Getting statistics, including from Memtables
  uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
  rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
  // Scale the index-wide row count by the fraction of the on-disk size the
  // requested range occupies, then add the memtable-resident entries.
  ret = rows * sz / disk_size;
  uint64_t memTableCount;
  rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
  ret += memTableCount;

  /*
    GetApproximateSizes() gives estimates so ret might exceed stats.records.
    MySQL then decides to use full index scan rather than range scan, which
    is not efficient for most cases.
    To prevent this, changing estimated records slightly smaller than
    stats.records.
  */
  if (ret >= stats.records) {
    ret = stats.records * 0.99;
  }

  if (rocksdb_debug_optimizer_n_rows > 0) {
    ret = rocksdb_debug_optimizer_n_rows;
  } else if (ret == 0) {
    // Avoid a zero estimate once we know the range is non-empty above.
    ret = 1;
  }

  DBUG_RETURN(ret);
}
12044
12045 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
12046 DBUG_ENTER_FUNC();
12047
12048 DBUG_ASSERT(create_info != nullptr);
12049
12050 if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
12051 create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
12052 }
12053
12054 DBUG_VOID_RETURN;
12055 }
12056
12057 /**
12058 @brief
12059 Doing manual compaction on OPTIMIZE TABLE in RocksDB.
12060 Compaction itself is executed by background thread in RocksDB, but
12061 CompactRange() waits until compaction completes so this function
12062 may take a long time.
12063 Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE
12064 triggers manual compaction for all indexes of the table.
12065 @details
12066 Compaction range is from the beginning of the index id to
12067 the first row of the next index id. When using reverse order
12068 column family, the first row of the next index id should be
12069 the last row of the previous index id.
12070
12071 @return
12072 HA_ADMIN_OK OK
12073 other HA_ADMIN error code
12074 */
12075 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
12076 DBUG_ENTER_FUNC();
12077
12078 DBUG_ASSERT(thd != nullptr);
12079 DBUG_ASSERT(check_opt != nullptr);
12080
12081 for (uint i = 0; i < table->s->keys; i++) {
12082 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12083 auto range = get_range(i, buf);
12084 const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(),
12085 m_key_descr_arr[i]->get_cf(),
12086 &range.start, &range.limit);
12087 if (!s.ok()) {
12088 DBUG_RETURN(rdb_error_to_mysql(s));
12089 }
12090 }
12091
12092 DBUG_RETURN(HA_EXIT_SUCCESS);
12093 }
12094
/*
  Recompute index statistics for the given set of indexes by reading SST
  file properties and, optionally, sampling memtable data; the result is
  stored via ddl_manager.

  @param to_recalc         map of global index id -> key definition for the
                           indexes whose statistics should be recalculated
  @param include_memtables if true, also scan memtable-resident data (not
                           yet flushed to SST files) to refine cardinality

  @return HA_EXIT_SUCCESS or an HA_ERR error code
*/
static int calculate_stats(
    const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
        &to_recalc,
    bool include_memtables) {
  DBUG_ENTER_FUNC();

  // find per column family key ranges which need to be queried
  std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
      ranges;
  std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
  // One [start, limit) key pair per index; every range points into this
  // single shared buffer, so it must outlive all use of 'ranges'.
  std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);

  uchar *bufp = buf.data();
  for (const auto &it : to_recalc) {
    const GL_INDEX_ID index_id = it.first;
    auto &kd = it.second;
    ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp));
    bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE;

    // Start each index from a fresh stats object sized to its key parts.
    stats[index_id] = Rdb_index_stats(index_id);
    DBUG_ASSERT(kd->get_key_parts() > 0);
    stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts());
  }

  // get RocksDB table properties for these ranges
  rocksdb::TablePropertiesCollection props;
  for (const auto &it : ranges) {
    const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
    const auto status = rdb->GetPropertiesOfTablesInRange(
        it.first, &it.second[0], it.second.size(), &props);
    DBUG_ASSERT(props.size() >= old_size);
    if (!status.ok()) {
      DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql(
          status, "Could not access RocksDB properties"));
    }
  }

  // Merge per-SST statistics into the per-index accumulators.
  int num_sst = 0;
  for (const auto &it : props) {
    std::vector<Rdb_index_stats> sst_stats;
    Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
    /*
      sst_stats is a list of index statistics for indexes that have entries
      in the current SST file.
    */
    for (const auto &it1 : sst_stats) {
      /*
        Only update statistics for indexes that belong to this SQL table.

        The reason is: We are walking through all SST files that have
        entries from this table (and so can compute good statistics). For
        other SQL tables, it can be that we're only seeing a small fraction
        of table's entries (and so we can't update statistics based on that).
      */
      if (stats.find(it1.m_gl_index_id) == stats.end()) {
        continue;
      }

      auto it_index = to_recalc.find(it1.m_gl_index_id);
      DBUG_ASSERT(it_index != to_recalc.end());
      if (it_index == to_recalc.end()) {
        continue;
      }
      stats[it1.m_gl_index_id].merge(
          it1, true, it_index->second->max_storage_fmt_length());
    }
    num_sst++;
  }

  if (include_memtables) {
    // calculate memtable cardinality
    Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
    // Restrict reads to memtables only; SST data was already counted above.
    auto read_opts = rocksdb::ReadOptions();
    read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
    for (const auto &it_kd : to_recalc) {
      const std::shared_ptr<const Rdb_key_def> &kd = it_kd.second;
      Rdb_index_stats &stat = stats[kd->get_gl_index_id()];

      uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = myrocks::get_range(*kd, r_buf);
      uint64_t memtableCount;
      uint64_t memtableSize;
      rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount,
                                       &memtableSize);
      if (memtableCount < (uint64_t)stat.m_rows / 10) {
        // skip tables that already have enough stats from SST files to reduce
        // overhead and avoid degradation of big tables stats by sampling from
        // relatively tiny (less than 10% of full data set) memtable dataset
        continue;
      }

      std::unique_ptr<rocksdb::Iterator> it =
          std::unique_ptr<rocksdb::Iterator>(
              rdb->NewIterator(read_opts, kd->get_cf()));

      // Seek to the first key of this index (keys share the index-number
      // prefix written into r_buf by get_range()).
      rocksdb::Slice first_index_key((const char *)r_buf,
                                     Rdb_key_def::INDEX_NUMBER_SIZE);

      cardinality_collector.Reset();
      for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
        const rocksdb::Slice key = it->key();
        if (!kd->covers_key(key)) {
          break;  // end of this index
        }
        stat.m_rows++;

        cardinality_collector.ProcessKey(key, kd.get(), &stat);
      }
      cardinality_collector.AdjustStats(&stat);
    }
  }

  // set and persist new stats
  ddl_manager.set_stats(stats);
  ddl_manager.persist_stats(true);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12213
12214 int ha_rocksdb::calculate_stats_for_table() {
12215 DBUG_ENTER_FUNC();
12216
12217 std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12218 ids_to_check;
12219 for (uint i = 0; i < table->s->keys; i++) {
12220 ids_to_check.insert(std::make_pair(m_key_descr_arr[i]->get_gl_index_id(),
12221 m_key_descr_arr[i]));
12222 }
12223
12224 DBUG_RETURN(calculate_stats(ids_to_check, true));
12225 }
12226
12227 /*
12228 @return
12229 HA_ADMIN_OK OK
12230 other HA_ADMIN error code
12231 */
12232 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
12233 DBUG_ENTER_FUNC();
12234
12235 if (table) {
12236 if (calculate_stats_for_table() != HA_EXIT_SUCCESS) {
12237 DBUG_RETURN(HA_ADMIN_FAILED);
12238 }
12239 }
12240
12241 // A call to ::info is needed to repopulate some SQL level structs. This is
12242 // necessary for online analyze because we cannot rely on another ::open
12243 // call to call info for us.
12244 if (info(HA_STATUS_CONST | HA_STATUS_VARIABLE) != HA_EXIT_SUCCESS) {
12245 DBUG_RETURN(HA_ADMIN_FAILED);
12246 }
12247
12248 DBUG_RETURN(HA_ADMIN_OK);
12249 }
12250
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars2");

  // An offset larger than the increment cannot produce a valid sequence;
  // fall back to offset 1.
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  // The auto-increment column is the first key part of the designated index.
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    DBUG_ASSERT(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use a CAS operation in a loop to make sure we atomically get the next
    // auto increment value while ensuring that we don't wrap around to a
    // negative number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    if (last_val > max_val) {
      new_val = std::numeric_limits<ulonglong>::max();
    } else {
      // Loop until we can correctly update the atomic value
      do {
        DBUG_ASSERT(last_val > 0);
        // Calculate the next value in the auto increment series: offset
        // + N * increment where N is 0, 1, 2, ...
        //
        // For further information please visit:
        // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
        //
        // The following is confusing so here is an explanation:
        // To get the next number in the sequence above you subtract out the
        // offset, calculate the next sequence (N * increment) and then add the
        // offset back in.
        //
        // The additions are rearranged to avoid overflow. The following is
        // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
        // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
        //
        // (a+b)/c
        // = (a - a%c + a%c + b - b%c + b%c) / c
        // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
        // = a/c + b/c + (a%c + b%c) / c
        //
        // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
        // following statement.
        ulonglong n =
            (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

        // Check if n * inc + off will overflow. This can only happen if we have
        // an UNSIGNED BIGINT field.
        if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
          DBUG_ASSERT(max_val == std::numeric_limits<ulonglong>::max());
          // The 'last_val' value is already equal to or larger than the largest
          // value in the sequence. Continuing would wrap around (technically
          // the behavior would be undefined). What should we do?
          // We could:
          // 1) set the new value to the last possible number in our sequence
          //    as described above. The problem with this is that this
          //    number could be smaller than a value in an existing row.
          // 2) set the new value to the largest possible number. This number
          //    may not be in our sequence, but it is guaranteed to be equal
          //    to or larger than any other value already inserted.
          //
          // For now I'm going to take option 2.
          //
          // Returning ULLONG_MAX from get_auto_increment will cause the SQL
          // layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
          // the SE API for get_auto_increment, inserts will fail with
          // ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
          // inserts will fail with ER_DUP_ENTRY for other types (or no failure
          // if the column is in a non-unique SK).
          new_val = std::numeric_limits<ulonglong>::max();
          auto_incr = new_val;  // Store the largest value into auto_incr
          break;
        }

        new_val = n * inc + off;

        // Attempt to store the new value (plus 1 since m_auto_incr_val contains
        // the next available value) into the atomic value. If the current
        // value no longer matches what we have in 'last_val' this will fail and
        // we will repeat the loop (`last_val` will automatically get updated
        // with the current value).
        //
        // See above explanation for inc == 1 for why we use std::min.
      } while (!auto_incr.compare_exchange_weak(
          last_val, std::min(new_val + 1, max_val)));
    }
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
12388
12389 #ifndef DBUG_OFF
12390
12391 /* Debugger help function */
12392 static char dbug_item_print_buf[512];
12393
12394 const char *dbug_print_item(Item *const item) {
12395 char *const buf = dbug_item_print_buf;
12396 String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
12397 str.length(0);
12398 if (!item) return "(Item*)nullptr";
12399 item->print(&str, QT_ORDINARY);
12400 if (str.c_ptr() == buf) {
12401 return buf;
12402 } else {
12403 return "Couldn't fit into buffer";
12404 }
12405 }
12406
12407 #endif /*DBUG_OFF*/
12408
12409 /**
12410 SQL layer calls this function to push an index condition.
12411
12412 @details
12413 The condition is for index keyno (only one condition can be pushed at a
12414 time).
12415 The caller guarantees that condition refers only to index fields; besides
12416 that, fields must have
12417
12418 $field->part_of_key.set_bit(keyno)
12419
12420 which means that
12421
12422 (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
12423
12424 which means that field value can be restored from the index tuple.
12425
12426 @return
12427 Part of condition we couldn't check (always nullptr).
12428 */
12429
12430 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
12431 DBUG_ENTER_FUNC();
12432
12433 DBUG_ASSERT(keyno != MAX_KEY);
12434 DBUG_ASSERT(idx_cond != nullptr);
12435
12436 pushed_idx_cond = idx_cond;
12437 pushed_idx_cond_keyno = keyno;
12438 in_range_check_pushed_down = TRUE;
12439
12440 /* We will check the whole condition */
12441 DBUG_RETURN(nullptr);
12442 }
12443
12444 /*
12445 Checks if inplace alter is supported for a given operation.
12446 */
12447
12448 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
12449 TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
12450 DBUG_ENTER_FUNC();
12451
12452 DBUG_ASSERT(ha_alter_info != nullptr);
12453
12454 if (ha_alter_info->handler_flags &
12455 ~(ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
12456 ALTER_DROP_UNIQUE_INDEX |
12457 ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
12458 ALTER_PARTITIONED |
12459 ALTER_ADD_UNIQUE_INDEX |
12460 ALTER_CHANGE_CREATE_OPTION)) {
12461 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12462 }
12463
12464 /* We don't support unique keys on table w/ no primary keys */
12465 if ((ha_alter_info->handler_flags &
12466 ALTER_ADD_UNIQUE_INDEX) &&
12467 has_hidden_pk(altered_table)) {
12468 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12469 }
12470
12471 /* We only support changing auto_increment for table options. */
12472 if ((ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) &&
12473 !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) {
12474 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12475 }
12476
12477 /* FIXME: MDEV-16099 Use alter algorithm=nocopy or algorithm=instant
12478 for non-InnoDB engine */
12479 DBUG_RETURN(my_core::HA_ALTER_INPLACE_COPY_LOCK);
12480 }
12481
12482 /**
12483 Allows the storage engine to update internal structures with concurrent
12484 writes blocked. If check_if_supported_inplace_alter() returns
12485 HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_COPY_LOCK.
12486 this function is called with exclusive lock otherwise the same level of
12487 locking as for inplace_alter_table() will be used.
12488
12489 @note Storage engines are responsible for reporting any errors by
12490 calling my_error()/print_error()
12491
12492 @note If this function reports error, commit_inplace_alter_table()
12493 will be called with commit= false.
12494
12495 @note For partitioning, failing to prepare one partition, means that
12496 commit_inplace_alter_table() will be called to roll back changes for
12497 all partitions. This means that commit_inplace_alter_table() might be
12498 called without prepare_inplace_alter_table() having been called first
12499 for a given partition.
12500
12501 @param altered_table TABLE object for new version of table.
12502 @param ha_alter_info Structure describing changes to be done
12503 by ALTER TABLE and holding data used
12504 during in-place alter.
12505
12506 @retval true Error
12507 @retval false Success
12508 */
bool ha_rocksdb::prepare_inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  Rdb_tbl_def *new_tdef = nullptr;
  std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr;
  std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
  uint old_n_keys = m_tbl_def->m_key_count;
  uint new_n_keys = altered_table->s->keys;
  std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
  std::unordered_set<GL_INDEX_ID> dropped_index_ids;
  uint n_dropped_keys = 0;
  uint n_added_keys = 0;
  ulonglong max_auto_incr = 0;

  // A new table definition is only needed when the ALTER adds or drops
  // index definitions.
  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    if (has_hidden_pk(altered_table)) {
      // The hidden PK is not counted in altered_table->s->keys.
      new_n_keys += 1;
    }

    const TABLE *const old_table = table;
    old_key_descr = m_tbl_def->m_key_descr_arr;
    new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];

    // Build the candidate table definition carrying the current counters.
    new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
    new_tdef->m_key_descr_arr = new_key_descr;
    new_tdef->m_key_count = new_n_keys;
    new_tdef->m_auto_incr_val =
        m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
    new_tdef->m_hidden_pk_val =
        m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);

    if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
      /* Delete the new key descriptors */
      delete[] new_key_descr;

      /*
        Explicitly mark as nullptr so we don't accidentally remove entries
        from data dictionary on cleanup (or cause double delete[]).
      */
      new_tdef->m_key_descr_arr = nullptr;
      delete new_tdef;

      my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    uint i;
    uint j;

    /* Determine which(if any) key definition(s) need to be dropped */
    for (i = 0; i < ha_alter_info->index_drop_count; i++) {
      const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i];
      for (j = 0; j < old_n_keys; j++) {
        const KEY *const old_key =
            &old_table->key_info[old_key_descr[j]->get_keyno()];

        if (!compare_keys(old_key, dropped_key)) {
          dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
          break;
        }
      }
    }

    /* Determine which(if any) key definitions(s) need to be added */
    int identical_indexes_found = 0;
    for (i = 0; i < ha_alter_info->index_add_count; i++) {
      const KEY *const added_key =
          &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
      for (j = 0; j < new_n_keys; j++) {
        const KEY *const new_key =
            &altered_table->key_info[new_key_descr[j]->get_keyno()];
        if (!compare_keys(new_key, added_key)) {
          /*
            Check for cases where an 'identical' index is being dropped and
            re-added in a single ALTER statement. Turn this into a no-op as the
            index has not changed.

            E.G. Unique index -> non-unique index requires no change

            Note that cases where the index name remains the same but the
            key-parts are changed is already handled in create_inplace_key_defs.
            In these cases the index needs to be rebuilt.
          */
          if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) {
            dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id());
            identical_indexes_found++;
          } else {
            added_indexes.insert(new_key_descr[j]);
          }

          break;
        }
      }
    }

    // Drop/re-add pairs counted above cancel out of both totals.
    n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found;
    n_added_keys = ha_alter_info->index_add_count - identical_indexes_found;
    DBUG_ASSERT(dropped_index_ids.size() == n_dropped_keys);
    DBUG_ASSERT(added_indexes.size() == n_added_keys);
    DBUG_ASSERT(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys));
  }
  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    if (!new_tdef) {
      // No index changes above: operate on the existing table definition.
      new_tdef = m_tbl_def;
    }
    if (table->found_next_number_field) {
      max_auto_incr = load_auto_incr_value_from_index();
    }
  }

  // Stash everything in the alter context for the later inplace/commit
  // phases. NOTE(review): ownership of new_tdef/new_key_descr presumably
  // transfers to the ctx / commit path — confirm against
  // Rdb_inplace_alter_ctx's destructor.
  ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
      new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
      added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys,
      max_auto_incr);
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12634
12635 /**
12636 Alter the table structure in-place with operations specified using
12637 HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
12638 during this operation depends on the return value from
12639 check_if_supported_inplace_alter().
12640
12641 @note Storage engines are responsible for reporting any errors by
12642 calling my_error()/print_error()
12643
12644 @note If this function reports error, commit_inplace_alter_table()
12645 will be called with commit= false.
12646
12647 @param altered_table TABLE object for new version of table.
12648 @param ha_alter_info Structure describing changes to be done
12649 by ALTER TABLE and holding data used
12650 during in-place alter.
12651
12652 @retval true Error
12653 @retval false Success
12654 */
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);
  DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);

  // Context created earlier by prepare_inplace_alter_table().
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  // Index additions are the only operations that require real work here;
  // everything else is handled at commit time.
  if (ha_alter_info->handler_flags &
      (ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    DBUG_ASSERT(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 ALTER_ADD_UNIQUE_INDEX))) {
      my_error(ER_OUT_OF_RESOURCES, MYF(0));
      // 'err' is an int returned from a bool function; any non-zero value
      // converts to true (= failure).
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  // Test hook: simulate a failure after the work above so rollback paths
  // can be exercised.
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12704
12705 /**
12706 Scan the Primary Key index entries and populate the new secondary keys.
12707 */
12708 int ha_rocksdb::inplace_populate_sk(
12709 TABLE *const new_table_arg,
12710 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
12711 DBUG_ENTER_FUNC();
12712 int res = HA_EXIT_SUCCESS;
12713 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
12714 rocksdb::WriteBatch *const batch = wb.get();
12715
12716 /* Update the data dictionary */
12717 std::unordered_set<GL_INDEX_ID> create_index_ids;
12718 for (const auto &index : indexes) {
12719 create_index_ids.insert(index->get_gl_index_id());
12720 }
12721 dict_manager.add_create_index(create_index_ids, batch);
12722 res = dict_manager.commit(batch);
12723 if (res != HA_EXIT_SUCCESS) {
12724 return res;
12725 }
12726
12727 /*
12728 Add uncommitted key definitons to ddl_manager. We need to do this
12729 so that the property collector can find this keydef when it needs to
12730 update stats. The property collector looks for the keydef in the
12731 data dictionary, but it won't be there yet since this key definition
12732 is still in the creation process.
12733 */
12734 ddl_manager.add_uncommitted_keydefs(indexes);
12735
12736 const bool hidden_pk_exists = has_hidden_pk(table);
12737
12738 Rdb_transaction *tx = get_or_create_tx(table->in_use);
12739
12740 /*
12741 There is one specific scenario where m_sst_info may not be nullptr. This
12742 happens if the handler we're using happens to be the handler where the PK
12743 bulk load was done on. The sequence of events that lead to this is as
12744 follows (T1 is PK bulk load, T2 is SK alter table):
12745
12746 T1: Execute last INSERT statement
12747 T1: Return TABLE and handler object back to Table_cache_manager
12748 T1: Close connection
12749 T2: Execute ALTER statement
12750 T2: Take same TABLE/handler from Table_cache_manager
12751 T2: Call closefrm which will call finalize_bulk_load on every other open
12752 table/handler *except* the one it's on.
12753 T2: Acquire stale snapshot of PK
12754 T1: Call finalize_bulk_load
12755
12756 This is rare because usually, closefrm will call the destructor (and thus
12757 finalize_bulk_load) on the handler where PK bulk load is done. However, if
12758 the thread ids of the bulk load thread and the alter thread differ by a
12759 multiple of table_cache_instances (8 by default), then they hash to the
12760 same bucket in Table_cache_manager and the alter thread will not not call
12761 the destructor on the handler it is holding. Thus, its m_sst_info will not
12762 be nullptr.
12763
12764 At this point, it is safe to refresh the snapshot because we know all other
12765 open handlers have been closed at this point, and the one we're on is the
12766 only one left.
12767 */
12768 if (m_sst_info) {
12769 if ((res = finalize_bulk_load())) {
12770 DBUG_RETURN(res);
12771 }
12772 tx->commit();
12773 }
12774
12775 const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
12776 const ulonglong rdb_merge_combine_read_size =
12777 THDVAR(ha_thd(), merge_combine_read_size);
12778 const ulonglong rdb_merge_tmp_file_removal_delay =
12779 THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
12780
12781 for (const auto &index : indexes) {
12782 bool is_unique_index =
12783 new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
12784
12785 Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size,
12786 rdb_merge_combine_read_size,
12787 rdb_merge_tmp_file_removal_delay,
12788 index->get_cf());
12789
12790 if ((res = rdb_merge.init())) {
12791 DBUG_RETURN(res);
12792 }
12793
12794 /*
12795 Note: We pass in the currently existing table + tbl_def object here,
12796 as the pk index position may have changed in the case of hidden primary
12797 keys.
12798 */
12799 const uint pk = pk_index(table, m_tbl_def);
12800 ha_index_init(pk, true);
12801
12802 /* Scan each record in the primary key in order */
12803 for (res = index_first(table->record[0]); res == 0;
12804 res = index_next(table->record[0])) {
12805 longlong hidden_pk_id = 0;
12806 if (hidden_pk_exists &&
12807 (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) {
12808 // NO_LINT_DEBUG
12809 sql_print_error("Error retrieving hidden pk id.");
12810 ha_index_end();
12811 DBUG_RETURN(res);
12812 }
12813
12814 /* Create new secondary index entry */
12815 const int new_packed_size = index->pack_record(
12816 new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
12817 &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
12818 nullptr, m_ttl_bytes);
12819
12820 const rocksdb::Slice key = rocksdb::Slice(
12821 reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
12822 const rocksdb::Slice val =
12823 rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
12824 m_sk_tails.get_current_pos());
12825
12826 /*
12827 Add record to offset tree in preparation for writing out to
12828 disk in sorted chunks.
12829 */
12830 if ((res = rdb_merge.add(key, val))) {
12831 ha_index_end();
12832 DBUG_RETURN(res);
12833 }
12834 }
12835
12836 if (res != HA_ERR_END_OF_FILE) {
12837 // NO_LINT_DEBUG
12838 sql_print_error("Error retrieving index entry from primary key.");
12839 ha_index_end();
12840 DBUG_RETURN(res);
12841 }
12842
12843 ha_index_end();
12844
12845 /*
12846 Perform an n-way merge of n sorted buffers on disk, then writes all
12847 results to RocksDB via SSTFileWriter API.
12848 */
12849 rocksdb::Slice merge_key;
12850 rocksdb::Slice merge_val;
12851
12852 struct unique_sk_buf_info sk_info;
12853 sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
12854 sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
12855
12856 while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
12857 /* Perform uniqueness check if needed */
12858 if (is_unique_index) {
12859 if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
12860 /*
12861 Duplicate entry found when trying to create unique secondary key.
12862 We need to unpack the record into new_table_arg->record[0] as it
12863 is used inside print_keydup_error so that the error message shows
12864 the duplicate record.
12865 */
12866 if (index->unpack_record(
12867 new_table_arg, new_table_arg->record[0], &merge_key,
12868 &merge_val, m_converter->get_verify_row_debug_checksums())) {
12869 /* Should never reach here */
12870 DBUG_ASSERT(0);
12871 }
12872
12873 print_keydup_error(new_table_arg,
12874 &new_table_arg->key_info[index->get_keyno()],
12875 MYF(0));
12876 DBUG_RETURN(ER_DUP_ENTRY);
12877 }
12878 }
12879
12880 /*
12881 Insert key and slice to SST via SSTFileWriter API.
12882 */
12883 if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) {
12884 break;
12885 }
12886 }
12887
12888 /*
12889 Here, res == -1 means that we are finished, while > 0 means an error
12890 occurred.
12891 */
12892 if (res > 0) {
12893 // NO_LINT_DEBUG
12894 sql_print_error("Error while bulk loading keys in external merge sort.");
12895 DBUG_RETURN(res);
12896 }
12897
12898 bool is_critical_error;
12899 res = tx->finish_bulk_load(&is_critical_error);
12900 if (res && is_critical_error) {
12901 // NO_LINT_DEBUG
12902 sql_print_error("Error finishing bulk load.");
12903 DBUG_RETURN(res);
12904 }
12905 }
12906
12907 /*
12908 Explicitly tell jemalloc to clean up any unused dirty pages at this point.
12909 See https://reviews.facebook.net/D63723 for more details.
12910 */
12911 purge_all_jemalloc_arenas();
12912
12913 DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
12914 DBUG_RETURN(res);
12915 }
12916
12917 /**
12918 Commit or rollback the changes made during prepare_inplace_alter_table()
12919 and inplace_alter_table() inside the storage engine.
12920 Note that in case of rollback the allowed level of concurrency during
12921 this operation will be the same as for inplace_alter_table() and thus
12922 might be higher than during prepare_inplace_alter_table(). (For example,
12923 concurrent writes were blocked during prepare, but might not be during
12924 rollback).
12925
12926 @note Storage engines are responsible for reporting any errors by
12927 calling my_error()/print_error()
12928
12929 @note If this function with commit= true reports error, it will be called
12930 again with commit= false.
12931
12932 @note In case of partitioning, this function might be called for rollback
12933 without prepare_inplace_alter_table() having been called first.
12934 Also partitioned tables sets ha_alter_info->group_commit_ctx to a NULL
12935 terminated array of the partitions handlers and if all of them are
12936 committed as one, then group_commit_ctx should be set to NULL to indicate
12937 to the partitioning handler that all partitions handlers are committed.
12938 @see prepare_inplace_alter_table().
12939
12940 @param altered_table TABLE object for new version of table.
12941 @param ha_alter_info Structure describing changes to be done
12942 by ALTER TABLE and holding data used
12943 during in-place alter.
12944 @param commit True => Commit, False => Rollback.
12945
12946 @retval true Error
12947 @retval false Success
12948 */
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  // Per-handler alter context built during prepare_inplace_alter_table();
  // may be nullptr on rollback if prepare was never called (see @note above).
  Rdb_inplace_alter_ctx *const ctx0 =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table");

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition. A successful commit call only executes once
    for all partitions.
  */
  if (!commit) {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0) {
      DBUG_RETURN(HA_EXIT_SUCCESS);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr) {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
        ctx0->m_new_key_descr[i] = nullptr;
      }

      // Null out m_key_descr_arr before deleting m_new_tdef so the Rdb_tbl_def
      // destructor does not free the (shared) descriptor array again.
      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    /* Remove uncommitted key definitions from ddl_manager */
    ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);

    /* Rollback any partially created indexes */
    dict_manager.rollback_ongoing_index_creation();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx **ctx_array;
  inplace_alter_handler_ctx *ctx_single[2];

  if (ha_alter_info->group_commit_ctx) {
    DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
    // Partitioned table: commit every partition's context in one pass.
    ctx_array = ha_alter_info->group_commit_ctx;
  } else {
    // Non-partitioned table: wrap the single context in a null-terminated
    // array so the loops below can treat both cases uniformly.
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  // Clearing group_commit_ctx signals the partitioning handler that all
  // partitions were committed together (see @note in the header comment).
  ha_alter_info->group_commit_ctx = nullptr;

  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    // Switch this handler over to the new table definition/key descriptors.
    m_tbl_def = ctx0->m_new_tdef;
    m_key_descr_arr = m_tbl_def->m_key_descr_arr;
    m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    // Hold the dict_manager lock across the whole batch so the dictionary
    // updates for all partitions commit atomically.
    dict_manager.lock();
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (const auto &index : ctx->m_added_indexes) {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }

      /*
        Remove uncommitted key definitions from ddl_manager, as they are now
        committed into the data dictionary.
      */
      ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
    }

    if (dict_manager.commit(batch)) {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(
        create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

    // Wake the background thread that physically drops the old index data.
    rdb_drop_idx_thread.signal();
  }

  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;

    // Persist the largest auto_increment seen across all partitions so the
    // new value survives a restart.
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);
      auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
      dict_manager.put_auto_incr_val(
          batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
          true /* overwrite */);
      ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
    }

    if (dict_manager.commit(batch)) {
      DBUG_ASSERT(0);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
13106
/* Name of the generated SHOW-status callback for a given counter. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW_FUNC callback that refreshes the cached counter `name`
  from the RocksDB statistics ticker `key` and exposes it as a LONGLONG
  status variable.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = reinterpret_cast<char *>(&rocksdb_status_counters.name);  \
    return HA_EXIT_SUCCESS;                                                \
  }

/* Status-variable table entry backed by a DEF_SHOW_FUNC callback. */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* Status-variable table entry backed directly by a variable's address. */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* Like DEF_STATUS_VAR_PTR, but without the "rocksdb_" name prefix. */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
13126
/*
  Cached copies of RocksDB statistics tickers, one member per ticker.
  Each member is refreshed on demand by the corresponding DEF_SHOW_FUNC
  callback and exposed through SHOW STATUS as "rocksdb_<member>".
*/
struct rocksdb_status_counters_t {
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  uint64_t bloom_filter_useful;
  uint64_t bloom_filter_full_positive;
  uint64_t bloom_filter_full_true_positive;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};
13201
/* Single file-local instance backing all DEF_SHOW_FUNC callbacks. */
static rocksdb_status_counters_t rocksdb_status_counters;
13203
/*
  Instantiate one SHOW-status callback per RocksDB ticker. The first
  argument names both the rocksdb_status_counters_t member and the
  "rocksdb_<name>" status variable; the second is the rocksdb::Tickers
  enumerator that supplies the value.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE)
DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
13276
/*
  Snapshot the global MyRocks row/query counters into export_stats so the
  SHOW STATUS table entries (which point at export_stats members) report
  current values.
*/
static void myrocks_update_status() {
  export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
  export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
  export_stats.rows_read = global_stats.rows[ROWS_READ];
  export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
  export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
  export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
  export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];

  export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
  export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
  export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
  export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];

  export_stats.queries_point = global_stats.queries[QUERIES_POINT];
  export_stats.queries_range = global_stats.queries[QUERIES_RANGE];

  export_stats.covered_secondary_key_lookups =
      global_stats.covered_secondary_key_lookups;
}
13297
13298 static void myrocks_update_memory_status() {
13299 std::vector<rocksdb::DB *> dbs;
13300 std::unordered_set<const rocksdb::Cache *> cache_set;
13301 dbs.push_back(rdb);
13302 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
13303 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
13304 &temp_usage_by_type);
13305 memory_stats.memtable_total =
13306 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal];
13307 memory_stats.memtable_unflushed =
13308 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed];
13309 }
13310
/*
  Status variables shown under the "rocksdb_" prefix via show_myrocks_vars.
  Each entry points directly at an export_stats/memory_stats member that is
  refreshed just before display.
*/
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind", &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired", &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered", &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted",
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted",
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated",
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total", &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed", &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point", &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups",
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
13346
/*
  SHOW_FUNC entry point for the "rocksdb" status prefix: refreshes the
  cached row/query and memory statistics, then hands back the
  myrocks_status_variables array for display.
*/
static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
  myrocks_update_status();
  myrocks_update_memory_status();
  var->type = SHOW_ARRAY;
  var->value = reinterpret_cast<char *>(&myrocks_status_variables);
}
13353
13354 static ulonglong io_stall_prop_value(
13355 const std::map<std::string, std::string> &props, const std::string &key) {
13356 std::map<std::string, std::string>::const_iterator iter =
13357 props.find("io_stalls." + key);
13358 if (iter != props.end()) {
13359 return std::stoull(iter->second);
13360 } else {
13361 DBUG_PRINT("warning",
13362 ("RocksDB GetMapPropery hasn't returned key=%s", key.c_str()));
13363 DBUG_ASSERT(0);
13364 return 0;
13365 }
13366 }
13367
13368 static void update_rocksdb_stall_status() {
13369 st_io_stall_stats local_io_stall_stats;
13370 for (const auto &cf_name : cf_manager.get_cf_names()) {
13371 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
13372 if (cfh == nullptr) {
13373 continue;
13374 }
13375
13376 std::map<std::string, std::string> props;
13377 if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats", &props)) {
13378 continue;
13379 }
13380
13381 local_io_stall_stats.level0_slowdown +=
13382 io_stall_prop_value(props, "level0_slowdown");
13383 local_io_stall_stats.level0_slowdown_with_compaction +=
13384 io_stall_prop_value(props, "level0_slowdown_with_compaction");
13385 local_io_stall_stats.level0_numfiles +=
13386 io_stall_prop_value(props, "level0_numfiles");
13387 local_io_stall_stats.level0_numfiles_with_compaction +=
13388 io_stall_prop_value(props, "level0_numfiles_with_compaction");
13389 local_io_stall_stats.stop_for_pending_compaction_bytes +=
13390 io_stall_prop_value(props, "stop_for_pending_compaction_bytes");
13391 local_io_stall_stats.slowdown_for_pending_compaction_bytes +=
13392 io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes");
13393 local_io_stall_stats.memtable_compaction +=
13394 io_stall_prop_value(props, "memtable_compaction");
13395 local_io_stall_stats.memtable_slowdown +=
13396 io_stall_prop_value(props, "memtable_slowdown");
13397 local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop");
13398 local_io_stall_stats.total_slowdown +=
13399 io_stall_prop_value(props, "total_slowdown");
13400 }
13401 io_stall_stats = local_io_stall_stats;
13402 }
13403
/*
  Status variables shown under the "rocksdb_stall" prefix; each entry
  points at an io_stall_stats member refreshed by
  update_rocksdb_stall_status() just before display.
*/
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops",
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns",
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops",
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns",
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
13431
/*
  SHOW_FUNC entry point for the "rocksdb_stall" status prefix: refreshes
  the aggregated stall counters, then hands back the
  rocksdb_stall_status_variables array for display.
*/
static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) {
  update_rocksdb_stall_status();
  var->type = SHOW_ARRAY;
  var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables);
}
13437
/*
  Master status-variable table registered with the server. Ticker-backed
  entries go through the DEF_SHOW_FUNC callbacks; *_PTR entries point at
  plugin counters directly; the trailing SHOW_FUNC rows expand into the
  "rocksdb" and "rocksdb_stall" sub-arrays.
*/
static SHOW_VAR rocksdb_status_vars[] = {
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_add_failures),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_index_add),
    DEF_STATUS_VAR(block_cache_index_bytes_insert),
    DEF_STATUS_VAR(block_cache_index_bytes_evict),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_filter_add),
    DEF_STATUS_VAR(block_cache_filter_bytes_insert),
    DEF_STATUS_VAR(block_cache_filter_bytes_evict),
    DEF_STATUS_VAR(block_cache_bytes_read),
    DEF_STATUS_VAR(block_cache_bytes_write),
    DEF_STATUS_VAR(block_cache_data_bytes_insert),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(block_cache_data_add),
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(bloom_filter_full_positive),
    DEF_STATUS_VAR(bloom_filter_full_true_positive),
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(get_hit_l0),
    DEF_STATUS_VAR(get_hit_l1),
    DEF_STATUS_VAR(get_hit_l2_and_up),
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    DEF_STATUS_VAR(number_db_seek),
    DEF_STATUS_VAR(number_db_seek_found),
    DEF_STATUS_VAR(number_db_next),
    DEF_STATUS_VAR(number_db_next_found),
    DEF_STATUS_VAR(number_db_prev),
    DEF_STATUS_VAR(number_db_prev_found),
    DEF_STATUS_VAR(iter_bytes_read),
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(stall_micros),
    DEF_STATUS_VAR(num_iterators),
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(getupdatessince_calls),
    DEF_STATUS_VAR(block_cachecompressed_miss),
    DEF_STATUS_VAR(block_cachecompressed_hit),
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    DEF_STATUS_VAR_PTR("row_lock_deadlocks", &rocksdb_row_lock_deadlocks,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("row_lock_wait_timeouts",
                       &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_processed",
                       &rocksdb_manual_compactions_processed, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("manual_compactions_running",
                       &rocksdb_manual_compactions_running, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
#ifndef DBUG_OFF
    DEF_STATUS_VAR_PTR("num_get_for_update_calls",
                       &rocksdb_num_get_for_update_calls, SHOW_LONGLONG),
#endif
    // the variables generated by SHOW_FUNC are sorted only by prefix (first
    // arg in the tuple below), so make sure it is unique to make sorting
    // deterministic as quick sort is not stable
    {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
    {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
     SHOW_FUNC},
    {NullS, NullS, SHOW_LONG}};
13544
13545 /*
13546 Background thread's main logic
13547 */
13548
13549 void Rdb_background_thread::run() {
13550 // How many seconds to wait till flushing the WAL next time.
13551 const int WAKE_UP_INTERVAL = 1;
13552
13553 timespec ts_next_sync;
13554 set_timespec(ts_next_sync, WAKE_UP_INTERVAL);
13555
13556 for (;;) {
13557 // Wait until the next timeout or until we receive a signal to stop the
13558 // thread. Request to stop the thread should only be triggered when the
13559 // storage engine is being unloaded.
13560 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
13561 const auto ret MY_ATTRIBUTE((__unused__)) =
13562 mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);
13563
13564 // Check that we receive only the expected error codes.
13565 DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
13566 const bool local_stop = m_stop;
13567 const bool local_save_stats = m_save_stats;
13568 reset();
13569 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
13570
13571 if (local_stop) {
13572 // If we're here then that's because condition variable was signaled by
13573 // another thread and we're shutting down. Break out the loop to make
13574 // sure that shutdown thread can proceed.
13575 break;
13576 }
13577
13578 // This path should be taken only when the timer expired.
13579 DBUG_ASSERT(ret == ETIMEDOUT);
13580
13581 if (local_save_stats) {
13582 ddl_manager.persist_stats();
13583 }
13584
13585 // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
13586 // pthread_cond_timedwait()) to wait on.
13587 set_timespec(ts_next_sync, WAKE_UP_INTERVAL);
13588
13589 // Flush the WAL. Sync it for both background and never modes to copy
13590 // InnoDB's behavior. For mode never, the wal file isn't even written,
13591 // whereas background writes to the wal file, but issues the syncs in a
13592 // background thread.
13593 if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
13594 !rocksdb_db_options->allow_mmap_writes) {
13595 const rocksdb::Status s = rdb->FlushWAL(true);
13596 if (!s.ok()) {
13597 rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
13598 }
13599 }
13600 // Recalculate statistics for indexes.
13601 if (rocksdb_stats_recalc_rate) {
13602 std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
13603 to_recalc;
13604
13605 if (rdb_indexes_to_recalc.empty()) {
13606 struct Rdb_index_collector : public Rdb_tables_scanner {
13607 int add_table(Rdb_tbl_def *tdef) override {
13608 for (uint i = 0; i < tdef->m_key_count; i++) {
13609 rdb_indexes_to_recalc.push_back(
13610 tdef->m_key_descr_arr[i]->get_gl_index_id());
13611 }
13612 return HA_EXIT_SUCCESS;
13613 }
13614 } collector;
13615 ddl_manager.scan_for_tables(&collector);
13616 }
13617
13618 while (to_recalc.size() < rocksdb_stats_recalc_rate &&
13619 !rdb_indexes_to_recalc.empty()) {
13620 const auto index_id = rdb_indexes_to_recalc.back();
13621 rdb_indexes_to_recalc.pop_back();
13622
13623 std::shared_ptr<const Rdb_key_def> keydef =
13624 ddl_manager.safe_find(index_id);
13625
13626 if (keydef) {
13627 to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef));
13628 }
13629 }
13630
13631 if (!to_recalc.empty()) {
13632 calculate_stats(to_recalc, false);
13633 }
13634 }
13635
13636 }
13637
13638 // save remaining stats which might've left unsaved
13639 ddl_manager.persist_stats();
13640 }
13641
13642 /*
13643 A background thread to handle manual compactions,
13644 except for dropping indexes/tables. Every second, it checks
13645 pending manual compactions, and it calls CompactRange if there is.
13646 */
13647 void Rdb_manual_compaction_thread::run() {
13648 mysql_mutex_init(0, &m_mc_mutex, MY_MUTEX_INIT_FAST);
13649 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
13650 for (;;) {
13651 if (m_stop) {
13652 break;
13653 }
13654 timespec ts;
13655 set_timespec(ts, 1);
13656
13657 const auto ret MY_ATTRIBUTE((__unused__)) =
13658 mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
13659 if (m_stop) {
13660 break;
13661 }
13662 // make sure, no program error is returned
13663 DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
13664 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
13665
13666 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13667 // Grab the first item and proceed, if not empty.
13668 if (m_requests.empty()) {
13669 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13670 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
13671 continue;
13672 }
13673 Manual_compaction_request &mcr = m_requests.begin()->second;
13674 DBUG_ASSERT(mcr.cf != nullptr);
13675 DBUG_ASSERT(mcr.state == Manual_compaction_request::INITED);
13676 mcr.state = Manual_compaction_request::RUNNING;
13677 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13678
13679 DBUG_ASSERT(mcr.state == Manual_compaction_request::RUNNING);
13680 // NO_LINT_DEBUG
13681 sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id,
13682 mcr.cf->GetName().c_str());
13683 rocksdb_manual_compactions_running++;
13684 if (rocksdb_debug_manual_compaction_delay > 0) {
13685 my_sleep(rocksdb_debug_manual_compaction_delay * 1000000);
13686 }
13687 // CompactRange may take a very long time. On clean shutdown,
13688 // it is cancelled by CancelAllBackgroundWork, then status is
13689 // set to shutdownInProgress.
13690 const rocksdb::Status s = rdb->CompactRange(
13691 getCompactRangeOptions(mcr.concurrency), mcr.cf, mcr.start, mcr.limit);
13692 rocksdb_manual_compactions_running--;
13693 if (s.ok()) {
13694 // NO_LINT_DEBUG
13695 sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id,
13696 mcr.cf->GetName().c_str());
13697 } else {
13698 // NO_LINT_DEBUG
13699 sql_print_information("Manual Compaction id %d cf %s aborted. %s",
13700 mcr.mc_id, mcr.cf->GetName().c_str(), s.getState());
13701 if (!s.IsShutdownInProgress()) {
13702 rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
13703 } else {
13704 DBUG_ASSERT(m_requests.size() == 1);
13705 }
13706 }
13707 rocksdb_manual_compactions_processed++;
13708 clear_manual_compaction_request(mcr.mc_id, false);
13709 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
13710 }
13711 clear_all_manual_compaction_requests();
13712 DBUG_ASSERT(m_requests.empty());
13713 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
13714 mysql_mutex_destroy(&m_mc_mutex);
13715 }
13716
// Drop every pending/running manual compaction request. Called during
// thread shutdown, after the compaction loop has finished.
void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() {
  RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
  m_requests.clear();
  RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
}
13722
13723 void Rdb_manual_compaction_thread::clear_manual_compaction_request(
13724 int mc_id, bool init_only) {
13725 bool erase = true;
13726 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13727 auto it = m_requests.find(mc_id);
13728 if (it != m_requests.end()) {
13729 if (init_only) {
13730 Manual_compaction_request mcr = it->second;
13731 if (mcr.state != Manual_compaction_request::INITED) {
13732 erase = false;
13733 }
13734 }
13735 if (erase) {
13736 m_requests.erase(it);
13737 }
13738 } else {
13739 // Current code path guarantees that erasing by the same mc_id happens
13740 // at most once. INITED state may be erased by a thread that requested
13741 // the compaction. RUNNING state is erased by mc thread only.
13742 DBUG_ASSERT(0);
13743 }
13744 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13745 }
13746
13747 int Rdb_manual_compaction_thread::request_manual_compaction(
13748 rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice *start,
13749 rocksdb::Slice *limit, int concurrency) {
13750 int mc_id = -1;
13751 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13752 if (m_requests.size() >= rocksdb_max_manual_compactions) {
13753 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13754 return mc_id;
13755 }
13756 Manual_compaction_request mcr;
13757 mc_id = mcr.mc_id = ++m_latest_mc_id;
13758 mcr.state = Manual_compaction_request::INITED;
13759 mcr.cf = cf;
13760 mcr.start = start;
13761 mcr.limit = limit;
13762 mcr.concurrency = concurrency;
13763 m_requests.insert(std::make_pair(mcr.mc_id, mcr));
13764 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13765 return mc_id;
13766 }
13767
13768 bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) {
13769 bool finished = false;
13770 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13771 if (m_requests.count(mc_id) == 0) {
13772 finished = true;
13773 }
13774 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13775 return finished;
13776 }
13777
13778 /**
13779 * Locking read + Not Found + Read Committed occurs if we accessed
13780 * a row by Seek, tried to lock it, failed, released and reacquired the
13781 * snapshot (because of READ COMMITTED mode) and the row was deleted by
13782 * someone else in the meantime.
13783 * If so, we either just skipping the row, or re-creating a snapshot
13784 * and seek again. In both cases, Read Committed constraint is not broken.
13785 */
13786 bool ha_rocksdb::should_skip_invalidated_record(const int rc) {
13787 if ((m_lock_rows != RDB_LOCK_NONE && rc == HA_ERR_KEY_NOT_FOUND &&
13788 my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED)) {
13789 return true;
13790 }
13791 return false;
13792 }
13793 /**
13794 * Indicating snapshot needs to be re-created and retrying seek again,
13795 * instead of returning errors or empty set. This is normally applicable
13796 * when hitting kBusy when locking the first row of the transaction,
13797 * with Repeatable Read isolation level.
13798 */
13799 bool ha_rocksdb::should_recreate_snapshot(const int rc,
13800 const bool is_new_snapshot) {
13801 if (should_skip_invalidated_record(rc) ||
13802 (rc == HA_ERR_ROCKSDB_STATUS_BUSY && is_new_snapshot)) {
13803 return true;
13804 }
13805 return false;
13806 }
13807
13808 /**
13809 * If calling put/delete/singledelete without locking the row,
13810 * it is necessary to pass assume_tracked=false to RocksDB TX API.
13811 * Read Free Replication and Blind Deletes are the cases when
13812 * using TX API and skipping row locking.
13813 */
13814 bool ha_rocksdb::can_assume_tracked(THD *thd) {
13815 if (/* MARIAROCKS_NOT_YET use_read_free_rpl() ||*/ (THDVAR(thd, blind_delete_primary_key))) {
13816 return false;
13817 }
13818 return true;
13819 }
13820
13821 bool ha_rocksdb::check_bloom_and_set_bounds(
13822 THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
13823 const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
13824 uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
13825 rocksdb::Slice *upper_bound_slice) {
13826 bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
13827 if (!can_use_bloom) {
13828 setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound,
13829 lower_bound_slice, upper_bound_slice);
13830 }
13831 return can_use_bloom;
13832 }
13833
13834 /**
13835 Deciding if it is possible to use bloom filter or not.
13836
13837 @detail
13838 Even if bloom filter exists, it is not always possible
13839 to use bloom filter. If using bloom filter when you shouldn't,
13840 false negative may happen -- fewer rows than expected may be returned.
13841 It is users' responsibility to use bloom filter correctly.
13842
13843 If bloom filter does not exist, return value does not matter because
13844 RocksDB does not use bloom filter internally.
13845
13846 @param kd
13847 @param eq_cond Equal condition part of the key. This always includes
13848 system index id (4 bytes).
13849 @param use_all_keys True if all key parts are set with equal conditions.
13850 This is aware of extended keys.
13851 */
13852 bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
13853 const rocksdb::Slice &eq_cond,
13854 const bool use_all_keys) {
13855 bool can_use = false;
13856
13857 if (THDVAR(thd, skip_bloom_filter_on_read)) {
13858 return can_use;
13859 }
13860
13861 const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
13862 if (prefix_extractor) {
13863 /*
13864 This is an optimized use case for CappedPrefixTransform.
13865 If eq_cond length >= prefix extractor length and if
13866 all keys are used for equal lookup, it is
13867 always possible to use bloom filter.
13868
13869 Prefix bloom filter can't be used on descending scan with
13870 prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
13871 RocksDB's limitation. On ascending (or not sorting) scan,
13872 keys longer than the capped prefix length will be truncated down
13873 to the capped length and the resulting key is added to the bloom filter.
13874
13875 Keys shorter than the capped prefix length will be added to
13876 the bloom filter. When keys are looked up, key conditionals
13877 longer than the capped length can be used; key conditionals
13878 shorter require all parts of the key to be available
13879 for the short key match.
13880 */
13881 if ((use_all_keys && prefix_extractor->InRange(eq_cond)) ||
13882 prefix_extractor->SameResultWhenAppended(eq_cond)) {
13883 can_use = true;
13884 } else {
13885 can_use = false;
13886 }
13887 } else {
13888 /*
13889 if prefix extractor is not defined, all key parts have to be
13890 used by eq_cond.
13891 */
13892 if (use_all_keys) {
13893 can_use = true;
13894 } else {
13895 can_use = false;
13896 }
13897 }
13898
13899 return can_use;
13900 }
13901
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return *rocksdb_tbl_options;
}

// TTL feature flags, backed by the corresponding global system variables.
bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
// Debug-only hooks that expose the TTL debug sysvars to other modules.
#ifndef DBUG_OFF
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
13923
13924 void rdb_update_global_stats(const operation_type &type, uint count,
13925 bool is_system_table) {
13926 DBUG_ASSERT(type < ROWS_MAX);
13927
13928 if (count == 0) {
13929 return;
13930 }
13931
13932 if (is_system_table) {
13933 global_stats.system_rows[type].add(count);
13934 } else {
13935 global_stats.rows[type].add(count);
13936 }
13937 }
13938
13939 int rdb_get_table_perf_counters(const char *const tablename,
13940 Rdb_perf_counters *const counters) {
13941 DBUG_ASSERT(tablename != nullptr);
13942
13943 Rdb_table_handler *table_handler;
13944 table_handler = rdb_open_tables.get_table_handler(tablename);
13945 if (table_handler == nullptr) {
13946 return HA_ERR_ROCKSDB_INVALID_TABLE;
13947 }
13948
13949 counters->load(table_handler->m_table_perf_context);
13950
13951 rdb_open_tables.release_table_handler(table_handler);
13952 return HA_EXIT_SUCCESS;
13953 }
13954
13955 const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) {
13956 // If this assertion fails then this means that a member has been either added
13957 // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be
13958 // changed to return the appropriate value.
13959 static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types.");
13960
13961 switch (err_type) {
13962 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT:
13963 return "RDB_IO_ERROR_TX_COMMIT";
13964 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT:
13965 return "RDB_IO_ERROR_DICT_COMMIT";
13966 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD:
13967 return "RDB_IO_ERROR_BG_THREAD";
13968 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL:
13969 return "RDB_IO_ERROR_GENERAL";
13970 default:
13971 DBUG_ASSERT(false);
13972 return "(unknown)";
13973 }
13974 }
13975
// In case of core dump generation we want this function NOT to be optimized
// so that we can capture as much data as possible to debug the root cause
// more efficiently.
#ifdef __GNUC__
#endif
/*
  Central handler for non-OK RocksDB statuses.

  I/O errors and data corruption are treated as fatal: the error is logged
  and the server is aborted, because continuing could persist inconsistent
  data. For other non-OK statuses, only dictionary-commit failures abort;
  the rest are logged and execution continues.

  @param status    RocksDB status to examine; an OK status is a no-op.
  @param err_type  The context in which the status was produced.
*/
void rdb_handle_io_error(const rocksdb::Status status,
                         const RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    /* skip dumping core if write failed and we are allowed to do so */
#ifdef MARIAROCKS_NOT_YET
    if (skip_core_dump_on_error) {
      opt_core_file = false;
    }
#endif
    switch (err_type) {
      case RDB_IO_ERROR_TX_COMMIT:
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "failed to write to WAL");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_BG_THREAD: {
        rdb_log_status_error(status, "BG thread failed to write to RocksDB");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on BG write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_GENERAL: {
        rdb_log_status_error(status, "failed on I/O");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on I/O error.");
        abort();
        break;
      }
      default:
        DBUG_ASSERT(0);
        break;
    }
  } else if (status.IsCorruption()) {
    // Persist a marker so the corruption is still known after restart.
    rdb_log_status_error(status, "data corruption detected!");
    rdb_persist_corruption_marker();
    /* NO_LINT_DEBUG */
    sql_print_error("MyRocks: aborting because of data corruption.");
    abort();
  } else if (!status.ok()) {
    switch (err_type) {
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "Failed to write to WAL (dictionary)");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      default:
        rdb_log_status_error(status, "Failed to read/write in RocksDB");
        break;
    }
  }
}
#ifdef __GNUC__
#endif
// Accessors for the file-scope singleton manager objects.
Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }

Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }

Rdb_binlog_manager *rdb_get_binlog_manager(void) { return &binlog_manager; }
14045
14046 void rocksdb_set_compaction_options(
14047 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14048 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14049 void *const var_ptr, const void *const save) {
14050 if (var_ptr && save) {
14051 *(uint64_t *)var_ptr = *(const uint64_t *)save;
14052 }
14053 const Rdb_compact_params params = {
14054 (uint64_t)rocksdb_compaction_sequential_deletes,
14055 (uint64_t)rocksdb_compaction_sequential_deletes_window,
14056 (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
14057 if (properties_collector_factory) {
14058 properties_collector_factory->SetCompactionParams(params);
14059 }
14060 }
14061
14062 void rocksdb_set_table_stats_sampling_pct(
14063 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14064 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14065 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14066 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14067
14068 const uint32_t new_val = *static_cast<const uint32_t *>(save);
14069
14070 if (new_val != rocksdb_table_stats_sampling_pct) {
14071 rocksdb_table_stats_sampling_pct = new_val;
14072
14073 if (properties_collector_factory) {
14074 properties_collector_factory->SetTableStatsSamplingPct(
14075 rocksdb_table_stats_sampling_pct);
14076 }
14077 }
14078
14079 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14080 }
14081
14082 /*
14083 This function allows setting the rate limiter's bytes per second value
14084 but only if the rate limiter is turned on which has to be done at startup.
14085 If the rate is already 0 (turned off) or we are changing it to 0 (trying
14086 to turn it off) this function will push a warning to the client and do
14087 nothing.
14088 This is similar to the code in innodb_doublewrite_update (found in
14089 storage/innobase/handler/ha_innodb.cc).
14090 */
14091 void rocksdb_set_rate_limiter_bytes_per_sec(
14092 my_core::THD *const thd,
14093 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14094 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14095 const uint64_t new_val = *static_cast<const uint64_t *>(save);
14096 if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
14097 /*
14098 If a rate_limiter was not enabled at startup we can't change it nor
14099 can we disable it if one was created at startup
14100 */
14101 push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
14102 "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
14103 "be dynamically changed to or from 0. Do a clean "
14104 "shutdown if you want to change it from or to 0.");
14105 } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
14106 /* Apply the new value to the rate limiter and store it locally */
14107 DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
14108 rocksdb_rate_limiter_bytes_per_sec = new_val;
14109 rocksdb_rate_limiter->SetBytesPerSecond(new_val);
14110 }
14111 }
14112
14113 void rocksdb_set_sst_mgr_rate_bytes_per_sec(
14114 my_core::THD *const thd,
14115 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14116 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14117 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14118
14119 const uint64_t new_val = *static_cast<const uint64_t *>(save);
14120
14121 if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
14122 rocksdb_sst_mgr_rate_bytes_per_sec = new_val;
14123
14124 rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
14125 rocksdb_sst_mgr_rate_bytes_per_sec);
14126 }
14127
14128 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14129 }
14130
/*
  Sys_var update hook for rocksdb_delayed_write_rate: record the new value
  and push it to the live DB via SetDBOptions(); a RocksDB-side failure is
  logged as a warning but does not roll back the stored value.
*/
void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
                                    void *var_ptr, const void *save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const uint64_t new_val = *static_cast<const uint64_t *>(save);
  if (rocksdb_delayed_write_rate != new_val) {
    rocksdb_delayed_write_rate = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"delayed_write_rate", std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning(
          "MyRocks: failed to update delayed_write_rate. "
          "status code = %d, status = %s",
          s.code(), s.ToString().c_str());
    }
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14150
14151 void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
14152 void *var_ptr, const void *save) {
14153 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14154 const uint32_t new_val = *static_cast<const uint32_t *>(save);
14155 if (rocksdb_max_latest_deadlocks != new_val) {
14156 rocksdb_max_latest_deadlocks = new_val;
14157 rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
14158 }
14159 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14160 }
14161
/*
  Replace the set of table-name patterns exempt from the strict-collation
  check. Invalid patterns are reported to the client as warnings.
*/
void rdb_set_collation_exception_list(const char *const exception_list) {
  DBUG_ASSERT(rdb_collation_exceptions != nullptr);

  if (!rdb_collation_exceptions->set_patterns(exception_list)) {
    my_core::warn_about_bad_patterns(rdb_collation_exceptions,
                                     "strict_collation_exceptions");
  }
}
14170
/*
  Sys_var update hook for rocksdb_strict_collation_exceptions: apply the
  new pattern list, then store a heap-owned copy of the string in the
  variable's backing storage.
*/
void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  rdb_set_collation_exception_list(val == nullptr ? "" : val);

  //psergey-todo: what is the purpose of the below??
  // NOTE(review): this appears to make *var_ptr own its own my_strdup'ed
  // copy (freeing the previous one) so the sysvar storage does not alias
  // the transient `save` buffer -- confirm against the sys_var framework.
  const char *val_copy= val? my_strdup(val, MYF(0)): nullptr;
  my_free(*static_cast<char**>(var_ptr));
  *static_cast<const char**>(var_ptr) = val_copy;
}
14184
14185 int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) {
14186 int new_value_type = value->value_type(value);
14187 if (new_value_type == MYSQL_VALUE_TYPE_STRING) {
14188 char buf[16];
14189 int len = sizeof(buf);
14190 const char *str = value->val_str(value, buf, &len);
14191 if (str && (my_strcasecmp(system_charset_info, "true", str) == 0 ||
14192 my_strcasecmp(system_charset_info, "on", str) == 0)) {
14193 *return_value = TRUE;
14194 } else if (str && (my_strcasecmp(system_charset_info, "false", str) == 0 ||
14195 my_strcasecmp(system_charset_info, "off", str) == 0)) {
14196 *return_value = FALSE;
14197 } else {
14198 return 1;
14199 }
14200 } else if (new_value_type == MYSQL_VALUE_TYPE_INT) {
14201 long long intbuf;
14202 value->val_int(value, &intbuf);
14203 if (intbuf > 1) return 1;
14204 *return_value = intbuf > 0 ? TRUE : FALSE;
14205 } else {
14206 return 1;
14207 }
14208
14209 return 0;
14210 }
14211
/*
  Sys_var check hook for rocksdb_bulk_load.

  Before the variable is toggled, finalize any SST files from an
  in-progress bulk load on this connection. If finalization fails with a
  critical error, the SET is rejected and bulk_load is switched off for
  the session.

  @return 0 to accept the new value (stored into *save), 1 to reject it.
*/
int rocksdb_check_bulk_load(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (tx != nullptr) {
    bool is_critical_error;
    const int rc = tx->finish_bulk_load(&is_critical_error);
    if (rc != 0 && is_critical_error) {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: Error %d finalizing last SST file while "
          "setting bulk loading variable",
          rc);
      THDVAR(thd, bulk_load) = 0;
      return 1;
    }
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
14238
/*
  Sys_var check hook for rocksdb_bulk_load_allow_unsorted. The setting may
  not be changed while a bulk load is active on this session.

  @return 0 to accept the new value (stored into *save), 1 to reject it.
*/
int rocksdb_check_bulk_load_allow_unsorted(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  if (THDVAR(thd, bulk_load)) {
    my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET",
             "Cannot change this setting while bulk load is enabled");

    return 1;
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
14257
/*
  Sys_var update hook for rocksdb_max_background_jobs: record the new
  value in the cached DB options and apply it to the live DB via
  SetDBOptions(); a RocksDB-side failure is logged as a warning but does
  not roll back the cached value.
*/
static void rocksdb_set_max_background_jobs(THD *thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rocksdb_db_options != nullptr);
  DBUG_ASSERT(rocksdb_db_options->env != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const int new_val = *static_cast<const int *>(save);

  if (rocksdb_db_options->max_background_jobs != new_val) {
    rocksdb_db_options->max_background_jobs = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"max_background_jobs", std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning(
          "MyRocks: failed to update max_background_jobs. "
          "Status code = %d, status = %s.",
          s.code(), s.ToString().c_str());
    }
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14286
14287 static void rocksdb_set_bytes_per_sync(
14288 THD *thd MY_ATTRIBUTE((__unused__)),
14289 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14290 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14291 DBUG_ASSERT(save != nullptr);
14292 DBUG_ASSERT(rocksdb_db_options != nullptr);
14293 DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14294
14295 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14296
14297 const ulonglong new_val = *static_cast<const ulonglong *>(save);
14298
14299 if (rocksdb_db_options->bytes_per_sync != new_val) {
14300 rocksdb_db_options->bytes_per_sync = new_val;
14301 rocksdb::Status s =
14302 rdb->SetDBOptions({{"bytes_per_sync", std::to_string(new_val)}});
14303
14304 if (!s.ok()) {
14305 /* NO_LINT_DEBUG */
14306 sql_print_warning(
14307 "MyRocks: failed to update max_background_jobs. "
14308 "Status code = %d, status = %s.",
14309 s.code(), s.ToString().c_str());
14310 }
14311 }
14312
14313 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14314 }
14315
14316 static void rocksdb_set_wal_bytes_per_sync(
14317 THD *thd MY_ATTRIBUTE((__unused__)),
14318 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14319 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14320 DBUG_ASSERT(save != nullptr);
14321 DBUG_ASSERT(rocksdb_db_options != nullptr);
14322 DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14323
14324 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14325
14326 const ulonglong new_val = *static_cast<const ulonglong *>(save);
14327
14328 if (rocksdb_db_options->wal_bytes_per_sync != new_val) {
14329 rocksdb_db_options->wal_bytes_per_sync = new_val;
14330 rocksdb::Status s =
14331 rdb->SetDBOptions({{"wal_bytes_per_sync", std::to_string(new_val)}});
14332
14333 if (!s.ok()) {
14334 /* NO_LINT_DEBUG */
14335 sql_print_warning(
14336 "MyRocks: failed to update max_background_jobs. "
14337 "Status code = %d, status = %s.",
14338 s.code(), s.ToString().c_str());
14339 }
14340 }
14341
14342 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14343 }
14344
14345 /*
14346 Validating and updating block cache size via sys_var::check path.
14347 SetCapacity may take seconds when reducing block cache, and
14348 sys_var::update holds LOCK_global_system_variables mutex, so
14349 updating block cache size is done at check path instead.
14350 */
14351 static int rocksdb_validate_set_block_cache_size(
14352 THD *thd MY_ATTRIBUTE((__unused__)),
14353 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14354 void *var_ptr, struct st_mysql_value *value) {
14355 DBUG_ASSERT(value != nullptr);
14356
14357 long long new_value;
14358
14359 /* value is NULL */
14360 if (value->val_int(value, &new_value)) {
14361 return HA_EXIT_FAILURE;
14362 }
14363
14364 if (new_value < RDB_MIN_BLOCK_CACHE_SIZE ||
14365 (uint64_t)new_value > (uint64_t)LLONG_MAX) {
14366 return HA_EXIT_FAILURE;
14367 }
14368
14369 RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex);
14370 const rocksdb::BlockBasedTableOptions &table_options =
14371 rdb_get_table_options();
14372
14373 if (rocksdb_block_cache_size != new_value && table_options.block_cache) {
14374 table_options.block_cache->SetCapacity(new_value);
14375 }
14376 *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
14377 RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex);
14378 return HA_EXIT_SUCCESS;
14379 }
14380
14381 static int rocksdb_validate_update_cf_options(
14382 THD * /* unused */, struct st_mysql_sys_var * /*unused*/, void *save,
14383 struct st_mysql_value *value) {
14384 char buff[STRING_BUFFER_USUAL_SIZE];
14385 const char *str;
14386 int length;
14387 length = sizeof(buff);
14388 str = value->val_str(value, buff, &length);
14389 // In some cases, str can point to buff in the stack.
14390 // This can cause invalid memory access after validation is finished.
14391 // To avoid this kind case, let's alway duplicate the str if str is not
14392 // nullptr
14393 *(const char **)save = (str == nullptr) ? nullptr : my_strdup(str, MYF(0));
14394
14395 if (str == nullptr) {
14396 return HA_EXIT_SUCCESS;
14397 }
14398
14399 Rdb_cf_options::Name_to_config_t option_map;
14400
14401 // Basic sanity checking and parsing the options into a map. If this fails
14402 // then there's no point to proceed.
14403 if (!Rdb_cf_options::parse_cf_options(str, &option_map)) {
14404 my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str);
14405 // Free what we've copied with my_strdup above.
14406 my_free((void*)(*(const char **)save));
14407 return HA_EXIT_FAILURE;
14408 }
14409 // Loop through option_map and create missing column families
14410 for (Rdb_cf_options::Name_to_config_t::iterator it = option_map.begin();
14411 it != option_map.end(); ++it) {
14412 cf_manager.get_or_create_cf(rdb, it->first);
14413 }
14414 return HA_EXIT_SUCCESS;
14415 }
14416
/*
  sys_var::update handler for rocksdb_update_cf_options.

  `save` holds the string duplicated by rocksdb_validate_update_cf_options
  (or nullptr). The string is parsed into a per-CF option map, applied to
  every existing column family via DB::SetOptions(), and the cached option
  strings are refreshed so I_S.ROCKSDB_CF_OPTIONS stays accurate.
*/
static void rocksdb_set_update_cf_options(
    THD *const /* unused */, struct st_mysql_sys_var *const /* unused */,
    void *const var_ptr, const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // Release the previous value of the variable before installing the new one.
  my_free(*reinterpret_cast<char **>(var_ptr));

  if (!val) {
    *reinterpret_cast<char **>(var_ptr) = nullptr;
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  // Redundant after the early return above; kept as a belt-and-braces check.
  DBUG_ASSERT(val != nullptr);

  // Reset the pointers regardless of how much success we had with updating
  // the CF options. This results in consistent behavior and avoids
  // dealing with cases when only a subset of CF-s was successfully updated.
  *reinterpret_cast<const char **>(var_ptr) = val;

  // Do the real work of applying the changes.
  Rdb_cf_options::Name_to_config_t option_map;

  // This should never fail, because of rocksdb_validate_update_cf_options
  if (!Rdb_cf_options::parse_cf_options(val, &option_map)) {
    // NOTE(review): on this (supposedly unreachable) path var_ptr is left
    // pointing at the memory freed here — confirm the caller never reads
    // it afterwards before assuming this is safe.
    my_free(*reinterpret_cast<char**>(var_ptr));
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  // For each CF we have, see if we need to update any settings.
  for (const auto &cf_name : cf_manager.get_cf_names()) {
    DBUG_ASSERT(!cf_name.empty());

    rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
    DBUG_ASSERT(cfh != nullptr);

    // An empty string means no change requested for this CF.
    const auto it = option_map.find(cf_name);
    std::string per_cf_options = (it != option_map.end()) ? it->second : "";

    if (!per_cf_options.empty()) {
      Rdb_cf_options::Name_to_config_t opt_map;
      rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map);

      if (s != rocksdb::Status::OK()) {
        // NO_LINT_DEBUG
        sql_print_warning(
            "MyRocks: failed to convert the options for column "
            "family '%s' to a map. %s",
            cf_name.c_str(), s.ToString().c_str());
      } else {
        DBUG_ASSERT(rdb != nullptr);

        // Finally we can apply the options.
        s = rdb->SetOptions(cfh, opt_map);

        if (s != rocksdb::Status::OK()) {
          // NO_LINT_DEBUG
          sql_print_warning(
              "MyRocks: failed to apply the options for column "
              "family '%s'. %s",
              cf_name.c_str(), s.ToString().c_str());
        } else {
          // NO_LINT_DEBUG
          sql_print_information(
              "MyRocks: options for column family '%s' "
              "have been successfully updated.",
              cf_name.c_str());

          // Make sure that data is internally consistent as well and update
          // the CF options. This is necessary also to make sure that the CF
          // options will be correctly reflected in the relevant table:
          // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA.
          rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh);
          std::string updated_options;

          // Serialize the effective options back to a string for caching.
          s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options,
                                                        cf_options);

          DBUG_ASSERT(s == rocksdb::Status::OK());
          DBUG_ASSERT(!updated_options.empty());

          cf_manager.update_options_map(cf_name, updated_options);
        }
      }
    }
  }

  // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to
  // free up resources used before.

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14512
14513 void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
14514
14515 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
14516
// Mark the start of applying a replication Delete_rows event on this handler.
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
14524
// Mark the end of applying a replication Delete_rows event on this handler.
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
14532
// Mark the start of applying a replication Update_rows event on this handler.
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
14540
// Mark the end of applying a replication Update_rows event on this handler.
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
14548
#if 0
// Disabled pending read-free replication support (see MDEV-10976).
// True when the table share exists and the table definition carries the
// read-free-replication flag.
bool ha_rocksdb::is_read_free_rpl_table() const {
  return table->s && m_tbl_def->m_is_read_free_rpl_table;
}
#endif
14554
14555 /**
14556 @brief
14557 Read Free Replication can be used or not. Returning true means
14558 Read Free Replication can be used.
14559 */
14560 bool ha_rocksdb::use_read_free_rpl() const {
14561 DBUG_ENTER_FUNC();
14562
14563 if (!ha_thd()->rli_slave || table->triggers || /* !is_read_free_rpl_table()*/ ) {
14564 DBUG_RETURN(false);
14565 }
14566
14567 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
14568 switch (rocksdb_read_free_rpl) {
14569 case read_free_rpl_type::OFF:
14570 DBUG_RETURN(false);
14571 case read_free_rpl_type::PK_ONLY:
14572 DBUG_RETURN(!has_hidden_pk(table) && table->s->keys == 1);
14573 case read_free_rpl_type::PK_SK:
14574 DBUG_RETURN(!has_hidden_pk(table));
14575 }
14576 #else
14577 DBUG_RETURN(false);
14578 #endif
14579
14580 DBUG_ASSERT(false);
14581 DBUG_RETURN(false);
14582 }
14583 #endif // MARIAROCKS_NOT_YET
14584
14585 double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
14586 DBUG_ENTER_FUNC();
14587
14588 if (index != table->s->primary_key) {
14589 /* Non covering index range scan */
14590 DBUG_RETURN(handler::read_time(index, ranges, rows));
14591 }
14592
14593 DBUG_RETURN((rows / 20.0) + 1);
14594 }
14595
14596 void ha_rocksdb::print_error(int error, myf errflag) {
14597 if (error == HA_ERR_ROCKSDB_STATUS_BUSY) {
14598 error = HA_ERR_LOCK_DEADLOCK;
14599 }
14600 handler::print_error(error, errflag);
14601 }
14602
14603 std::string rdb_corruption_marker_file_name() {
14604 std::string ret(rocksdb_datadir);
14605 ret.append("/ROCKSDB_CORRUPTED");
14606 return ret;
14607 }
14608
14609 void sql_print_verbose_info(const char *format, ...)
14610 {
14611 va_list args;
14612
14613 if (global_system_variables.log_warnings > 2) {
14614 va_start(args, format);
14615 sql_print_information_v(format, args);
14616 va_end(args);
14617 }
14618 }
14619
14620 } // namespace myrocks
14621
14622
14623 /**
14624 Construct and emit duplicate key error message using information
14625 from table's record buffer.
14626
14627 @sa print_keydup_error(table, key, msg, errflag, thd, org_table_name).
14628 */
14629
14630 void print_keydup_error(TABLE *table, KEY *key, myf errflag,
14631 const THD *thd, const char *org_table_name)
14632 {
14633 print_keydup_error(table, key, ER(ER_DUP_ENTRY_WITH_KEY_NAME), errflag);
14634 }
14635
14636 /*
14637 Register the storage engine plugin outside of myrocks namespace
14638 so that mysql_declare_plugin does not get confused when it does
14639 its name generation.
14640 */
14641
14642
/* Handlerton descriptor required by the storage engine plugin API. */
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};

/*
  Main storage engine plugin declaration, followed by the
  INFORMATION_SCHEMA table plugins that MyRocks provides.
*/
maria_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
    &rocksdb_storage_engine,           /* Plugin Descriptor */
    "ROCKSDB",                         /* Plugin Name */
    "Monty Program Ab",                /* Plugin Author */
    "RocksDB storage engine",          /* Plugin Description */
    PLUGIN_LICENSE_GPL,                /* Plugin Licence */
    myrocks::rocksdb_init_func,        /* Plugin Entry Point */
    myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
    0x0001,                            /* version number (0.1) */
    myrocks::rocksdb_status_vars,      /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    "1.0",                             /* string version */
    myrocks::MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL
},
    /* INFORMATION_SCHEMA plugins exposing MyRocks internals. */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map,
    myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info
maria_declare_plugin_end;
14669