1 /*
2 Copyright (c) 2012, Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
16
17 #ifdef USE_PRAGMA_IMPLEMENTATION
18 #pragma implementation // gcc: Class implementation
19 #endif
20
21 #define MYSQL_SERVER 1
22
23 /* For use of 'PRIu64': */
24 #define __STDC_FORMAT_MACROS
25
26 #include <my_global.h>
27
28 #include <inttypes.h>
29
30 /* The C++ file's header */
31 #include "./ha_rocksdb.h"
32
33 /* C++ standard header files */
34 #include <inttypes.h>
35 #include <algorithm>
36 #include <limits>
37 #include <map>
38 #include <queue>
39 #include <set>
40 #include <string>
41 #include <vector>
42
43 /* MySQL includes */
44 #include <mysql/psi/mysql_table.h>
45 // #include <thread_pool_priv.h>
46 #include <mysys_err.h>
47 #include "./debug_sync.h"
48 #include "./my_bit.h"
49 #include "./my_stacktrace.h"
50 #include "./my_sys.h"
51 #include "./sql_audit.h"
52 #include "./sql_table.h"
53 #include "./sql_hset.h"
54 #ifdef MARIAROCKS_NOT_YET
55 #endif
56
57 /* RocksDB includes */
58 #include "monitoring/histogram.h"
59 #include "rocksdb/compaction_filter.h"
60 #include "rocksdb/env.h"
61 #include "rocksdb/memory_allocator.h"
62 #include "rocksdb/persistent_cache.h"
63 #include "rocksdb/rate_limiter.h"
64 #include "rocksdb/slice_transform.h"
65 #include "rocksdb/thread_status.h"
66 #include "rocksdb/utilities/checkpoint.h"
67 #include "rocksdb/utilities/convenience.h"
68 #include "rocksdb/utilities/memory_util.h"
69 #include "rocksdb/utilities/sim_cache.h"
70 #include "rocksdb/utilities/write_batch_with_index.h"
71 #include "util/stop_watch.h"
72 #include "./rdb_source_revision.h"
73
74 // MariaRocks: this is needed to access RocksDB debug syncpoints:
75 #include "test_util/sync_point.h"
76
77 /* MyRocks includes */
78 #include "./event_listener.h"
79 #include "./ha_rocksdb_proto.h"
80 #include "./logger.h"
81 #include "./nosql_access.h"
82 #include "./rdb_cf_manager.h"
83 #include "./rdb_cf_options.h"
84 #include "./rdb_converter.h"
85 #include "./rdb_datadic.h"
86 #include "./rdb_i_s.h"
87 #include "./rdb_index_merge.h"
88 #include "./rdb_mutex_wrapper.h"
89 #include "./rdb_psi.h"
90 #include "./rdb_threads.h"
91 #include "./rdb_mariadb_server_port.h"
92
93 // Internal MySQL APIs not exposed in any header.
94 extern "C" {
95 /**
96 Mark transaction to rollback and mark error as fatal to a sub-statement.
97 @param thd Thread handle
98 @param all TRUE <=> rollback main transaction.
99 */
100 void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
101
102 /**
103 * Get the user thread's binary logging format
104 * @param thd user thread
105 * @return Value to be used as index into the binlog_format_names array
106 */
107 int thd_binlog_format(const MYSQL_THD thd);
108
109 /**
110 * Check if binary logging is filtered for thread's current db.
111 * @param thd Thread handle
112 * @retval 1 the query is not filtered, 0 otherwise.
113 */
114 bool thd_binlog_filter_ok(const MYSQL_THD thd);
115 }
116
117 extern my_bool opt_core_file;
118
119 // Needed in rocksdb_init_func
120 void ignore_db_dirs_append(const char *dirname_arg);
121
122
123 namespace myrocks {
124
125 static st_global_stats global_stats;
126 static st_export_stats export_stats;
127 static st_memory_stats memory_stats;
128 static st_io_stall_stats io_stall_stats;
129
130 const std::string DEFAULT_CF_NAME("default");
131 const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
132 const std::string PER_INDEX_CF_NAME("$per_index_cf");
133
134 static std::vector<GL_INDEX_ID> rdb_indexes_to_recalc;
135
#ifdef MARIADB_NOT_YET
/*
  Wrapper around a user-created ("explicit") RocksDB snapshot.

  Each instance owns a rocksdb::ManagedSnapshot and is registered, keyed by a
  monotonically increasing snapshot id, in a process-wide registry of weak
  pointers so that it can be looked up and dumped while alive. Registration,
  lookup and deregistration are all serialized on explicit_snapshot_mutex.
*/
class Rdb_explicit_snapshot : public explicit_snapshot {
 public:
  /*
    Wrap 'snapshot' in a ManagedSnapshot, assign it the next snapshot id
    (written back into *ss_info) and register it in the global registry.
    Returns nullptr on allocation failure.
  */
  static std::shared_ptr<Rdb_explicit_snapshot> create(
      snapshot_info_st *ss_info, rocksdb::DB *db,
      const rocksdb::Snapshot *snapshot) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto s = std::unique_ptr<rocksdb::ManagedSnapshot>(
        new rocksdb::ManagedSnapshot(db, snapshot));
    if (!s) {
      return nullptr;
    }
    ss_info->snapshot_id = ++explicit_snapshot_counter;
    auto ret = std::make_shared<Rdb_explicit_snapshot>(*ss_info, std::move(s));
    if (!ret) {
      return nullptr;
    }
    explicit_snapshots[ss_info->snapshot_id] = ret;
    return ret;
  }

  /* Human-readable dump of every live explicit snapshot (diagnostics). */
  static std::string dump_snapshots() {
    std::string str;
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    for (const auto &elem : explicit_snapshots) {
      const auto &ss = elem.second.lock();
      // The destructor erases its entry under the same mutex, so an entry
      // seen here is expected to still be alive.
      DBUG_ASSERT(ss != nullptr);
      const auto &info = ss->ss_info;
      str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) +
             "\nBinlog File: " + info.binlog_file +
             "\nBinlog Pos: " + std::to_string(info.binlog_pos) +
             "\nGtid Executed: " + info.gtid_executed + "\n";
    }

    return str;
  }

  /* Look up a registered snapshot by id; nullptr if unknown. */
  static std::shared_ptr<Rdb_explicit_snapshot> get(
      const ulonglong snapshot_id) {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    auto elem = explicit_snapshots.find(snapshot_id);
    if (elem == explicit_snapshots.end()) {
      return nullptr;
    }
    return elem->second.lock();
  }

  /* Non-owning access to the underlying ManagedSnapshot. */
  rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); }

  Rdb_explicit_snapshot(snapshot_info_st ss_info,
                        std::unique_ptr<rocksdb::ManagedSnapshot> &&snapshot)
      : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {}

  /* Deregister on destruction so get()/dump_snapshots() never return a
     dead id. */
  virtual ~Rdb_explicit_snapshot() {
    std::lock_guard<std::mutex> lock(explicit_snapshot_mutex);
    explicit_snapshots.erase(ss_info.snapshot_id);
  }

 private:
  std::unique_ptr<rocksdb::ManagedSnapshot> snapshot;

  /* Registry of live explicit snapshots; all three members are protected
     by explicit_snapshot_mutex. */
  static std::mutex explicit_snapshot_mutex;
  static ulonglong explicit_snapshot_counter;
  static std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
      explicit_snapshots;
};

std::mutex Rdb_explicit_snapshot::explicit_snapshot_mutex;
ulonglong Rdb_explicit_snapshot::explicit_snapshot_counter = 0;
std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>>
    Rdb_explicit_snapshot::explicit_snapshots;
#endif
208
209 /**
210 Updates row counters based on the table type and operation type.
211 */
update_row_stats(const operation_type & type)212 void ha_rocksdb::update_row_stats(const operation_type &type) {
213 DBUG_ASSERT(type < ROWS_MAX);
214 // Find if we are modifying system databases.
215 if (table->s && m_tbl_def->m_is_mysql_system_table) {
216 global_stats.system_rows[type].inc();
217 } else {
218 global_stats.rows[type].inc();
219 }
220 }
221
222 void dbug_dump_database(rocksdb::DB *db);
223 static handler *rocksdb_create_handler(my_core::handlerton *hton,
224 my_core::TABLE_SHARE *table_arg,
225 my_core::MEM_ROOT *mem_root);
226
getCompactRangeOptions(int concurrency=0)227 static rocksdb::CompactRangeOptions getCompactRangeOptions(
228 int concurrency = 0) {
229 rocksdb::CompactRangeOptions compact_range_options;
230 compact_range_options.bottommost_level_compaction =
231 rocksdb::BottommostLevelCompaction::kForce;
232 compact_range_options.exclusive_manual_compaction = false;
233 if (concurrency > 0) {
234 compact_range_options.max_subcompactions = concurrency;
235 }
236 return compact_range_options;
237 }
238
239 ///////////////////////////////////////////////////////////
240 // Parameters and settings
241 ///////////////////////////////////////////////////////////
242 static char *rocksdb_default_cf_options = nullptr;
243 static char *rocksdb_override_cf_options = nullptr;
244 static char *rocksdb_update_cf_options = nullptr;
245
246 ///////////////////////////////////////////////////////////
247 // Globals
248 ///////////////////////////////////////////////////////////
249 handlerton *rocksdb_hton;
250
251 rocksdb::TransactionDB *rdb = nullptr;
252 rocksdb::HistogramImpl *commit_latency_stats = nullptr;
253
254 static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
255 static std::unique_ptr<rocksdb::Env> flashcache_aware_env;
256 static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;
257
258 Rdb_dict_manager dict_manager;
259 Rdb_cf_manager cf_manager;
260 Rdb_ddl_manager ddl_manager;
261 Rdb_binlog_manager binlog_manager;
262
263 #if !defined(_WIN32) && !defined(__APPLE__)
264 Rdb_io_watchdog *io_watchdog = nullptr;
265 #endif
266 /**
267 MyRocks background thread control
268 N.B. This is besides RocksDB's own background threads
269 (@see rocksdb::CancelAllBackgroundWork())
270 */
271
272 static Rdb_background_thread rdb_bg_thread;
273
274 static Rdb_manual_compaction_thread rdb_mc_thread;
275
276 // List of table names (using regex) that are exceptions to the strict
277 // collation check requirement.
278 Regex_list_handler *rdb_collation_exceptions;
279
280 static const char **rdb_get_error_messages(int nr);
281
rocksdb_flush_all_memtables()282 static void rocksdb_flush_all_memtables() {
283 const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
284 for (const auto &cf_handle : cf_manager.get_all_cf()) {
285 rdb->Flush(rocksdb::FlushOptions(), cf_handle);
286 }
287 }
288
/*
  Update stub paired with rocksdb_delete_column_family(): all work (and
  failure reporting) happens in the check/validate function, so there is
  nothing left to do here.
*/
static void rocksdb_delete_column_family_stub(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, const void *const /* save */) {}
292
/*
  Check/validate callback for the "delete column family" system variable:
  drop the column family named by the incoming value.

  NOTE: currently disabled — the function unconditionally fails before doing
  any work (see the comment at the early return). The code after the return
  is intentionally kept for when the CF create/drop race is resolved.
*/
static int rocksdb_delete_column_family(
    THD *const /* thd */, struct st_mysql_sys_var *const /* var */,
    void *const /* var_ptr */, struct st_mysql_value *const value) {
  // Return failure for now until the race condition between creating
  // CF and deleting CF is resolved
  return HA_EXIT_FAILURE;

  // Everything below is unreachable until the early return above is removed.
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto &cf_manager = rdb_get_cf_manager();
    auto ret = cf_manager.drop_cf(cf);
    if (ret == HA_EXIT_SUCCESS) {
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Dropped column family: %s\n", cf);
    } else {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Failed to drop column family: %s, error: %d\n",
                      cf, ret);
    }

    return ret;
  }

  return HA_EXIT_SUCCESS;
}
322
323 ///////////////////////////////////////////////////////////
324 // Hash map: table name => open table handler
325 ///////////////////////////////////////////////////////////
326
namespace  // anonymous namespace = not visible outside this source file
{

typedef Hash_set<Rdb_table_handler> Rdb_table_set;

/*
  Registry of open MyRocks tables: maps a table name to its shared
  Rdb_table_handler. Accesses to m_table_map are expected to be serialized
  on m_mutex (get_table_handler()/release_table_handler() are defined
  out-of-line elsewhere in this file).
*/
class Rdb_open_tables_map {
 private:
  /* Hash table used to track the handlers of open tables */
  std::unordered_map<std::string, Rdb_table_handler *> m_table_map;

  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

 public:
  /* Initialize the (empty) map and its mutex; called at plugin startup. */
  void init() {
    m_table_map.clear();
    mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &m_mutex, MY_MUTEX_INIT_FAST);
  }

  /* Drop all entries and destroy the mutex; called at plugin shutdown. */
  void free() {
    m_table_map.clear();
    mysql_mutex_destroy(&m_mutex);
  }
  /* Number of currently tracked open tables. */
  size_t count() { return m_table_map.size(); }

  Rdb_table_handler *get_table_handler(const char *const table_name);
  void release_table_handler(Rdb_table_handler *const table_handler);

  std::vector<std::string> get_table_names(void) const;
};

}  // anonymous namespace
359
360 static Rdb_open_tables_map rdb_open_tables;
361
/*
  Strip any trailing '/' characters from a directory path:
  "/a/b///" -> "/a/b"; a path made up entirely of slashes collapses to "".
*/
static std::string rdb_normalize_dir(std::string dir) {
  const auto last_keep = dir.find_last_not_of('/');
  dir.erase(last_keep == std::string::npos ? 0 : last_keep + 1);
  return dir;
}
368
/*
  Check/validate callback for the rocksdb_create_checkpoint system variable:
  create a RocksDB checkpoint (a hard-linked copy of the database) in the
  directory supplied as the variable's value.

  @return HA_EXIT_SUCCESS when the checkpoint was created;
          HA_EXIT_FAILURE otherwise (NULL/empty path, DB not open, or a
          RocksDB error — the latter is also reported to the client via
          rdb_error_to_mysql()/my_error()).
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      // Trailing slashes are stripped so RocksDB gets a canonical path.
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propogate up to the client.
      int rc __attribute__((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        delete checkpoint;
        if (status.ok()) {
          // NO_LINT_DEBUG
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
408
/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only.
   The actual checkpoint creation is done by the paired validate function
   (rocksdb_create_checkpoint), so this update callback is empty. */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
415
/* Update stub: the actual flush is performed by the paired validate
   function (rocksdb_force_flush_memtable_now), so nothing remains to do. */
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
419
/*
  Check/validate callback for rocksdb_force_flush_memtable_now: flush the
  memtables of every column family. Always reports success.
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
428
/* Update stub: the flush + L0 compaction is performed by the paired
   validate function, so nothing remains to do here. */
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
432
/*
  Check/validate callback for rocksdb_force_flush_memtable_and_lzero_now:
  flush all memtables, then compact every L0 file of each column family via
  CompactFiles(). CompactFiles can race with automatic compactions (the
  file list becomes stale and RocksDB returns InvalidArgument), so each
  column family is retried up to max_attempts times.

  @return HA_EXIT_SUCCESS if every column family ended with an empty L0 (or
          a successful compaction); HA_EXIT_FAILURE if any CF exhausted its
          retries or a hard I/O error occurred.
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  int i, max_attempts = 3, num_errors = 0;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    for (i = 0; i < max_attempts; i++) {
      // Re-read the metadata on every attempt: the L0 file list may have
      // changed since the previous (failed) CompactFiles call.
      rdb->GetColumnFamilyMetaData(cf_handle, &metadata);
      cf_handle->GetDescriptor(&cf_descr);
      c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

      DBUG_ASSERT(metadata.levels[0].level == 0);
      std::vector<std::string> file_names;
      for (auto &file : metadata.levels[0].files) {
        file_names.emplace_back(file.db_path + file.name);
      }

      // Nothing left in L0 for this column family: done.
      if (file_names.empty()) {
        break;
      }

      rocksdb::Status s;
      s = rdb->CompactFiles(c_options, cf_handle, file_names, 1);

      // Due to a race, it's possible for CompactFiles to collide
      // with auto compaction, causing an error to return
      // regarding file not found. In that case, retry.
      if (s.IsInvalidArgument()) {
        continue;
      }

      if (!s.ok() && !s.IsAborted()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
        return HA_EXIT_FAILURE;
      }
      break;
    }
    // i only reaches max_attempts when every retry hit InvalidArgument.
    if (i == max_attempts) {
      num_errors++;
    }
  }

  return num_errors == 0 ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
}
486
487 static void rocksdb_drop_index_wakeup_thread(
488 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
489 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
490 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save);
491
492 static my_bool rocksdb_pause_background_work = 0;
493 static mysql_mutex_t rdb_sysvars_mutex;
494 static mysql_mutex_t rdb_block_cache_resize_mutex;
495
rocksdb_set_pause_background_work(my_core::THD * const,struct st_mysql_sys_var * const,void * const,const void * const save)496 static void rocksdb_set_pause_background_work(
497 my_core::THD *const,
498 struct st_mysql_sys_var *const,
499 void *const, const void *const save) {
500 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
501 const my_bool pause_requested = *static_cast<const my_bool *>(save);
502 if (rocksdb_pause_background_work != pause_requested) {
503 if (pause_requested) {
504 rdb->PauseBackgroundWork();
505 } else {
506 rdb->ContinueBackgroundWork();
507 }
508 rocksdb_pause_background_work = pause_requested;
509 }
510 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
511 }
512
513 static void rocksdb_set_compaction_options(THD *thd,
514 struct st_mysql_sys_var *var,
515 void *var_ptr, const void *save);
516
517 static void rocksdb_set_table_stats_sampling_pct(THD *thd,
518 struct st_mysql_sys_var *var,
519 void *var_ptr,
520 const void *save);
521
522 static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
523 struct st_mysql_sys_var *var,
524 void *var_ptr,
525 const void *save);
526
527 static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd,
528 struct st_mysql_sys_var *var,
529 void *var_ptr,
530 const void *save);
531
532 static void rocksdb_set_delayed_write_rate(THD *thd,
533 struct st_mysql_sys_var *var,
534 void *var_ptr, const void *save);
535
536 static void rocksdb_set_max_latest_deadlocks(THD *thd,
537 struct st_mysql_sys_var *var,
538 void *var_ptr, const void *save);
539
540 static void rdb_set_collation_exception_list(const char *exception_list);
541 static void rocksdb_set_collation_exception_list(THD *thd,
542 struct st_mysql_sys_var *var,
543 void *var_ptr,
544 const void *save);
545
546 static int rocksdb_validate_update_cf_options(THD *thd,
547 struct st_mysql_sys_var *var,
548 void *save,
549 st_mysql_value *value);
550
551 static void rocksdb_set_update_cf_options(THD *thd,
552 struct st_mysql_sys_var *var,
553 void *var_ptr, const void *save);
554
555 static int rocksdb_check_bulk_load(
556 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
557 void *save, struct st_mysql_value *value);
558
559 static int rocksdb_check_bulk_load_allow_unsorted(
560 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
561 void *save, struct st_mysql_value *value);
562
563 static void rocksdb_set_max_background_jobs(THD *thd,
564 struct st_mysql_sys_var *const var,
565 void *const var_ptr,
566 const void *const save);
567 static void rocksdb_set_bytes_per_sync(THD *thd,
568 struct st_mysql_sys_var *const var,
569 void *const var_ptr,
570 const void *const save);
571 static void rocksdb_set_wal_bytes_per_sync(THD *thd,
572 struct st_mysql_sys_var *const var,
573 void *const var_ptr,
574 const void *const save);
575 static int rocksdb_validate_set_block_cache_size(
576 THD *thd, struct st_mysql_sys_var *const var, void *var_ptr,
577 struct st_mysql_value *value);
578 //////////////////////////////////////////////////////////////////////////////
579 // Options definitions
580 //////////////////////////////////////////////////////////////////////////////
581 static long long rocksdb_block_cache_size;
582 static long long rocksdb_sim_cache_size;
583 static my_bool rocksdb_use_clock_cache;
584 static double rocksdb_cache_high_pri_pool_ratio;
585 static my_bool rocksdb_cache_dump;
586 /* Use unsigned long long instead of uint64_t because of MySQL compatibility */
587 static unsigned long long // NOLINT(runtime/int)
588 rocksdb_rate_limiter_bytes_per_sec;
589 static unsigned long long // NOLINT(runtime/int)
590 rocksdb_sst_mgr_rate_bytes_per_sec;
591 static unsigned long long rocksdb_delayed_write_rate;
592 static uint32_t rocksdb_max_latest_deadlocks;
593 static unsigned long // NOLINT(runtime/int)
594 rocksdb_persistent_cache_size_mb;
595 static ulong rocksdb_info_log_level;
596 static char *rocksdb_wal_dir;
597 static char *rocksdb_persistent_cache_path;
598 static ulong rocksdb_index_type;
599 static uint32_t rocksdb_flush_log_at_trx_commit;
600 static uint32_t rocksdb_debug_optimizer_n_rows;
601 static my_bool rocksdb_force_compute_memtable_stats;
602 static uint32_t rocksdb_force_compute_memtable_stats_cachetime;
603 static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
604 static uint32_t rocksdb_wal_recovery_mode;
605 static uint32_t rocksdb_stats_level;
606 static uint32_t rocksdb_access_hint_on_compaction_start;
607 static char *rocksdb_compact_cf_name;
608 static char *rocksdb_delete_cf_name;
609 static char *rocksdb_checkpoint_name;
610 static my_bool rocksdb_signal_drop_index_thread;
611 static my_bool rocksdb_signal_remove_mariabackup_checkpoint;
612 static my_bool rocksdb_strict_collation_check = 1;
613 static my_bool rocksdb_ignore_unknown_options = 1;
614 static my_bool rocksdb_enable_2pc = 0;
615 static char *rocksdb_strict_collation_exceptions;
616 static my_bool rocksdb_collect_sst_properties = 1;
617 static my_bool rocksdb_force_flush_memtable_now_var = 0;
618 static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = 0;
619 static my_bool rocksdb_enable_ttl = 1;
620 static my_bool rocksdb_enable_ttl_read_filtering = 1;
621 static int rocksdb_debug_ttl_rec_ts = 0;
622 static int rocksdb_debug_ttl_snapshot_ts = 0;
623 static int rocksdb_debug_ttl_read_filter_ts = 0;
624 static my_bool rocksdb_debug_ttl_ignore_pk = 0;
625 static my_bool rocksdb_reset_stats = 0;
626 static uint32_t rocksdb_io_write_timeout_secs = 0;
627 static uint32_t rocksdb_seconds_between_stat_computes = 3600;
628 static long long rocksdb_compaction_sequential_deletes = 0l;
629 static long long rocksdb_compaction_sequential_deletes_window = 0l;
630 static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
631 static uint32_t rocksdb_validate_tables = 1;
632 static char *rocksdb_datadir;
633 static uint32_t rocksdb_table_stats_sampling_pct;
634 static my_bool rocksdb_enable_bulk_load_api = 1;
635 static my_bool rocksdb_print_snapshot_conflict_queries = 0;
636 static my_bool rocksdb_large_prefix = 0;
637 static my_bool rocksdb_allow_to_start_after_corruption = 0;
638 static char* rocksdb_git_hash;
639
640 uint32_t rocksdb_ignore_datadic_errors = 0;
641
642 char *compression_types_val=
643 const_cast<char*>(get_rocksdb_supported_compression_types());
644 static unsigned long rocksdb_write_policy =
645 rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;
646
647 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
648 char *rocksdb_read_free_rpl_tables;
649 std::mutex rocksdb_read_free_rpl_tables_mutex;
650 #if defined(HAVE_PSI_INTERFACE)
651 Regex_list_handler rdb_read_free_regex_handler(key_rwlock_read_free_rpl_tables);
652 #else
653 Regex_list_handler rdb_read_free_regex_handler;
654 #endif
655 enum read_free_rpl_type { OFF = 0, PK_ONLY, PK_SK };
656 static unsigned long rocksdb_read_free_rpl = read_free_rpl_type::OFF;
657 #endif
658
659 static my_bool rocksdb_error_on_suboptimal_collation = 1;
660 static uint32_t rocksdb_stats_recalc_rate = 0;
661 static uint32_t rocksdb_debug_manual_compaction_delay = 0;
662 static uint32_t rocksdb_max_manual_compactions = 0;
663 static my_bool rocksdb_rollback_on_timeout = FALSE;
664 static my_bool rocksdb_enable_insert_with_update_caching = TRUE;
665
666 std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
667 std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
668 std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
669 std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
670 std::atomic<uint64_t> rocksdb_manual_compactions_processed(0);
671 std::atomic<uint64_t> rocksdb_manual_compactions_running(0);
672 #ifndef DBUG_OFF
673 std::atomic<uint64_t> rocksdb_num_get_for_update_calls(0);
674 #endif
675
676
677
678 /*
679 Remove directory with files in it.
680 Used to remove checkpoint created by mariabackup.
681 */
682 #ifdef _WIN32
683 #include <direct.h> /* unlink*/
684 #ifndef F_OK
685 #define F_OK 0
686 #endif
687 #endif
688
rmdir_force(const char * dir)689 static int rmdir_force(const char *dir) {
690 if (access(dir, F_OK))
691 return true;
692
693 char path[FN_REFLEN];
694 char sep[] = {FN_LIBCHAR, 0};
695 int err = 0;
696
697 MY_DIR *dir_info = my_dir(dir, MYF(MY_DONT_SORT | MY_WANT_STAT));
698 if (!dir_info)
699 return 1;
700
701 for (uint i = 0; i < dir_info->number_of_files; i++) {
702 FILEINFO *file = dir_info->dir_entry + i;
703
704 strxnmov(path, sizeof(path), dir, sep, file->name, NULL);
705
706 err = my_delete(path, 0);
707
708 if (err) {
709 break;
710 }
711 }
712
713 my_dirend(dir_info);
714
715 if (!err)
716 err = rmdir(dir);
717
718 return (err == 0) ? HA_EXIT_SUCCESS : HA_EXIT_FAILURE;
719 }
720
721
rocksdb_remove_mariabackup_checkpoint(my_core::THD * const,struct st_mysql_sys_var * const,void * const var_ptr,const void * const)722 static void rocksdb_remove_mariabackup_checkpoint(
723 my_core::THD *const,
724 struct st_mysql_sys_var *const ,
725 void *const var_ptr, const void *const) {
726 std::string mariabackup_checkpoint_dir(rocksdb_datadir);
727
728 mariabackup_checkpoint_dir.append("/mariabackup-checkpoint");
729
730 if (unlink(mariabackup_checkpoint_dir.c_str()) == 0)
731 return;
732
733 rmdir_force(mariabackup_checkpoint_dir.c_str());
734 }
735
736
rdb_init_rocksdb_db_options(void)737 static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
738 auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());
739
740 o->create_if_missing = true;
741 o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
742 o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
743 o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
744 o->max_open_files = -2; // auto-tune to 50% open_files_limit
745
746 o->two_write_queues = true;
747 o->manual_wal_flush = true;
748 return o;
749 }
750
751 /* DBOptions contains Statistics and needs to be destructed last */
752 static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options =
753 std::unique_ptr<rocksdb::BlockBasedTableOptions>(
754 new rocksdb::BlockBasedTableOptions());
755 static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options =
756 rdb_init_rocksdb_db_options();
757
758 static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;
759
760 /* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */
761 static const char *write_policy_names[] = {"write_committed", "write_prepared",
762 "write_unprepared", NullS};
763
764 static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1,
765 "write_policy_typelib",
766 write_policy_names, nullptr};
767
768 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
769 /* This array needs to be kept up to date with myrocks::read_free_rpl_type */
770 static const char *read_free_rpl_names[] = {"OFF", "PK_ONLY", "PK_SK", NullS};
771
772 static TYPELIB read_free_rpl_typelib = {array_elements(read_free_rpl_names) - 1,
773 "read_free_rpl_typelib",
774 read_free_rpl_names, nullptr};
775 #endif
776
777 /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
778 static const char *info_log_level_names[] = {"debug_level", "info_level",
779 "warn_level", "error_level",
780 "fatal_level", NullS};
781
782 static TYPELIB info_log_level_typelib = {
783 array_elements(info_log_level_names) - 1, "info_log_level_typelib",
784 info_log_level_names, nullptr};
785
rocksdb_set_rocksdb_info_log_level(THD * const thd,struct st_mysql_sys_var * const var,void * const var_ptr,const void * const save)786 static void rocksdb_set_rocksdb_info_log_level(
787 THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
788 const void *const save) {
789 DBUG_ASSERT(save != nullptr);
790
791 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
792 rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
793 rocksdb_db_options->info_log->SetInfoLogLevel(
794 static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
795 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
796 }
797
/*
  Update callback for rocksdb_stats_level: push the requested level into the
  RocksDB Statistics object, then read the effective value back, since
  RocksDB owns the authoritative copy.
*/
static void rocksdb_set_rocksdb_stats_level(THD *const thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_db_options->statistics->set_stats_level(
      static_cast<rocksdb::StatsLevel>(
          *static_cast<const uint64_t *>(save)));
  // Actual stats level is defined at rocksdb dbopt::statistics::stats_level_
  // so adjusting rocksdb_stats_level here to make sure it points to
  // the correct stats level.
  rocksdb_stats_level = rocksdb_db_options->statistics->get_stats_level();
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
814
/*
  Update callback for rocksdb_reset_stats: when set to true, reset both the
  DB-internal stats (rdb->ResetStats()) and the Statistics object's
  counters. The sysvar itself keeps whatever value was assigned.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
  DBUG_ASSERT(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // Store the new value first; rocksdb_reset_stats (which var_ptr points at)
  // is then consulted below.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    DBUG_ASSERT(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    DBUG_ASSERT(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
840
/*
  Update callback for rocksdb_io_write_timeout_secs: record the new timeout
  and, on platforms that have the I/O watchdog, re-arm it with the new value.
*/
static void rocksdb_set_io_write_timeout(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
#if !defined(_WIN32) && !defined(__APPLE__)
  // The watchdog only exists on non-Windows, non-macOS builds.
  DBUG_ASSERT(io_watchdog != nullptr);
#endif

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  rocksdb_io_write_timeout_secs = new_val;
#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
861
/*
  Valid settings for the rocksdb_flush_log_at_trx_commit sysvar (see
  rocksdb_validate_flush_log_at_trx_commit below). FLUSH_LOG_MAX is a
  sentinel bounding the range, not a settable value.
*/
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,
  FLUSH_LOG_SYNC,
  FLUSH_LOG_BACKGROUND,
  FLUSH_LOG_MAX /* must be last */
};
868
rocksdb_validate_flush_log_at_trx_commit(THD * const thd,struct st_mysql_sys_var * const var,void * var_ptr,struct st_mysql_value * const value)869 static int rocksdb_validate_flush_log_at_trx_commit(
870 THD *const thd,
871 struct st_mysql_sys_var *const var, /* in: pointer to system variable */
872 void *var_ptr, /* out: immediate result for update function */
873 struct st_mysql_value *const value /* in: incoming value */) {
874 long long new_value;
875
876 /* value is NULL */
877 if (value->val_int(value, &new_value)) {
878 return HA_EXIT_FAILURE;
879 }
880
881 if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) {
882 return HA_EXIT_FAILURE;
883 }
884
885 *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value);
886 return HA_EXIT_SUCCESS;
887 }
/*
  Intentionally empty "update" handler: presumably paired with the
  rocksdb_compact_column_family check function declared below, which would do
  all the work during validation — confirm at the sysvar declaration.
*/
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}

static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value);
896
/* Value names and TYPELIB backing the rocksdb_index_type enum sysvar
   (BlockBasedTableOptions::index_type, declared further below). */
static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};

static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
                                     "index_type_typelib", index_type_names,
                                     nullptr};
902
/* Defaults and limits shared by the sysvar declarations below. */
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_MAX_ROW_LOCKS = 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024;  // 64MB
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;                   // 100B
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024;  // 1GB
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024;  // 512MB
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;
const ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
918
919 // TODO: 0 means don't wait at all, and we don't support it yet?
920 static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
921 "Number of seconds to wait for lock", nullptr,
922 nullptr, /*default*/ 1, /*min*/ 1,
923 /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);
924
925 static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
926 "Enables deadlock detection", nullptr, nullptr, FALSE);
927
928 static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG,
929 "Number of transactions deadlock detection will "
930 "traverse through before assuming deadlock",
931 nullptr, nullptr,
932 /*default*/ RDB_DEADLOCK_DETECT_DEPTH,
933 /*min*/ 2,
934 /*max*/ ULONG_MAX, 0);
935
936 static MYSQL_THDVAR_BOOL(
937 commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG,
938 "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr,
939 nullptr, TRUE);
940
941 static MYSQL_THDVAR_BOOL(
942 trace_sst_api, PLUGIN_VAR_RQCMDARG,
943 "Generate trace output in the log for each call to the SstFileWriter",
944 nullptr, nullptr, FALSE);
945
946 static MYSQL_THDVAR_BOOL(
947 bulk_load, PLUGIN_VAR_RQCMDARG,
948 "Use bulk-load mode for inserts. This disables "
949 "unique_checks and enables rocksdb_commit_in_the_middle.",
950 rocksdb_check_bulk_load, nullptr, FALSE);
951
952 static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG,
953 "Allow bulk loading of sk keys during bulk-load. "
954 "Can be changed only when bulk load is disabled.",
955 /* Intentionally reuse unsorted's check function */
956 rocksdb_check_bulk_load_allow_unsorted, nullptr,
957 FALSE);
958
959 static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG,
960 "Allow unsorted input during bulk-load. "
961 "Can be changed only when bulk load is disabled.",
962 rocksdb_check_bulk_load_allow_unsorted, nullptr,
963 FALSE);
964
965 static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
966 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
967 "Enables using SstFileWriter for bulk loading",
968 nullptr, nullptr, rocksdb_enable_bulk_load_api);
969
970 static MYSQL_SYSVAR_STR(git_hash, rocksdb_git_hash,
971 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
972 "Git revision of the RocksDB library used by MyRocks",
973 nullptr, nullptr, ROCKSDB_GIT_HASH);
974
975 static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
976 "Directory for temporary files during DDL operations.",
977 nullptr, nullptr, "");
978
979 #define DEFAULT_SKIP_UNIQUE_CHECK_TABLES ".*"
980 static MYSQL_THDVAR_STR(
981 skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
982 "Skip unique constraint checking for the specified tables", nullptr,
983 nullptr, DEFAULT_SKIP_UNIQUE_CHECK_TABLES);
984
985 static MYSQL_THDVAR_BOOL(
986 commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
987 "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
988 "update and delete",
989 nullptr, nullptr, FALSE);
990
991 static MYSQL_THDVAR_BOOL(
992 blind_delete_primary_key, PLUGIN_VAR_RQCMDARG,
993 "Deleting rows by primary key lookup, without reading rows (Blind Deletes)."
994 " Blind delete is disabled if the table has secondary key",
995 nullptr, nullptr, FALSE);
996
997 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
998
999 static const char *DEFAULT_READ_FREE_RPL_TABLES = ".*";
1000
rocksdb_validate_read_free_rpl_tables(THD * thd MY_ATTRIBUTE ((__unused__)),struct st_mysql_sys_var * var MY_ATTRIBUTE ((__unused__)),void * save,struct st_mysql_value * value)1001 static int rocksdb_validate_read_free_rpl_tables(
1002 THD *thd MY_ATTRIBUTE((__unused__)),
1003 struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save,
1004 struct st_mysql_value *value) {
1005 char buff[STRING_BUFFER_USUAL_SIZE];
1006 int length = sizeof(buff);
1007 const char *wlist_buf = value->val_str(value, buff, &length);
1008 const auto wlist = wlist_buf ? wlist_buf : DEFAULT_READ_FREE_RPL_TABLES;
1009
1010 #if defined(HAVE_PSI_INTERFACE)
1011 Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables);
1012 #else
1013 Regex_list_handler regex_handler;
1014 #endif
1015
1016 if (!regex_handler.set_patterns(wlist)) {
1017 warn_about_bad_patterns(®ex_handler, "rocksdb_read_free_rpl_tables");
1018 return HA_EXIT_FAILURE;
1019 }
1020
1021 *static_cast<const char **>(save) = my_strdup(wlist, MYF(MY_WME));
1022 return HA_EXIT_SUCCESS;
1023 }
1024
/*
  Sysvar update handler for rocksdb_read_free_rpl_tables: installs the
  (already validated) pattern list and re-evaluates the read-free-replication
  flag on every table definition known to the DDL manager.
*/
static void rocksdb_update_read_free_rpl_tables(
    THD *thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr,
    const void *save) {
  const auto wlist = *static_cast<const char *const *>(save);
  DBUG_ASSERT(wlist != nullptr);

  // This is bound to succeed since we've already checked for bad patterns in
  // rocksdb_validate_read_free_rpl_tables
  rdb_read_free_regex_handler.set_patterns(wlist);

  // update all table defs
  struct Rdb_read_free_rpl_updater : public Rdb_tables_scanner {
    int add_table(Rdb_tbl_def *tdef) override {
      tdef->check_and_set_read_free_rpl_table();
      return HA_EXIT_SUCCESS;
    }
  } updater;
  ddl_manager.scan_for_tables(&updater);

  // Pointer (not string) comparison is deliberate: only the DEFAULT path
  // passes the literal DEFAULT_READ_FREE_RPL_TABLES through unduplicated.
  if (wlist == DEFAULT_READ_FREE_RPL_TABLES) {
    // If running SET var = DEFAULT, then rocksdb_validate_read_free_rpl_tables
    // isn't called, and memory is never allocated for the value. Allocate it
    // here.
    *static_cast<const char **>(var_ptr) = my_strdup(wlist, MYF(MY_WME));
  } else {
    // Otherwise, we just reuse the value allocated from
    // rocksdb_validate_read_free_rpl_tables.
    *static_cast<const char **>(var_ptr) = wlist;
  }
}
1056
1057 static MYSQL_SYSVAR_STR(
1058 read_free_rpl_tables, rocksdb_read_free_rpl_tables,
1059 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC /*| PLUGIN_VAR_ALLOCATED*/,
1060 "List of tables that will use read-free replication on the slave "
1061 "(i.e. not lookup a row during replication)",
1062 rocksdb_validate_read_free_rpl_tables, rocksdb_update_read_free_rpl_tables,
1063 DEFAULT_READ_FREE_RPL_TABLES);
1064
1065 static MYSQL_SYSVAR_ENUM(
1066 read_free_rpl, rocksdb_read_free_rpl,
1067 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
1068 "Use read-free replication on the slave (i.e. no row lookup during "
1069 "replication). Default is OFF, PK_SK will enable it on all tables with "
1070 "primary key. PK_ONLY will enable it on tables where the only key is the "
1071 "primary key (i.e. no secondary keys).",
1072 nullptr, nullptr, read_free_rpl_type::OFF, &read_free_rpl_typelib);
1073 #endif
1074
1075 static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
1076 "Skip using bloom filter for reads", nullptr, nullptr,
1077 FALSE);
1078
1079 static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
1080 "Maximum number of locks a transaction can have",
1081 nullptr, nullptr,
1082 /*default*/ RDB_DEFAULT_MAX_ROW_LOCKS,
1083 /*min*/ 1,
1084 /*max*/ RDB_MAX_ROW_LOCKS, 0);
1085
1086 static MYSQL_THDVAR_ULONGLONG(
1087 write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
1088 "Maximum size of write batch in bytes. 0 means no limit.", nullptr, nullptr,
1089 /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);
1090
1091 static MYSQL_THDVAR_BOOL(
1092 lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
1093 "Take and hold locks on rows that are scanned but not updated", nullptr,
1094 nullptr, FALSE);
1095
1096 static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
1097 "Max #records in a batch for bulk-load mode", nullptr,
1098 nullptr,
1099 /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
1100 /*min*/ 1,
1101 /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);
1102
1103 static MYSQL_THDVAR_ULONGLONG(
1104 merge_buf_size, PLUGIN_VAR_RQCMDARG,
1105 "Size to allocate for merge sort buffers written out to disk "
1106 "during inplace index creation.",
1107 nullptr, nullptr,
1108 /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
1109 /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
1110 /* max */ SIZE_T_MAX, 1);
1111
1112 static MYSQL_THDVAR_ULONGLONG(
1113 merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
1114 "Size that we have to work with during combine (reading from disk) phase "
1115 "of "
1116 "external sort during fast index creation.",
1117 nullptr, nullptr,
1118 /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
1119 /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
1120 /* max */ SIZE_T_MAX, 1);
1121
1122 static MYSQL_THDVAR_ULONGLONG(
1123 merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
1124 "Fast index creation creates a large tmp file on disk during index "
1125 "creation. Removing this large file all at once when index creation is "
1126 "complete can cause trim stalls on Flash. This variable specifies a "
1127 "duration to sleep (in milliseconds) between calling chsize() to truncate "
1128 "the file in chunks. The chunk size is the same as merge_buf_size.",
1129 nullptr, nullptr,
1130 /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
1131 /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
1132 /* max */ SIZE_T_MAX, 1);
1133
1134 static MYSQL_THDVAR_INT(
1135 manual_compaction_threads, PLUGIN_VAR_RQCMDARG,
1136 "How many rocksdb threads to run for manual compactions", nullptr, nullptr,
1137 /* default rocksdb.dboption max_subcompactions */ 0,
1138 /* min */ 0, /* max */ 128, 0);
1139
1140 static MYSQL_SYSVAR_BOOL(
1141 create_if_missing,
1142 *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
1143 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1144 "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
1145 rocksdb_db_options->create_if_missing);
1146
1147 static MYSQL_SYSVAR_BOOL(
1148 two_write_queues,
1149 *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
1150 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1151 "DBOptions::two_write_queues for RocksDB", nullptr, nullptr,
1152 rocksdb_db_options->two_write_queues);
1153
1154 static MYSQL_SYSVAR_BOOL(
1155 manual_wal_flush,
1156 *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
1157 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1158 "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr,
1159 rocksdb_db_options->manual_wal_flush);
1160
1161 static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy,
1162 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1163 "DBOptions::write_policy for RocksDB", nullptr,
1164 nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED,
1165 &write_policy_typelib);
1166
1167 static MYSQL_SYSVAR_BOOL(
1168 create_missing_column_families,
1169 *reinterpret_cast<my_bool *>(
1170 &rocksdb_db_options->create_missing_column_families),
1171 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1172 "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
1173 rocksdb_db_options->create_missing_column_families);
1174
1175 static MYSQL_SYSVAR_BOOL(
1176 error_if_exists,
1177 *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
1178 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1179 "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
1180 rocksdb_db_options->error_if_exists);
1181
1182 static MYSQL_SYSVAR_BOOL(
1183 paranoid_checks,
1184 *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
1185 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1186 "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
1187 rocksdb_db_options->paranoid_checks);
1188
1189 static MYSQL_SYSVAR_ULONGLONG(
1190 rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
1191 PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
1192 nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
1193 /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);
1194
1195 static MYSQL_SYSVAR_ULONGLONG(
1196 sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
1197 PLUGIN_VAR_RQCMDARG,
1198 "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB", nullptr,
1199 rocksdb_set_sst_mgr_rate_bytes_per_sec,
1200 /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
1201 /* min */ 0L, /* max */ UINT64_MAX, 0);
1202
1203 static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
1204 PLUGIN_VAR_RQCMDARG,
1205 "DBOptions::delayed_write_rate", nullptr,
1206 rocksdb_set_delayed_write_rate,
1207 rocksdb_db_options->delayed_write_rate, 0,
1208 UINT64_MAX, 0);
1209
1210 static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
1211 PLUGIN_VAR_RQCMDARG,
1212 "Maximum number of recent "
1213 "deadlocks to store",
1214 nullptr, rocksdb_set_max_latest_deadlocks,
1215 rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
1216
1217 static MYSQL_SYSVAR_ENUM(
1218 info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
1219 "Filter level for info logs to be written mysqld error log. "
1220 "Valid values include 'debug_level', 'info_level', 'warn_level'"
1221 "'error_level' and 'fatal_level'.",
1222 nullptr, rocksdb_set_rocksdb_info_log_level,
1223 rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);
1224
1225 static MYSQL_THDVAR_INT(
1226 perf_context_level, PLUGIN_VAR_RQCMDARG,
1227 "Perf Context Level for rocksdb internal timer stat collection", nullptr,
1228 nullptr,
1229 /* default */ rocksdb::PerfLevel::kUninitialized,
1230 /* min */ rocksdb::PerfLevel::kUninitialized,
1231 /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);
1232
1233 static MYSQL_SYSVAR_UINT(
1234 wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
1235 "DBOptions::wal_recovery_mode for RocksDB. Default is kAbsoluteConsistency",
1236 nullptr, nullptr,
1237 /* default */ (uint)rocksdb::WALRecoveryMode::kAbsoluteConsistency,
1238 /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
1239 /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);
1240
1241 static MYSQL_SYSVAR_UINT(
1242 stats_level, rocksdb_stats_level, PLUGIN_VAR_RQCMDARG,
1243 "Statistics Level for RocksDB. Default is 0 (kExceptHistogramOrTimers)",
1244 nullptr, rocksdb_set_rocksdb_stats_level,
1245 /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
1246 /* min */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers,
1247 /* max */ (uint)rocksdb::StatsLevel::kAll, 0);
1248
1249 static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size,
1250 rocksdb_db_options->compaction_readahead_size,
1251 PLUGIN_VAR_RQCMDARG,
1252 "DBOptions::compaction_readahead_size for RocksDB",
1253 nullptr, nullptr,
1254 rocksdb_db_options->compaction_readahead_size,
1255 /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1256
1257 static MYSQL_SYSVAR_BOOL(
1258 new_table_reader_for_compaction_inputs,
1259 *reinterpret_cast<my_bool *>(
1260 &rocksdb_db_options->new_table_reader_for_compaction_inputs),
1261 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1262 "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
1263 nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);
1264
1265 static MYSQL_SYSVAR_UINT(
1266 access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
1267 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1268 "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
1269 /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
1270 /* min */ (uint)rocksdb::Options::AccessHint::NONE,
1271 /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);
1272
1273 static MYSQL_SYSVAR_BOOL(
1274 allow_concurrent_memtable_write,
1275 *reinterpret_cast<my_bool *>(
1276 &rocksdb_db_options->allow_concurrent_memtable_write),
1277 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1278 "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
1279 false);
1280
1281 static MYSQL_SYSVAR_BOOL(
1282 enable_write_thread_adaptive_yield,
1283 *reinterpret_cast<my_bool *>(
1284 &rocksdb_db_options->enable_write_thread_adaptive_yield),
1285 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1286 "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
1287 nullptr, false);
1288
1289 static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
1290 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1291 "DBOptions::max_open_files for RocksDB", nullptr,
1292 nullptr, rocksdb_db_options->max_open_files,
1293 /* min */ -2, /* max */ INT_MAX, 0);
1294
1295 static MYSQL_SYSVAR_UINT64_T(max_total_wal_size,
1296 rocksdb_db_options->max_total_wal_size,
1297 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1298 "DBOptions::max_total_wal_size for RocksDB", nullptr,
1299 nullptr, rocksdb_db_options->max_total_wal_size,
1300 /* min */ 0, /* max */ LONGLONG_MAX, 0);
1301
1302 static MYSQL_SYSVAR_BOOL(
1303 use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
1304 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1305 "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
1306 rocksdb_db_options->use_fsync);
1307
1308 static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
1309 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1310 "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
1311 rocksdb_db_options->wal_dir.c_str());
1312
1313 static MYSQL_SYSVAR_STR(
1314 persistent_cache_path, rocksdb_persistent_cache_path,
1315 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1316 "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
1317 nullptr, "");
1318
1319 static MYSQL_SYSVAR_ULONG(
1320 persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
1321 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1322 "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
1323 "for RocksDB",
1324 nullptr, nullptr, rocksdb_persistent_cache_size_mb,
1325 /* min */ 0L, /* max */ ULONG_MAX, 0);
1326
1327 static MYSQL_SYSVAR_UINT64_T(
1328 delete_obsolete_files_period_micros,
1329 rocksdb_db_options->delete_obsolete_files_period_micros,
1330 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1331 "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
1332 nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
1333 /* min */ 0, /* max */ LONGLONG_MAX, 0);
1334
1335 static MYSQL_SYSVAR_INT(max_background_jobs,
1336 rocksdb_db_options->max_background_jobs,
1337 PLUGIN_VAR_RQCMDARG,
1338 "DBOptions::max_background_jobs for RocksDB", nullptr,
1339 rocksdb_set_max_background_jobs,
1340 rocksdb_db_options->max_background_jobs,
1341 /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);
1342
1343 static MYSQL_SYSVAR_UINT(max_subcompactions,
1344 rocksdb_db_options->max_subcompactions,
1345 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1346 "DBOptions::max_subcompactions for RocksDB", nullptr,
1347 nullptr, rocksdb_db_options->max_subcompactions,
1348 /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);
1349
1350 static MYSQL_SYSVAR_SIZE_T(max_log_file_size,
1351 rocksdb_db_options->max_log_file_size,
1352 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1353 "DBOptions::max_log_file_size for RocksDB", nullptr,
1354 nullptr, rocksdb_db_options->max_log_file_size,
1355 /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1356
1357 static MYSQL_SYSVAR_SIZE_T(log_file_time_to_roll,
1358 rocksdb_db_options->log_file_time_to_roll,
1359 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1360 "DBOptions::log_file_time_to_roll for RocksDB",
1361 nullptr, nullptr,
1362 rocksdb_db_options->log_file_time_to_roll,
1363 /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1364
1365 static MYSQL_SYSVAR_SIZE_T(keep_log_file_num,
1366 rocksdb_db_options->keep_log_file_num,
1367 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1368 "DBOptions::keep_log_file_num for RocksDB", nullptr,
1369 nullptr, rocksdb_db_options->keep_log_file_num,
1370 /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1371
1372 static MYSQL_SYSVAR_UINT64_T(max_manifest_file_size,
1373 rocksdb_db_options->max_manifest_file_size,
1374 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1375 "DBOptions::max_manifest_file_size for RocksDB",
1376 nullptr, nullptr,
1377 rocksdb_db_options->max_manifest_file_size,
1378 /* min */ 0L, /* max */ ULONGLONG_MAX, 0);
1379
1380 static MYSQL_SYSVAR_INT(table_cache_numshardbits,
1381 rocksdb_db_options->table_cache_numshardbits,
1382 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1383 "DBOptions::table_cache_numshardbits for RocksDB",
1384 nullptr, nullptr,
1385 rocksdb_db_options->table_cache_numshardbits,
1386 // LRUCache limits this to 19 bits, anything greater
1387 // fails to create a cache and returns a nullptr
1388 /* min */ 0, /* max */ 19, 0);
1389
1390 static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
1391 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1392 "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
1393 nullptr, rocksdb_db_options->WAL_ttl_seconds,
1394 /* min */ 0L, /* max */ LONGLONG_MAX, 0);
1395
1396 static MYSQL_SYSVAR_UINT64_T(wal_size_limit_mb,
1397 rocksdb_db_options->WAL_size_limit_MB,
1398 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1399 "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
1400 nullptr, rocksdb_db_options->WAL_size_limit_MB,
1401 /* min */ 0L, /* max */ LONGLONG_MAX, 0);
1402
1403 static MYSQL_SYSVAR_SIZE_T(manifest_preallocation_size,
1404 rocksdb_db_options->manifest_preallocation_size,
1405 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1406 "DBOptions::manifest_preallocation_size for RocksDB",
1407 nullptr, nullptr,
1408 rocksdb_db_options->manifest_preallocation_size,
1409 /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1410
1411 static MYSQL_SYSVAR_BOOL(
1412 use_direct_reads,
1413 *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
1414 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1415 "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
1416 rocksdb_db_options->use_direct_reads);
1417
1418 static MYSQL_SYSVAR_BOOL(
1419 use_direct_io_for_flush_and_compaction,
1420 *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_io_for_flush_and_compaction),
1421 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1422 "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB", nullptr, nullptr,
1423 rocksdb_db_options->use_direct_io_for_flush_and_compaction);
1424
1425 static MYSQL_SYSVAR_BOOL(
1426 allow_mmap_reads,
1427 *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
1428 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1429 "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
1430 rocksdb_db_options->allow_mmap_reads);
1431
1432 static MYSQL_SYSVAR_BOOL(
1433 allow_mmap_writes,
1434 *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
1435 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1436 "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
1437 rocksdb_db_options->allow_mmap_writes);
1438
1439 static MYSQL_SYSVAR_BOOL(
1440 is_fd_close_on_exec,
1441 *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
1442 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1443 "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
1444 rocksdb_db_options->is_fd_close_on_exec);
1445
1446 static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
1447 rocksdb_db_options->stats_dump_period_sec,
1448 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1449 "DBOptions::stats_dump_period_sec for RocksDB",
1450 nullptr, nullptr,
1451 rocksdb_db_options->stats_dump_period_sec,
1452 /* min */ 0, /* max */ INT_MAX, 0);
1453
1454 static MYSQL_SYSVAR_BOOL(
1455 advise_random_on_open,
1456 *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
1457 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1458 "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
1459 rocksdb_db_options->advise_random_on_open);
1460
1461 static MYSQL_SYSVAR_SIZE_T(db_write_buffer_size,
1462 rocksdb_db_options->db_write_buffer_size,
1463 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1464 "DBOptions::db_write_buffer_size for RocksDB",
1465 nullptr, nullptr,
1466 rocksdb_db_options->db_write_buffer_size,
1467 /* min */ 0L, /* max */ SIZE_T_MAX, 0);
1468
1469 static MYSQL_SYSVAR_BOOL(
1470 use_adaptive_mutex,
1471 *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
1472 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1473 "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
1474 rocksdb_db_options->use_adaptive_mutex);
1475
1476 static MYSQL_SYSVAR_UINT64_T(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
1477 PLUGIN_VAR_RQCMDARG,
1478 "DBOptions::bytes_per_sync for RocksDB", nullptr,
1479 rocksdb_set_bytes_per_sync,
1480 rocksdb_db_options->bytes_per_sync,
1481 /* min */ 0L, /* max */ ULONGLONG_MAX, 0);
1482
1483 static MYSQL_SYSVAR_UINT64_T(wal_bytes_per_sync,
1484 rocksdb_db_options->wal_bytes_per_sync,
1485 PLUGIN_VAR_RQCMDARG,
1486 "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
1487 rocksdb_set_wal_bytes_per_sync,
1488 rocksdb_db_options->wal_bytes_per_sync,
1489 /* min */ 0L, /* max */ ULONGLONG_MAX, 0);
1490
1491 static MYSQL_SYSVAR_BOOL(
1492 enable_thread_tracking,
1493 *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
1494 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1495 "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true);
1496
1497 static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
1498 PLUGIN_VAR_RQCMDARG,
1499 "block_cache size for RocksDB",
1500 rocksdb_validate_set_block_cache_size, nullptr,
1501 /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
1502 /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
1503 /* max */ LLONG_MAX,
1504 /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);
1505
1506 static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
1507 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1508 "Simulated cache size for RocksDB", nullptr,
1509 nullptr,
1510 /* default */ 0,
1511 /* min */ 0,
1512 /* max */ LLONG_MAX,
1513 /* Block size */ 0);
1514
1515 static MYSQL_SYSVAR_BOOL(
1516 use_clock_cache, rocksdb_use_clock_cache,
1517 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1518 "Use ClockCache instead of default LRUCache for RocksDB", nullptr, nullptr,
1519 false);
1520
1521 static MYSQL_SYSVAR_BOOL(cache_dump, rocksdb_cache_dump,
1522 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1523 "Include RocksDB block cache content in core dump.",
1524 nullptr, nullptr, true);
1525
1526 static MYSQL_SYSVAR_DOUBLE(cache_high_pri_pool_ratio,
1527 rocksdb_cache_high_pri_pool_ratio,
1528 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1529 "Specify the size of block cache high-pri pool",
1530 nullptr, nullptr, /* default */ 0.0, /* min */ 0.0,
1531 /* max */ 1.0, 0);
1532
1533 static MYSQL_SYSVAR_BOOL(
1534 cache_index_and_filter_blocks,
1535 *reinterpret_cast<my_bool *>(
1536 &rocksdb_tbl_options->cache_index_and_filter_blocks),
1537 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1538 "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
1539 nullptr, nullptr, true);
1540
1541 static MYSQL_SYSVAR_BOOL(
1542 cache_index_and_filter_with_high_priority,
1543 *reinterpret_cast<my_bool *>(
1544 &rocksdb_tbl_options->cache_index_and_filter_blocks_with_high_priority),
1545 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1546 "cache_index_and_filter_blocks_with_high_priority for RocksDB", nullptr,
1547 nullptr, true);
1548
// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index block's handle checked
// out (=won't call ShardedLRUCache::Release), plus the parsed-out objects
// the LRU cache will never flush out, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);
1564
1565 static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
1566 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1567 "BlockBasedTableOptions::index_type for RocksDB",
1568 nullptr, nullptr,
1569 (ulong)rocksdb_tbl_options->index_type,
1570 &index_type_typelib);
1571
1572 static MYSQL_SYSVAR_BOOL(
1573 hash_index_allow_collision,
1574 *reinterpret_cast<my_bool *>(
1575 &rocksdb_tbl_options->hash_index_allow_collision),
1576 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1577 "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
1578 nullptr, rocksdb_tbl_options->hash_index_allow_collision);
1579
1580 static MYSQL_SYSVAR_BOOL(
1581 no_block_cache,
1582 *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
1583 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1584 "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
1585 rocksdb_tbl_options->no_block_cache);
1586
1587 static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size,
1588 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1589 "BlockBasedTableOptions::block_size for RocksDB",
1590 nullptr, nullptr, rocksdb_tbl_options->block_size,
1591 /* min */ 1L, /* max */ SIZE_T_MAX, 0);
1592
1593 static MYSQL_SYSVAR_INT(
1594 block_size_deviation, rocksdb_tbl_options->block_size_deviation,
1595 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1596 "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
1597 nullptr, rocksdb_tbl_options->block_size_deviation,
1598 /* min */ 0, /* max */ INT_MAX, 0);
1599
1600 static MYSQL_SYSVAR_INT(
1601 block_restart_interval, rocksdb_tbl_options->block_restart_interval,
1602 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1603 "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
1604 nullptr, rocksdb_tbl_options->block_restart_interval,
1605 /* min */ 1, /* max */ INT_MAX, 0);
1606
1607 static MYSQL_SYSVAR_BOOL(
1608 whole_key_filtering,
1609 *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
1610 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1611 "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
1612 rocksdb_tbl_options->whole_key_filtering);
1613
// Column-family option strings, parsed at plugin init (read-only).
static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");

// Runtime-updatable CF options; validated and applied via the check/update
// callbacks below.
static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC
                        /* psergey-merge: need this? : PLUGIN_VAR_ALLOCATED*/,
                        "Option updates per column family for RocksDB",
                        rocksdb_validate_update_cf_options,
                        rocksdb_set_update_cf_options, nullptr);

static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
                         rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
                         "Sync on transaction commit. Similar to "
                         "innodb_flush_log_at_trx_commit. 1: sync on commit, "
                         "0,2: not sync on commit",
                         rocksdb_validate_flush_log_at_trx_commit, nullptr,
                         /* default */ FLUSH_LOG_SYNC,
                         /* min */ FLUSH_LOG_NEVER,
                         /* max */ FLUSH_LOG_BACKGROUND, 0);

// Session-scoped WAL behavior; defaults taken from rocksdb::WriteOptions.
static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB", nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests", nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency",
    nullptr, nullptr, FALSE);

// Test-only overrides for the optimizer's records_in_range() estimates.
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used.",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);
1669
static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable",
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
                         rocksdb_force_compute_memtable_stats,
                         PLUGIN_VAR_RQCMDARG,
                         "Force to always compute memtable stats", nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime,
                         rocksdb_force_compute_memtable_stats_cachetime,
                         PLUGIN_VAR_RQCMDARG,
                         "Time in usecs to cache memtable estimates", nullptr,
                         nullptr, /* default */ 60 * 1000 * 1000,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value", nullptr,
    nullptr, TRUE);

// "Command" variables: assigning a value triggers an action through the
// check/update callbacks (manual compaction, CF drop, checkpoint creation...).
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family",
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(delete_cf, rocksdb_delete_cf_name, PLUGIN_VAR_RQCMDARG,
                        "Delete column family", rocksdb_delete_column_family,
                        rocksdb_delete_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(remove_mariabackup_checkpoint,
                         rocksdb_signal_remove_mariabackup_checkpoint,
                         PLUGIN_VAR_RQCMDARG, "Remove mariabackup checkpoint",
                         nullptr, rocksdb_remove_mariabackup_checkpoint, FALSE);

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread", nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations", nullptr,
                         rocksdb_set_pause_background_work, FALSE);

// TTL behavior and its debug-only test hooks (+/- 1 hour offsets).
static MYSQL_SYSVAR_BOOL(
    enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
    "Enable expired TTL records to be dropped during compaction.", nullptr,
    nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(
    enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
    PLUGIN_VAR_RQCMDARG,
    "For tables with TTL, expired records are skipped/filtered out during "
    "processing and in query results. Disabling this will allow these records "
    "to be seen, but as a result rows may disappear in the middle of "
    "transactions as they are dropped during compaction. Use with caution.",
    nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_INT(
    debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL of records to "
    "now() + debug_ttl_rec_ts. The value can be +/- to simulate "
    "a record inserted in the past vs a record inserted in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sets the snapshot during compaction to "
    "now() + debug_set_ttl_snapshot_ts. The value can be +/- to simulate "
    "a snapshot in the past vs a snapshot created in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL read filtering time to "
    "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
    "is not set. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_BOOL(
    debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. If true, compaction filtering will not occur "
    "on PK TTL data. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_UINT(
    max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG,
    "Maximum number of pending + ongoing number of manual compactions.",
    nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0);
1774
1775 static MYSQL_SYSVAR_BOOL(
1776 rollback_on_timeout, rocksdb_rollback_on_timeout, PLUGIN_VAR_OPCMDARG,
1777 "Whether to roll back the complete transaction or a single statement on "
1778 "lock wait timeout (a single statement by default)",
1779 NULL, NULL, FALSE);
1780
static MYSQL_SYSVAR_UINT(
    debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sleeping specified seconds "
    "for simulating long running compactions.",
    nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
    "Reset the RocksDB internal statistics without restarting the DB.", nullptr,
    rocksdb_set_reset_stats, FALSE);

static MYSQL_SYSVAR_UINT(io_write_timeout, rocksdb_io_write_timeout_secs,
                         PLUGIN_VAR_RQCMDARG,
                         "Timeout for experimental I/O watchdog.", nullptr,
                         rocksdb_set_io_write_timeout, /* default */ 0,
                         /* min */ 0L,
                         /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks", nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Enable ignoring unknown options passed to RocksDB",
                         nullptr, nullptr, TRUE);

// Collation enforcement for indexed columns, with a regex-based allow-list.
static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "List of tables (using regex) that are excluded "
                        "from the case sensitive collation enforcement",
                        nullptr, rocksdb_set_collation_exception_list, "");

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush",
                         nullptr, nullptr, rocksdb_collect_sst_properties);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful",
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
    "Acts similar to force_flush_memtable_now, but also compacts all L0 files.",
    rocksdb_force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_stub, FALSE);

static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed.",
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window",
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes",
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes", nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log", nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

// Session-scoped debug checksums for row data.
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed",
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG,
                         "Skipping holding any lock on row access. "
                         "Not effective on slave.",
                         nullptr, nullptr, false);
1903
1904 static MYSQL_SYSVAR_UINT(
1905 validate_tables, rocksdb_validate_tables,
1906 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1907 "Verify all .frm files match all RocksDB tables (0 means no verification, "
1908 "1 means verify and fail on error, and 2 means verify but continue",
1909 nullptr, nullptr, 1 /* default value */, 0 /* min value */,
1910 2 /* max value */, 0);
1911
// Emergency-repair switch: 0 = normal operation, 1 = ignore data-dictionary
// errors at startup.
static MYSQL_SYSVAR_UINT(
    ignore_datadic_errors, rocksdb_ignore_datadic_errors,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Ignore MyRocks' data directory errors. "
    "(CAUTION: Use only to start the server and perform repairs. Do NOT use "
    "for regular operation)",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    1 /* max value */, 0);

static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./#rocksdb");

// Informational, read-only: exposes the current contents of
// compression_types_val (both as the variable and as its default).
static MYSQL_SYSVAR_STR(supported_compression_types,
                        compression_types_val,
                        PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
                        "Compression algorithms supported by RocksDB",
                        nullptr, nullptr,
                        compression_types_val);
1932
1933 static MYSQL_SYSVAR_UINT(
1934 table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
1935 PLUGIN_VAR_RQCMDARG,
1936 "Percentage of entries to sample when collecting statistics about table "
1937 "properties. Specify either 0 to sample everything or percentage "
1938 "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG(
1939 RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
1940 "By default " STRINGIFY_ARG(
1941 RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% "
1942 "of"
1943 " e"
1944 "nt"
1945 "ri"
1946 "es"
1947 " a"
1948 "re"
1949 " "
1950 "sa"
1951 "mp"
1952 "le"
1953 "d"
1954 ".",
1955 nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
1956 RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
1957 /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);
1958
static MYSQL_SYSVAR_UINT(
    stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG,
    "The number of indexes per second to recalculate statistics for. 0 to "
    "disable background recalculation.",
    nullptr, nullptr, 0 /* default value */, 0 /* min value */,
    UINT_MAX /* max value */, 0);

static MYSQL_SYSVAR_BOOL(
    large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
    "Support large index prefix length of 3072 bytes. If off, the maximum "
    "index prefix length is 767.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(
    allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption,
    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
    "Allow server still to start successfully even if RocksDB corruption is "
    "detected.",
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation,
                         rocksdb_error_on_suboptimal_collation,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Raise an error instead of warning if a sub-optimal "
                         "collation is used",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(
    enable_insert_with_update_caching,
    rocksdb_enable_insert_with_update_caching, PLUGIN_VAR_OPCMDARG,
    "Whether to enable optimization where we cache the read from a failed "
    "insertion attempt in INSERT ON DUPLICATE KEY UPDATE",
    nullptr, nullptr, TRUE);

// Assumed on-disk size, in bytes, of a single key/value pair.
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
1994
/*
  Registry of every system/session variable the MyRocks plugin exports.
  Each entry must have a matching MYSQL_SYSVAR_*/MYSQL_THDVAR_* definition
  above; the list is nullptr-terminated as required by the plugin API.
*/
static struct st_mysql_sys_var *rocksdb_system_variables[] = {
    MYSQL_SYSVAR(lock_wait_timeout),
    MYSQL_SYSVAR(deadlock_detect),
    MYSQL_SYSVAR(deadlock_detect_depth),
    MYSQL_SYSVAR(commit_time_batch_for_recovery),
    MYSQL_SYSVAR(max_row_locks),
    MYSQL_SYSVAR(write_batch_max_bytes),
    MYSQL_SYSVAR(lock_scanned_rows),
    MYSQL_SYSVAR(bulk_load),
    MYSQL_SYSVAR(bulk_load_allow_sk),
    MYSQL_SYSVAR(bulk_load_allow_unsorted),
    MYSQL_SYSVAR(skip_unique_check_tables),
    MYSQL_SYSVAR(trace_sst_api),
    MYSQL_SYSVAR(commit_in_the_middle),
    MYSQL_SYSVAR(blind_delete_primary_key),
#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
    MYSQL_SYSVAR(read_free_rpl_tables),
    MYSQL_SYSVAR(read_free_rpl),
#endif
    MYSQL_SYSVAR(bulk_load_size),
    MYSQL_SYSVAR(merge_buf_size),
    MYSQL_SYSVAR(enable_bulk_load_api),
    MYSQL_SYSVAR(tmpdir),
    MYSQL_SYSVAR(merge_combine_read_size),
    MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
    MYSQL_SYSVAR(skip_bloom_filter_on_read),

    MYSQL_SYSVAR(create_if_missing),
    MYSQL_SYSVAR(two_write_queues),
    MYSQL_SYSVAR(manual_wal_flush),
    MYSQL_SYSVAR(write_policy),
    MYSQL_SYSVAR(create_missing_column_families),
    MYSQL_SYSVAR(error_if_exists),
    MYSQL_SYSVAR(paranoid_checks),
    MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
    MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
    MYSQL_SYSVAR(delayed_write_rate),
    MYSQL_SYSVAR(max_latest_deadlocks),
    MYSQL_SYSVAR(info_log_level),
    MYSQL_SYSVAR(max_open_files),
    MYSQL_SYSVAR(max_total_wal_size),
    MYSQL_SYSVAR(use_fsync),
    MYSQL_SYSVAR(wal_dir),
    MYSQL_SYSVAR(persistent_cache_path),
    MYSQL_SYSVAR(persistent_cache_size_mb),
    MYSQL_SYSVAR(delete_obsolete_files_period_micros),
    MYSQL_SYSVAR(max_background_jobs),
    MYSQL_SYSVAR(max_log_file_size),
    MYSQL_SYSVAR(max_subcompactions),
    MYSQL_SYSVAR(log_file_time_to_roll),
    MYSQL_SYSVAR(keep_log_file_num),
    MYSQL_SYSVAR(max_manifest_file_size),
    MYSQL_SYSVAR(table_cache_numshardbits),
    MYSQL_SYSVAR(wal_ttl_seconds),
    MYSQL_SYSVAR(wal_size_limit_mb),
    MYSQL_SYSVAR(manifest_preallocation_size),
    MYSQL_SYSVAR(use_direct_reads),
    MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
    MYSQL_SYSVAR(allow_mmap_reads),
    MYSQL_SYSVAR(allow_mmap_writes),
    MYSQL_SYSVAR(is_fd_close_on_exec),
    MYSQL_SYSVAR(stats_dump_period_sec),
    MYSQL_SYSVAR(advise_random_on_open),
    MYSQL_SYSVAR(db_write_buffer_size),
    MYSQL_SYSVAR(use_adaptive_mutex),
    MYSQL_SYSVAR(bytes_per_sync),
    MYSQL_SYSVAR(wal_bytes_per_sync),
    MYSQL_SYSVAR(enable_thread_tracking),
    MYSQL_SYSVAR(perf_context_level),
    MYSQL_SYSVAR(wal_recovery_mode),
    MYSQL_SYSVAR(stats_level),
    MYSQL_SYSVAR(access_hint_on_compaction_start),
    MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
    MYSQL_SYSVAR(compaction_readahead_size),
    MYSQL_SYSVAR(allow_concurrent_memtable_write),
    MYSQL_SYSVAR(enable_write_thread_adaptive_yield),

    MYSQL_SYSVAR(block_cache_size),
    MYSQL_SYSVAR(sim_cache_size),
    MYSQL_SYSVAR(use_clock_cache),
    MYSQL_SYSVAR(cache_high_pri_pool_ratio),
    MYSQL_SYSVAR(cache_dump),
    MYSQL_SYSVAR(cache_index_and_filter_blocks),
    MYSQL_SYSVAR(cache_index_and_filter_with_high_priority),
    MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
    MYSQL_SYSVAR(index_type),
    MYSQL_SYSVAR(hash_index_allow_collision),
    MYSQL_SYSVAR(no_block_cache),
    MYSQL_SYSVAR(block_size),
    MYSQL_SYSVAR(block_size_deviation),
    MYSQL_SYSVAR(block_restart_interval),
    MYSQL_SYSVAR(whole_key_filtering),

    MYSQL_SYSVAR(default_cf_options),
    MYSQL_SYSVAR(override_cf_options),
    MYSQL_SYSVAR(update_cf_options),

    MYSQL_SYSVAR(flush_log_at_trx_commit),
    MYSQL_SYSVAR(write_disable_wal),
    MYSQL_SYSVAR(write_ignore_missing_column_families),

    MYSQL_SYSVAR(skip_fill_cache),
    MYSQL_SYSVAR(unsafe_for_binlog),

    MYSQL_SYSVAR(records_in_range),
    MYSQL_SYSVAR(force_index_records_in_range),
    MYSQL_SYSVAR(debug_optimizer_n_rows),
    MYSQL_SYSVAR(force_compute_memtable_stats),
    MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
    MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),

    MYSQL_SYSVAR(compact_cf),
    MYSQL_SYSVAR(delete_cf),
    MYSQL_SYSVAR(signal_drop_index_thread),
    MYSQL_SYSVAR(pause_background_work),
    MYSQL_SYSVAR(enable_2pc),
    MYSQL_SYSVAR(ignore_unknown_options),
    MYSQL_SYSVAR(strict_collation_check),
    MYSQL_SYSVAR(strict_collation_exceptions),
    MYSQL_SYSVAR(collect_sst_properties),
    MYSQL_SYSVAR(force_flush_memtable_now),
    MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
    MYSQL_SYSVAR(enable_ttl),
    MYSQL_SYSVAR(enable_ttl_read_filtering),
    MYSQL_SYSVAR(debug_ttl_rec_ts),
    MYSQL_SYSVAR(debug_ttl_snapshot_ts),
    MYSQL_SYSVAR(debug_ttl_read_filter_ts),
    MYSQL_SYSVAR(debug_ttl_ignore_pk),
    MYSQL_SYSVAR(reset_stats),
    MYSQL_SYSVAR(io_write_timeout),
    MYSQL_SYSVAR(seconds_between_stat_computes),

    MYSQL_SYSVAR(compaction_sequential_deletes),
    MYSQL_SYSVAR(compaction_sequential_deletes_window),
    MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
    MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
    MYSQL_SYSVAR(print_snapshot_conflict_queries),

    MYSQL_SYSVAR(datadir),
    MYSQL_SYSVAR(supported_compression_types),
    MYSQL_SYSVAR(create_checkpoint),
    MYSQL_SYSVAR(remove_mariabackup_checkpoint),
    MYSQL_SYSVAR(checksums_pct),
    MYSQL_SYSVAR(store_row_debug_checksums),
    MYSQL_SYSVAR(verify_row_debug_checksums),
    MYSQL_SYSVAR(master_skip_tx_api),

    MYSQL_SYSVAR(validate_tables),
    MYSQL_SYSVAR(table_stats_sampling_pct),

    MYSQL_SYSVAR(large_prefix),
    MYSQL_SYSVAR(allow_to_start_after_corruption),
    MYSQL_SYSVAR(git_hash),
    MYSQL_SYSVAR(error_on_suboptimal_collation),
    MYSQL_SYSVAR(stats_recalc_rate),
    MYSQL_SYSVAR(debug_manual_compaction_delay),
    MYSQL_SYSVAR(max_manual_compactions),
    MYSQL_SYSVAR(manual_compaction_threads),
    MYSQL_SYSVAR(rollback_on_timeout),

    MYSQL_SYSVAR(enable_insert_with_update_caching),

    MYSQL_SYSVAR(ignore_datadic_errors),
    nullptr};
2159
rdb_get_rocksdb_write_options(my_core::THD * const thd)2160 static rocksdb::WriteOptions rdb_get_rocksdb_write_options(
2161 my_core::THD *const thd) {
2162 rocksdb::WriteOptions opt;
2163
2164 opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
2165 opt.disableWAL = THDVAR(thd, write_disable_wal);
2166 opt.ignore_missing_column_families =
2167 THDVAR(thd, write_ignore_missing_column_families);
2168
2169 return opt;
2170 }
2171
/*
  Sysvar check callback for @@rocksdb_compact_cf: schedule a manual
  compaction of the named column family and wait for it to finish,
  polling so the wait can be aborted when the client is killed.

  @return HA_EXIT_SUCCESS, or HA_EXIT_FAILURE if the compaction could
          not be scheduled.
*/
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto cfh = cf_manager.get_cf(cf);
    if (cfh != nullptr && rdb != nullptr) {
      int mc_id = rdb_mc_thread.request_manual_compaction(
          cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads));
      // -1 specifically means the manual-compaction queue is full.
      if (mc_id == -1) {
        my_error(ER_INTERNAL_ERROR, MYF(0),
                 "Can't schedule more manual compactions. "
                 "Increase rocksdb_max_manual_compactions or stop issuing "
                 "more manual compactions.");
        return HA_EXIT_FAILURE;
      } else if (mc_id < 0) {
        return HA_EXIT_FAILURE;
      }
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: Manual compaction of column family: %s\n",
                            cf);
      // Check the thd state every short cycle (100ms), so that this function
      // can return without waiting for CompactRange to finish.
      do {
        my_sleep(100000);
      } while (!thd->killed &&
               !rdb_mc_thread.is_manual_compaction_finished(mc_id));

      if (thd->killed) {
        // This cancels if requested compaction state is INITED.
        // TODO(yoshinorim): Cancel running compaction as well once
        // it is supported in RocksDB.
        rdb_mc_thread.clear_manual_compaction_request(mc_id, true);
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
2215
2216 ///////////////////////////////////////////////////////////////////////////////////////////
2217
2218 /*
2219 Drop index thread's control
2220 */
2221
// Background thread handling dropped-index cleanup (see Rdb_drop_index_thread).
static Rdb_drop_index_thread rdb_drop_idx_thread;

// Update callback for @@rocksdb_signal_drop_index_thread: wake the
// drop-index thread when the variable is set to a true value.
static void rocksdb_drop_index_wakeup_thread(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  if (*static_cast<const bool *>(save)) {
    rdb_drop_idx_thread.signal();
  }
}
2232
rocksdb_perf_context_level(THD * const thd)2233 static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
2234 DBUG_ASSERT(thd != nullptr);
2235
2236 const int session_perf_context_level = THDVAR(thd, perf_context_level);
2237 if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2238 return session_perf_context_level;
2239 }
2240
2241 /*
2242 Fallback to global thdvar, if session specific one was not set to a valid
2243 value.
2244 */
2245
2246 const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
2247 if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
2248 return global_perf_context_level;
2249 }
2250
2251 return rocksdb::PerfLevel::kDisable;
2252 }
2253
2254 /*
2255 Very short (functor-like) interface to be passed to
2256 Rdb_transaction::walk_tx_list()
2257 */
2258
interface Rdb_tx_list_walker {
  virtual ~Rdb_tx_list_walker() {}
  // Called once for every transaction visited by Rdb_transaction::walk_tx_list.
  virtual void process_tran(const Rdb_transaction *const) = 0;
};
2263
2264 /*
2265 This is a helper class that is passed to RocksDB to get notifications when
2266 a snapshot gets created.
2267 */
2268
class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
  // Back-pointer to the transaction being notified; nullptr after detach().
  Rdb_transaction *m_owning_tx;

  void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;

 public:
  // Non-copyable: the back-pointer must stay unique to one transaction.
  Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
  Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;

  explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
      : m_owning_tx(owning_tx) {}

  // If the owning Rdb_transaction gets destructed we need to not reference
  // it anymore.
  void detach() { m_owning_tx = nullptr; }
};
2285
2286
2287 #ifdef MARIAROCKS_NOT_YET
2288 // ER_LOCK_WAIT_TIMEOUT error also has a reason in facebook/mysql-5.6
2289 #endif
timeout_message(const char * command,const char * name1,const char * name2)2290 String timeout_message(const char *command, const char *name1,
2291 const char *name2)
2292 {
2293 String msg;
2294 msg.append("Timeout on ");
2295 msg.append(command);
2296 msg.append(": ");
2297 msg.append(name1);
2298 if (name2 && name2[0])
2299 {
2300 msg.append(".");
2301 msg.append(name2);
2302 }
2303 return msg;
2304 }
2305
2306
2307 /* This is the base class for transactions when interacting with rocksdb.
2308 */
2309 class Rdb_transaction {
2310 protected:
2311 ulonglong m_write_count = 0;
2312 ulonglong m_insert_count = 0;
2313 ulonglong m_update_count = 0;
2314 ulonglong m_delete_count = 0;
2315 ulonglong m_lock_count = 0;
2316 std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;
2317
2318 bool m_is_delayed_snapshot = false;
2319 bool m_is_two_phase = false;
2320
2321 std::unordered_set<Rdb_tbl_def*> modified_tables;
2322
2323 private:
2324 /*
2325 Number of write operations this transaction had when we took the last
2326 savepoint (the idea is not to take another savepoint if we haven't made
2327 any changes)
2328 */
2329 ulonglong m_writes_at_last_savepoint;
2330
2331 protected:
2332
2333 protected:
2334 THD *m_thd = nullptr;
2335
2336 static std::multiset<Rdb_transaction *> s_tx_list;
2337 static mysql_mutex_t s_tx_list_mutex;
2338
2339 Rdb_io_perf *m_tbl_io_perf;
2340
2341 bool m_tx_read_only = false;
2342
2343 int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */
2344
2345 /* Maximum number of locks the transaction can have */
2346 ulonglong m_max_row_locks;
2347
2348 bool m_is_tx_failed = false;
2349 bool m_rollback_only = false;
2350
2351 std::shared_ptr<Rdb_snapshot_notifier> m_notifier;
2352
2353 // This should be used only when updating binlog information.
2354 virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
2355 virtual bool commit_no_binlog() = 0;
2356 virtual rocksdb::Iterator *get_iterator(
2357 const rocksdb::ReadOptions &options,
2358 rocksdb::ColumnFamilyHandle *column_family) = 0;
2359
2360 protected:
2361 /*
2362 The following two are helper functions to be overloaded by child classes.
2363 They should provide RocksDB's savepoint semantics.
2364 */
2365 virtual void do_set_savepoint() = 0;
2366 virtual void do_rollback_to_savepoint() = 0;
2367
2368 /*
2369 @detail
2370 This function takes in the WriteBatch of the transaction to add
2371 all the AUTO_INCREMENT merges. It does so by iterating through
2372 m_auto_incr_map and then constructing key/value pairs to call merge upon.
2373
2374 @param wb
2375 */
merge_auto_incr_map(rocksdb::WriteBatchBase * const wb)2376 rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) {
2377 DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", return rocksdb::Status::OK(););
2378
2379 // Iterate through the merge map merging all keys into data dictionary.
2380 rocksdb::Status s;
2381 for (auto &it : m_auto_incr_map) {
2382 s = dict_manager.put_auto_incr_val(wb, it.first, it.second);
2383 if (!s.ok()) {
2384 return s;
2385 }
2386 }
2387 m_auto_incr_map.clear();
2388 return s;
2389 }
2390
 public:
  // Read options (snapshot, bounds, etc.) used for all reads in this tx.
  rocksdb::ReadOptions m_read_opts;
  // Binlog coordinates captured at commit time (see commit()).
  const char *m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
#ifdef MARIAROCKS_NOT_YET
  // TODO: MariaDB probably doesn't need these at all:
  const char *m_mysql_gtid;
  const char *m_mysql_max_gtid;
#endif
  // Extra error text filled in by set_status_error().
  String m_detailed_error;
  // Wall-clock time when the snapshot was taken; 0 = no snapshot.
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;
#ifdef MARIAROCKS_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> m_explicit_snapshot;
#endif

  /*
    Tracks the number of tables in use through external_lock.
    This should not be reset during start_tx().
  */
  int64_t m_n_mysql_tables_in_use = 0;

  /*
    MariaDB's group commit:
  */
  bool commit_ordered_done;
  bool commit_ordered_res;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;
2424
  // Initialize the mutex that guards s_tx_list. Called once at plugin init.
  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  // Destroy the s_tx_list mutex at plugin shutdown; all transactions must
  // already be gone.
  static void term_mutex() {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }
2433
walk_tx_list(Rdb_tx_list_walker * walker)2434 static void walk_tx_list(Rdb_tx_list_walker *walker) {
2435 DBUG_ASSERT(walker != nullptr);
2436
2437 RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
2438
2439 for (auto it : s_tx_list) {
2440 walker->process_tran(it);
2441 }
2442
2443 RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
2444 }
2445
  /*
    Translate a non-OK RocksDB status into a MySQL handler error code,
    updating the relevant per-table and global counters as well as
    m_detailed_error on the way.

    @param thd            connection that hit the error
    @param s              non-OK status returned by RocksDB
    @param kd             key definition involved (used in error messages)
    @param tbl_def        table definition; must not be nullptr
    @param table_handler  per-table stats holder whose counters are bumped
    @return HA_ERR_* code suitable for the SQL layer
  */
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
                       Rdb_table_handler *const table_handler) {
    DBUG_ASSERT(!s.ok());
    DBUG_ASSERT(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because it's
        an IGNORE command!) but then will fail an assert, because "error code
        was returned, but no error happened". Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
      */
      my_core::thd_mark_transaction_to_rollback(
          thd, static_cast<bool>(rocksdb_rollback_on_timeout));
      m_detailed_error.copy(timeout_message(
          "index", tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
      table_handler->m_lock_wait_timeout_counter.inc();
      rocksdb_row_lock_wait_timeouts++;

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      // Deadlocks roll back the whole transaction, not just the statement.
      my_core::thd_mark_transaction_to_rollback(thd,
                                                true /* whole transaction */);
      m_detailed_error = String();
      table_handler->m_deadlock_counter.inc();
      rocksdb_row_lock_deadlocks++;
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      // Optimistic snapshot conflict (write-write conflict detected).
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd, user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning(
            "Got snapshot conflict errors: User: %s "
            "Query: %s",
            user_host_buff, thd->query());
      }
      m_detailed_error = String(" (snapshot conflict)", system_charset_info);
      table_handler->m_deadlock_counter.inc();
      return HA_ERR_ROCKSDB_STATUS_BUSY;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      // May abort the server depending on the I/O error handling policy.
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }

    return ha_rocksdb::rdb_error_to_mysql(s);
  }
2500
  // Connection (THD) that owns this transaction.
  THD *get_thd() const { return m_thd; }
2502
2503 /* Used for tracking io_perf counters */
io_perf_start(Rdb_io_perf * const io_perf)2504 void io_perf_start(Rdb_io_perf *const io_perf) {
2505 /*
2506 Since perf_context is tracked per thread, it is difficult and expensive
2507 to maintain perf_context on a per table basis. Therefore, roll all
2508 perf_context data into the first table used in a query. This works well
2509 for single table queries and is probably good enough for queries that hit
2510 multiple tables.
2511
2512 perf_context stats gathering is started when the table lock is acquired
2513 or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
2514 are recorded when the table lock is released, or when commit/rollback
2515 is called on the transaction, whichever comes first. Table lock release
2516 and commit/rollback can happen in different orders. In the case where
2517 the lock is released before commit/rollback is called, an extra step to
2518 gather stats during commit/rollback is needed.
2519 */
2520 if (m_tbl_io_perf == nullptr &&
2521 io_perf->start(rocksdb_perf_context_level(m_thd))) {
2522 m_tbl_io_perf = io_perf;
2523 }
2524 }
2525
io_perf_end_and_record(void)2526 void io_perf_end_and_record(void) {
2527 if (m_tbl_io_perf != nullptr) {
2528 m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
2529 m_tbl_io_perf = nullptr;
2530 }
2531 }
2532
io_perf_end_and_record(Rdb_io_perf * const io_perf)2533 void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
2534 if (m_tbl_io_perf == io_perf) {
2535 io_perf_end_and_record();
2536 }
2537 }
2538
update_bytes_written(ulonglong bytes_written)2539 void update_bytes_written(ulonglong bytes_written) {
2540 if (m_tbl_io_perf != nullptr) {
2541 m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd),
2542 bytes_written);
2543 }
2544 }
2545
  // Cache the session's lock-wait timeout and row-lock limit, and push the
  // timeout down to the underlying RocksDB transaction (if one exists).
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }

  virtual void set_lock_timeout(int timeout_sec_arg) = 0;
2553
  // Per-transaction operation counters; reset on commit/rollback.
  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_insert_count() const { return m_insert_count; }

  ulonglong get_update_count() const { return m_update_count; }

  ulonglong get_delete_count() const { return m_delete_count; }

  void incr_insert_count() { ++m_insert_count; }

  void incr_update_count() { ++m_update_count; }

  void incr_delete_count() { ++m_delete_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;
2578
commit_or_rollback()2579 bool commit_or_rollback() {
2580 bool res;
2581 if (m_is_tx_failed) {
2582 rollback();
2583 res = false;
2584 } else {
2585 res = commit();
2586 }
2587 return res;
2588 }
2589
commit()2590 bool commit() {
2591 if (get_write_count() == 0) {
2592 rollback();
2593 return false;
2594 } else if (m_rollback_only) {
2595 /*
2596 Transactions marked as rollback_only are expected to be rolled back at
2597 prepare(). But there are some exceptions like below that prepare() is
2598 never called and commit() is called instead.
2599 1. Binlog is disabled
2600 2. No modification exists in binlog cache for the transaction (#195)
2601 In both cases, rolling back transaction is safe. Nothing is written to
2602 binlog.
2603 */
2604 my_error(ER_ROLLBACK_ONLY, MYF(0));
2605 rollback();
2606 return true;
2607 } else {
2608 #ifdef MARIAROCKS_NOT_YET
2609 /*
2610 Storing binlog position inside MyRocks is needed only for restoring
2611 MyRocks from backups. This feature is not supported yet.
2612 */
2613 mysql_bin_log_commit_pos(m_thd, &m_mysql_log_offset,
2614 &m_mysql_log_file_name);
2615 binlog_manager.update(m_mysql_log_file_name, m_mysql_log_offset,
2616 get_write_batch());
2617 #endif
2618 return commit_no_binlog();
2619 }
2620 }
2621
  virtual void rollback() = 0;

  // Record a freshly acquired snapshot in the read options and timestamp it.
  // Clears the "delayed snapshot" flag since a snapshot now exists.
  void snapshot_created(const rocksdb::Snapshot *const snapshot) {
    DBUG_ASSERT(snapshot != nullptr);

    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }

  virtual void acquire_snapshot(bool acquire_now) = 0;
  virtual void release_snapshot() = 0;

  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }

 private:
  // The Rdb_sst_info structures we are currently loading. In a partitioned
  // table this can have more than one entry
  std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
  std::string m_curr_bulk_load_tablename;

  /* External merge sorts for bulk load: key ID -> merge sort instance */
  std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;
2645
2646 public:
get_key_merge(GL_INDEX_ID kd_gl_id,rocksdb::ColumnFamilyHandle * cf,Rdb_index_merge ** key_merge)2647 int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf,
2648 Rdb_index_merge **key_merge) {
2649 int res;
2650 auto it = m_key_merge.find(kd_gl_id);
2651 if (it == m_key_merge.end()) {
2652 m_key_merge.emplace(
2653 std::piecewise_construct, std::make_tuple(kd_gl_id),
2654 std::make_tuple(
2655 get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size),
2656 THDVAR(get_thd(), merge_combine_read_size),
2657 THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf));
2658 it = m_key_merge.find(kd_gl_id);
2659 if ((res = it->second.init()) != 0) {
2660 return res;
2661 }
2662 }
2663 *key_merge = &it->second;
2664 return HA_EXIT_SUCCESS;
2665 }
2666
2667 /* Finish bulk loading for all table handlers belongs to one connection */
finish_bulk_load(bool * is_critical_error=nullptr,int print_client_error=true)2668 int finish_bulk_load(bool *is_critical_error = nullptr,
2669 int print_client_error = true) {
2670 Ensure_cleanup cleanup([&]() {
2671 // Always clear everything regardless of success/failure
2672 m_curr_bulk_load.clear();
2673 m_curr_bulk_load_tablename.clear();
2674 m_key_merge.clear();
2675 });
2676
2677 int rc = 0;
2678 if (is_critical_error) {
2679 *is_critical_error = true;
2680 }
2681
2682 // PREPARE phase: finish all on-going bulk loading Rdb_sst_info and
2683 // collect all Rdb_sst_commit_info containing (SST files, cf)
2684 int rc2 = 0;
2685 std::vector<Rdb_sst_info::Rdb_sst_commit_info> sst_commit_list;
2686 sst_commit_list.reserve(m_curr_bulk_load.size());
2687
2688 for (auto &sst_info : m_curr_bulk_load) {
2689 Rdb_sst_info::Rdb_sst_commit_info commit_info;
2690
2691 // Commit the list of SST files and move it to the end of
2692 // sst_commit_list, effectively transfer the ownership over
2693 rc2 = sst_info->finish(&commit_info, print_client_error);
2694 if (rc2 && rc == 0) {
2695 // Don't return yet - make sure we finish all the SST infos
2696 rc = rc2;
2697 }
2698
2699 // Make sure we have work to do - we might be losing the race
2700 if (rc2 == 0 && commit_info.has_work()) {
2701 sst_commit_list.emplace_back(std::move(commit_info));
2702 DBUG_ASSERT(!commit_info.has_work());
2703 }
2704 }
2705
2706 if (rc) {
2707 return rc;
2708 }
2709
2710 // MERGING Phase: Flush the index_merge sort buffers into SST files in
2711 // Rdb_sst_info and collect all Rdb_sst_commit_info containing
2712 // (SST files, cf)
2713 if (!m_key_merge.empty()) {
2714 Ensure_cleanup malloc_cleanup([]() {
2715 /*
2716 Explicitly tell jemalloc to clean up any unused dirty pages at this
2717 point.
2718 See https://reviews.facebook.net/D63723 for more details.
2719 */
2720 purge_all_jemalloc_arenas();
2721 });
2722
2723 rocksdb::Slice merge_key;
2724 rocksdb::Slice merge_val;
2725 for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
2726 GL_INDEX_ID index_id = it->first;
2727 std::shared_ptr<const Rdb_key_def> keydef =
2728 ddl_manager.safe_find(index_id);
2729 std::string table_name = ddl_manager.safe_get_table_name(index_id);
2730
2731 // Unable to find key definition or table name since the
2732 // table could have been dropped.
2733 // TODO(herman): there is a race here between dropping the table
2734 // and detecting a drop here. If the table is dropped while bulk
2735 // loading is finishing, these keys being added here may
2736 // be missed by the compaction filter and not be marked for
2737 // removal. It is unclear how to lock the sql table from the storage
2738 // engine to prevent modifications to it while bulk load is occurring.
2739 if (keydef == nullptr) {
2740 if (is_critical_error) {
2741 // We used to set the error but simply ignores it. This follows
2742 // current behavior and we should revisit this later
2743 *is_critical_error = false;
2744 }
2745 return HA_ERR_KEY_NOT_FOUND;
2746 } else if (table_name.empty()) {
2747 if (is_critical_error) {
2748 // We used to set the error but simply ignores it. This follows
2749 // current behavior and we should revisit this later
2750 *is_critical_error = false;
2751 }
2752 return HA_ERR_NO_SUCH_TABLE;
2753 }
2754 const std::string &index_name = keydef->get_name();
2755 Rdb_index_merge &rdb_merge = it->second;
2756
2757 // Rdb_sst_info expects a denormalized table name in the form of
2758 // "./database/table"
2759 std::replace(table_name.begin(), table_name.end(), '.', '/');
2760 table_name = "./" + table_name;
2761 auto sst_info = std::make_shared<Rdb_sst_info>(
2762 rdb, table_name, index_name, rdb_merge.get_cf(),
2763 *rocksdb_db_options, THDVAR(get_thd(), trace_sst_api));
2764
2765 while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
2766 if ((rc2 = sst_info->put(merge_key, merge_val)) != 0) {
2767 rc = rc2;
2768
2769 // Don't return yet - make sure we finish the sst_info
2770 break;
2771 }
2772 }
2773
2774 // -1 => no more items
2775 if (rc2 != -1 && rc != 0) {
2776 rc = rc2;
2777 }
2778
2779 Rdb_sst_info::Rdb_sst_commit_info commit_info;
2780 rc2 = sst_info->finish(&commit_info, print_client_error);
2781 if (rc2 != 0 && rc == 0) {
2782 // Only set the error from sst_info->finish if finish failed and we
2783 // didn't fail before. In other words, we don't have finish's
2784 // success mask earlier failures
2785 rc = rc2;
2786 }
2787
2788 if (rc) {
2789 return rc;
2790 }
2791
2792 if (commit_info.has_work()) {
2793 sst_commit_list.emplace_back(std::move(commit_info));
2794 DBUG_ASSERT(!commit_info.has_work());
2795 }
2796 }
2797 }
2798
2799 // Early return in case we lost the race completely and end up with no
2800 // work at all
2801 if (sst_commit_list.size() == 0) {
2802 return rc;
2803 }
2804
2805 // INGEST phase: Group all Rdb_sst_commit_info by cf (as they might
2806 // have the same cf across different indexes) and call out to RocksDB
2807 // to ingest all SST files in one atomic operation
2808 rocksdb::IngestExternalFileOptions options;
2809 options.move_files = true;
2810 options.snapshot_consistency = false;
2811 options.allow_global_seqno = false;
2812 options.allow_blocking_flush = false;
2813
2814 std::map<rocksdb::ColumnFamilyHandle *, rocksdb::IngestExternalFileArg>
2815 arg_map;
2816
2817 // Group by column_family
2818 for (auto &commit_info : sst_commit_list) {
2819 if (arg_map.find(commit_info.get_cf()) == arg_map.end()) {
2820 rocksdb::IngestExternalFileArg arg;
2821 arg.column_family = commit_info.get_cf(),
2822 arg.external_files = commit_info.get_committed_files(),
2823 arg.options = options;
2824
2825 arg_map.emplace(commit_info.get_cf(), arg);
2826 } else {
2827 auto &files = arg_map[commit_info.get_cf()].external_files;
2828 files.insert(files.end(), commit_info.get_committed_files().begin(),
2829 commit_info.get_committed_files().end());
2830 }
2831 }
2832
2833 std::vector<rocksdb::IngestExternalFileArg> args;
2834 size_t file_count = 0;
2835 for (auto &cf_files_pair : arg_map) {
2836 args.push_back(cf_files_pair.second);
2837 file_count += cf_files_pair.second.external_files.size();
2838 }
2839
2840 const rocksdb::Status s = rdb->IngestExternalFiles(args);
2841 if (THDVAR(m_thd, trace_sst_api)) {
2842 // NO_LINT_DEBUG
2843 sql_print_information(
2844 "SST Tracing: IngestExternalFile '%zu' files returned %s", file_count,
2845 s.ok() ? "ok" : "not ok");
2846 }
2847
2848 if (!s.ok()) {
2849 if (print_client_error) {
2850 Rdb_sst_info::report_error_msg(s, nullptr);
2851 }
2852 return HA_ERR_ROCKSDB_BULK_LOAD;
2853 }
2854
2855 // COMMIT phase: mark everything as completed. This avoids SST file
2856 // deletion kicking in. Otherwise SST files would get deleted if this
2857 // entire operation is aborted
2858 for (auto &commit_info : sst_commit_list) {
2859 commit_info.commit();
2860 }
2861
2862 return rc;
2863 }
2864
start_bulk_load(ha_rocksdb * const bulk_load,std::shared_ptr<Rdb_sst_info> sst_info)2865 int start_bulk_load(ha_rocksdb *const bulk_load,
2866 std::shared_ptr<Rdb_sst_info> sst_info) {
2867 /*
2868 If we already have an open bulk load of a table and the name doesn't
2869 match the current one, close out the currently running one. This allows
2870 multiple bulk loads to occur on a partitioned table, but then closes
2871 them all out when we switch to another table.
2872 */
2873 DBUG_ASSERT(bulk_load != nullptr);
2874
2875 if (!m_curr_bulk_load.empty() &&
2876 bulk_load->get_table_basename() != m_curr_bulk_load_tablename) {
2877 const auto res = finish_bulk_load();
2878 if (res != HA_EXIT_SUCCESS) {
2879 return res;
2880 }
2881 }
2882
2883 /*
2884 This used to track ha_rocksdb handler objects, but those can be
2885 freed by the table cache while this was referencing them. Instead
2886 of tracking ha_rocksdb handler objects, this now tracks the
2887 Rdb_sst_info allocated, and both the ha_rocksdb handler and the
2888 Rdb_transaction both have shared pointers to them.
2889
2890 On transaction complete, it will commit each Rdb_sst_info structure found.
2891 If the ha_rocksdb object is freed, etc., it will also commit
2892 the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent.
2893 */
2894 m_curr_bulk_load.push_back(sst_info);
2895 m_curr_bulk_load_tablename = bulk_load->get_table_basename();
2896 return HA_EXIT_SUCCESS;
2897 }
2898
  // Number of Rdb_sst_info structures currently being bulk loaded.
  int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }
2900
get_rocksdb_tmpdir() const2901 const char *get_rocksdb_tmpdir() const {
2902 const char *tmp_dir = THDVAR(get_thd(), tmpdir);
2903
2904 /*
2905 We want to treat an empty string as nullptr, in these cases DDL operations
2906 will use the default --tmpdir passed to mysql instead.
2907 */
2908 if (tmp_dir != nullptr && *tmp_dir == '\0') {
2909 tmp_dir = nullptr;
2910 }
2911 return (tmp_dir);
2912 }
2913
  /*
    Flush the data accumulated so far. This assumes we're doing a bulk insert.

    @detail
    This should work like transaction commit, except that we don't
    synchronize with the binlog (there is no API that would allow to have
    binlog flush the changes accumulated so far and return its current
    position)

    @todo
    Add test coverage for what happens when somebody attempts to do bulk
    inserts while inside a multi-statement transaction.

    @return true on commit error, false otherwise (including "nothing to do")
  */
  bool flush_batch() {
    if (get_write_count() == 0) return false;

    /* Commit the current transaction */
    if (commit_no_binlog()) return true;

    /* Start another one */
    start_tx();
    return false;
  }
2937
set_auto_incr(const GL_INDEX_ID & gl_index_id,ulonglong curr_id)2938 void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) {
2939 m_auto_incr_map[gl_index_id] =
2940 std::max(m_auto_incr_map[gl_index_id], curr_id);
2941 }
2942
2943 #ifndef DBUG_OFF
get_auto_incr(const GL_INDEX_ID & gl_index_id)2944 ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) {
2945 if (m_auto_incr_map.count(gl_index_id) > 0) {
2946 return m_auto_incr_map[gl_index_id];
2947 }
2948 return 0;
2949 }
2950 #endif
2951
  // Write a key/value pair through the transaction (with locking).
  virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value,
                              const bool assume_tracked) = 0;
  virtual rocksdb::Status delete_key(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;
  virtual rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool assume_tracked) = 0;

  virtual bool has_modifications() const = 0;

  virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes will NOT be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_blind_write_batch() {
    return get_indexed_write_batch()->GetWriteBatch();
  }

  // Read through the transaction's snapshot.
  virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              rocksdb::PinnableSlice *const value) const = 0;
  // Read and lock the row (shared or exclusive, per `exclusive`).
  virtual rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool exclusive, const bool do_validate) = 0;
2981
get_iterator(rocksdb::ColumnFamilyHandle * const column_family,bool skip_bloom_filter,bool fill_cache,const rocksdb::Slice & eq_cond_lower_bound,const rocksdb::Slice & eq_cond_upper_bound,bool read_current=false,bool create_snapshot=true)2982 rocksdb::Iterator *get_iterator(
2983 rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
2984 bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound,
2985 const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
2986 bool create_snapshot = true) {
2987 // Make sure we are not doing both read_current (which implies we don't
2988 // want a snapshot) and create_snapshot which makes sure we create
2989 // a snapshot
2990 DBUG_ASSERT(column_family != nullptr);
2991 DBUG_ASSERT(!read_current || !create_snapshot);
2992
2993 if (create_snapshot) acquire_snapshot(true);
2994
2995 rocksdb::ReadOptions options = m_read_opts;
2996
2997 if (skip_bloom_filter) {
2998 options.total_order_seek = true;
2999 options.iterate_lower_bound = &eq_cond_lower_bound;
3000 options.iterate_upper_bound = &eq_cond_upper_bound;
3001 } else {
3002 // With this option, Iterator::Valid() returns false if key
3003 // is outside of the prefix bloom filter range set at Seek().
3004 // Must not be set to true if not using bloom filter.
3005 options.prefix_same_as_start = true;
3006 }
3007 options.fill_cache = fill_cache;
3008 if (read_current) {
3009 options.snapshot = nullptr;
3010 }
3011 return get_iterator(options, column_family);
3012 }
3013
  virtual bool is_tx_started() const = 0;
  virtual void start_tx() = 0;
  virtual void start_stmt() = 0;

 protected:
  // Non-virtual functions with actions to be done on transaction start and
  // commit.
  // Stamp the update time on every table this transaction wrote to, then
  // forget them.
  void on_commit() {
    time_t tm;
    tm = time(nullptr);
    for (auto &it : modified_tables) {
      it->m_update_time = tm;
    }
    modified_tables.clear();
  }
  // Discard the modified-tables set without touching update times.
  void on_rollback() {
    modified_tables.clear();
  }
 public:
  // Inform the transaction that this table was modified
  void log_table_write_op(Rdb_tbl_def *tbl) {
    modified_tables.insert(tbl);
  }
3037
  void set_initial_savepoint() {
    /*
      Set the initial savepoint. If the first statement in the transaction
      fails, we need something to roll back to, without rolling back the
      entire transaction.
    */
    do_set_savepoint();
    // Remember the write count so later savepoints are skipped if nothing
    // changed (see make_stmt_savepoint_permanent()).
    m_writes_at_last_savepoint = m_write_count;
  }
3047
  /*
    Called when a "top-level" statement inside a transaction completes
    successfully and its changes become part of the transaction's changes.

    @return HA_EXIT_SUCCESS, or HA_EXIT_FAILURE if popping the accumulated
            savepoints failed for a reason other than "none left".
  */
  int make_stmt_savepoint_permanent() {
    // Take another RocksDB savepoint only if we had changes since the last
    // one. This is very important for long transactions doing lots of
    // SELECTs.
    if (m_writes_at_last_savepoint != m_write_count) {
      rocksdb::WriteBatchBase *batch = get_write_batch();
      rocksdb::Status status = rocksdb::Status::NotFound();
      // Pop every accumulated savepoint; NotFound signals the stack is empty.
      while ((status = batch->PopSavePoint()) == rocksdb::Status::OK()) {
      }

      if (status != rocksdb::Status::NotFound()) {
        return HA_EXIT_FAILURE;
      }

      do_set_savepoint();
      m_writes_at_last_savepoint = m_write_count;
    }

    return HA_EXIT_SUCCESS;
  }
3072
  /*
    Rollback to the savepoint we've set before the last statement.
    No-op when the statement made no writes.
  */
  void rollback_to_stmt_savepoint() {
    if (m_writes_at_last_savepoint != m_write_count) {
      do_rollback_to_savepoint();
      /*
        RollbackToSavePoint "removes the most recent SetSavePoint()", so
        we need to set it again so that next statement can roll back to this
        stage.
        It's ok to do it here at statement end (instead of doing it at next
        statement start) because setting a savepoint is cheap.
      */
      do_set_savepoint();
      m_writes_at_last_savepoint = m_write_count;
    }
  }

  virtual void rollback_stmt() = 0;
3092
  // Mark (or clear) the transaction-failed flag consulted by
  // commit_or_rollback().
  void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }

  // A rollback-only transaction must not be prepared; raises
  // ER_ROLLBACK_ONLY in that case.
  bool can_prepare() const {
    if (m_rollback_only) {
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      return false;
    }
    return true;
  }
3102
rollback_to_savepoint(void * const savepoint)3103 int rollback_to_savepoint(void *const savepoint) {
3104 if (has_modifications()) {
3105 my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0));
3106 m_rollback_only = true;
3107 return HA_EXIT_FAILURE;
3108 }
3109 return HA_EXIT_SUCCESS;
3110 }
3111
  /*
    This is used by transactions started with "START TRANSACTION WITH "
    "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
    snapshot has to be created via DB::GetSnapshot(), not via Transaction
    API.
  */
  bool is_tx_read_only() const { return m_tx_read_only; }

  bool is_two_phase() const { return m_is_two_phase; }

  void set_tx_read_only(bool val) { m_tx_read_only = val; }
3123
  // Register the new transaction in the global list (under its mutex).
  explicit Rdb_transaction(THD *const thd)
      : m_thd(thd), m_tbl_io_perf(nullptr) {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.insert(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }

  // Unregister from the global transaction list.
  virtual ~Rdb_transaction() {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.erase(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
  // Base implementation: only Rdb_transaction_impl can be XA-prepared.
  virtual bool is_prepared() { return false; };
  virtual void detach_prepared_tx() {};
3138 };
3139
3140 /*
3141 This is a rocksdb transaction. Its members represent the current transaction,
3142 which consists of:
3143 - the snapshot
3144 - the changes we've made but are not seeing yet.
3145
3146 The changes are made to individual tables, which store them here and then
3147 this object commits them on commit.
3148 */
class Rdb_transaction_impl : public Rdb_transaction {
  // Underlying RocksDB pessimistic transaction; nullptr before start_tx(),
  // after release_tx(), or after detach_prepared_tx().
  rocksdb::Transaction *m_rocksdb_tx = nullptr;
  // A finished transaction object preserved for reuse by release_tx().
  rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;
 public:
  // Push the cached lock-wait timeout down to the RocksDB transaction.
  // NOTE(review): uses the m_timeout_sec member rather than the argument;
  // callers (set_params) assign the member first, so they agree — confirm
  // before calling this directly with a different value.
  void set_lock_timeout(int timeout_sec_arg) override {
    if (m_rocksdb_tx) {
      m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
    }
  }

  // Toggle fsync-on-commit on the underlying transaction's write options.
  void set_sync(bool sync) override {
    if (m_rocksdb_tx)
      m_rocksdb_tx->GetWriteOptions()->sync = sync;
  }

  // Release the lock on `rowkey` unless the session asked to keep locks on
  // scanned rows (@@rocksdb_lock_scanned_rows).
  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    if (!THDVAR(m_thd, lock_scanned_rows)) {
      m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
    }
  }

  virtual bool is_writebatch_trx() const override { return false; }

  // True when the underlying transaction reached the XA PREPARED state.
  bool is_prepared() override {
    return m_rocksdb_tx && rocksdb::Transaction::PREPARED == m_rocksdb_tx->GetState();
  }

  // Drop our reference to a PREPARED transaction so it survives this
  // object (e.g. for recovery); must only be called in that state.
  void detach_prepared_tx() override {
    DBUG_ASSERT(rocksdb::Transaction::PREPARED == m_rocksdb_tx->GetState());
    m_rocksdb_tx = nullptr;
  }
3182
 private:
  void release_tx(void) {
    // We are done with the current active transaction object. Preserve it
    // for later reuse.
    DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }

  /*
    XA prepare: name the transaction, persist the cached AUTO_INCREMENT
    values into its write batch, and call RocksDB Prepare(). Any failure is
    routed through rdb_handle_io_error() and reported as false.
  */
  bool prepare(const rocksdb::TransactionName &name) override {
    rocksdb::Status s;
    s = m_rocksdb_tx->SetName(name);
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = m_rocksdb_tx->Prepare();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }
    return true;
  }
3213
  /*
    Commit the RocksDB transaction (no binlog coordination).
    Resets all per-transaction counters whether or not the commit succeeded.
    @return true on error, false on success
  */
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();
    s = m_rocksdb_tx->Commit();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    on_commit();
  error:
    // NOTE(review): on the success path this on_rollback() runs right after
    // on_commit(); it is benign since on_commit() already cleared
    // modified_tables, but confirm this fall-through is intentional.
    on_rollback();
    /* Save the transaction object to be reused */
    release_tx();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }
3248
 public:
  /*
    Roll back the transaction: reset counters, drop cached AUTO_INCREMENT
    values, release the snapshot and all row locks, and stash the RocksDB
    transaction object for reuse.
  */
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }
3271
  /*
    Acquire a read snapshot if one is not already set.
    - read-only tx: take a plain DB snapshot (not via the Transaction API);
    - acquire_now:  take the transaction snapshot immediately;
    - otherwise:    defer it to the next operation (delayed snapshot).
  */
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) {
#ifdef MARIAROCKS_NOT_YET
      const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
          m_thd->get_explicit_snapshot());
      if (thd_ss) {
        m_explicit_snapshot = thd_ss;
      }
      if (m_explicit_snapshot) {
        auto snapshot = m_explicit_snapshot->get_snapshot()->snapshot();
        snapshot_created(snapshot);
      } else
#endif
      if (is_tx_read_only()) {
        snapshot_created(rdb->GetSnapshot());
      } else if (acquire_now) {
        m_rocksdb_tx->SetSnapshot();
        snapshot_created(m_rocksdb_tx->GetSnapshot());
      } else if (!m_is_delayed_snapshot) {
        m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
        m_is_delayed_snapshot = true;
      }
    }
  }
3296
  /*
    Release the read snapshot, if any.  Read-only snapshots were taken
    directly from the DB and are returned to it; transaction-owned
    snapshots are cleared via rocksdb::Transaction::ClearSnapshot().
  */
  void release_snapshot() override {
    bool need_clear = m_is_delayed_snapshot;

    if (m_read_opts.snapshot != nullptr) {
      m_snapshot_timestamp = 0;
#ifdef MARIAROCKS_NOT_YET
      if (m_explicit_snapshot) {
        m_explicit_snapshot.reset();
        need_clear = false;
      } else
#endif
      if (is_tx_read_only()) {
        rdb->ReleaseSnapshot(m_read_opts.snapshot);
        need_clear = false;
      } else {
        need_clear = true;
      }
      m_read_opts.snapshot = nullptr;
    }

    if (need_clear && m_rocksdb_tx != nullptr) m_rocksdb_tx->ClearSnapshot();
  }
3319
has_snapshot()3320 bool has_snapshot() { return m_read_opts.snapshot != nullptr; }
3321
put(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const rocksdb::Slice & value,const bool assume_tracked)3322 rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
3323 const rocksdb::Slice &key, const rocksdb::Slice &value,
3324 const bool assume_tracked) override {
3325 ++m_write_count;
3326 ++m_lock_count;
3327 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
3328 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
3329 }
3330 return m_rocksdb_tx->Put(column_family, key, value, assume_tracked);
3331 }
3332
delete_key(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const bool assume_tracked)3333 rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
3334 const rocksdb::Slice &key,
3335 const bool assume_tracked) override {
3336 ++m_write_count;
3337 ++m_lock_count;
3338 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
3339 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
3340 }
3341 return m_rocksdb_tx->Delete(column_family, key, assume_tracked);
3342 }
3343
single_delete(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,const bool assume_tracked)3344 rocksdb::Status single_delete(
3345 rocksdb::ColumnFamilyHandle *const column_family,
3346 const rocksdb::Slice &key, const bool assume_tracked) override {
3347 ++m_write_count;
3348 ++m_lock_count;
3349 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) {
3350 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
3351 }
3352 return m_rocksdb_tx->SingleDelete(column_family, key, assume_tracked);
3353 }
3354
has_modifications() const3355 bool has_modifications() const override {
3356 return m_rocksdb_tx->GetWriteBatch() &&
3357 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
3358 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
3359 }
3360
get_write_batch()3361 rocksdb::WriteBatchBase *get_write_batch() override {
3362 if (is_two_phase()) {
3363 return m_rocksdb_tx->GetCommitTimeWriteBatch();
3364 }
3365 return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
3366 }
3367
3368 /*
3369 Return a WriteBatch that one can write to. The writes will skip any
3370 transaction locking. The writes WILL be visible to the transaction.
3371 */
get_indexed_write_batch()3372 rocksdb::WriteBatchBase *get_indexed_write_batch() override {
3373 ++m_write_count;
3374 return m_rocksdb_tx->GetWriteBatch();
3375 }
3376
get(rocksdb::ColumnFamilyHandle * const column_family,const rocksdb::Slice & key,rocksdb::PinnableSlice * const value) const3377 rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
3378 const rocksdb::Slice &key,
3379 rocksdb::PinnableSlice *const value) const override {
3380 // clean PinnableSlice right begfore Get() for multiple gets per statement
3381 // the resources after the last Get in a statement are cleared in
3382 // handler::reset call
3383 value->Reset();
3384 global_stats.queries[QUERIES_POINT].inc();
3385 return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
3386 }
3387
  /*
    Read a row and acquire a row lock on it.

    @param exclusive    take an exclusive (write) lock instead of a shared one
    @param do_validate  whether RocksDB should validate the read against the
                        transaction snapshot
    @return kLockLimit-aborted status when m_max_row_locks is exceeded,
            otherwise the GetForUpdate() status.
  */
  rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool exclusive, const bool do_validate) override {
    if (++m_lock_count > m_max_row_locks) {
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    }

    if (value != nullptr) {
      value->Reset();
    }
    rocksdb::Status s;
    // If snapshot is null, pass it to GetForUpdate and snapshot is
    // initialized there. Snapshot validation is skipped in that case.
    if (m_read_opts.snapshot == nullptr || do_validate) {
      s = m_rocksdb_tx->GetForUpdate(
          m_read_opts, column_family, key, value, exclusive,
          m_read_opts.snapshot ? do_validate : false);
    } else {
      // If snapshot is set, and if skipping validation,
      // call GetForUpdate without validation and set back old snapshot.
      // The save/restore of m_read_opts.snapshot is deliberate: GetForUpdate
      // must not see the snapshot, but the transaction keeps using it.
      auto saved_snapshot = m_read_opts.snapshot;
      m_read_opts.snapshot = nullptr;
      s = m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                     exclusive, false);
      m_read_opts.snapshot = saved_snapshot;
    }
    return s;
  }
3417
get_iterator(const rocksdb::ReadOptions & options,rocksdb::ColumnFamilyHandle * const column_family)3418 rocksdb::Iterator *get_iterator(
3419 const rocksdb::ReadOptions &options,
3420 rocksdb::ColumnFamilyHandle *const column_family) override {
3421 global_stats.queries[QUERIES_RANGE].inc();
3422 return m_rocksdb_tx->GetIterator(options, column_family);
3423 }
3424
  // Non-owning access to the underlying rocksdb::Transaction (may be null).
  const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
3426
  // The transaction is "started" once a rocksdb::Transaction object exists.
  bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }
3428
  /*
    Begin a RocksDB pessimistic transaction for this session.

    Transaction and write options are rebuilt from the current session
    variables on every start.  A transaction object cached by a previous
    commit/rollback (m_rocksdb_reuse_tx) is reused when available.
  */
  void start_tx() override {
    rocksdb::TransactionOptions tx_opts;
    rocksdb::WriteOptions write_opts;
    tx_opts.set_snapshot = false;
    tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
    tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
    tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
    // If this variable is set, this will write commit time write batch
    // information on recovery or memtable flush.
    tx_opts.use_only_the_last_commit_time_batch_for_recovery =
        THDVAR(m_thd, commit_time_batch_for_recovery);
    tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);

    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);
    m_is_two_phase = rocksdb_enable_2pc;

    commit_ordered_done= false;

    /*
      If m_rocksdb_reuse_tx is null this will create a new transaction object.
      Otherwise it will reuse the existing one.
    */
    m_rocksdb_tx =
        rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
    m_rocksdb_reuse_tx = nullptr;

    // Fresh ReadOptions: any stale snapshot pointer is dropped here.
    m_read_opts = rocksdb::ReadOptions();

    set_initial_savepoint();

    m_ddl_transaction = false;
  }
3464
  /* Implementations of do_*savepoint based on rocksdb::Transaction savepoints
   */
  // Delegate savepoint creation to the rocksdb::Transaction.
  void do_set_savepoint() override { m_rocksdb_tx->SetSavePoint(); }
3468
  // Delegate savepoint rollback to the rocksdb::Transaction.
  void do_rollback_to_savepoint() override {
    m_rocksdb_tx->RollbackToSavePoint();
  }
3472
3473 /*
3474 Start a statement inside a multi-statement transaction.
3475
3476 @todo: are we sure this is called once (and not several times) per
3477 statement start?
3478
3479 For hooking to start of statement that is its own transaction, see
3480 ha_rocksdb::external_lock().
3481 */
  // Begin a statement inside a multi-statement transaction: only arrange for
  // a snapshot to be taken lazily on the next RocksDB operation.
  void start_stmt() override {
    // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
    acquire_snapshot(false);
  }
3486
3487 /*
3488 This must be called when last statement is rolled back, but the transaction
3489 continues
3490 */
  // Roll back only the last statement (to the statement savepoint) while the
  // transaction continues; re-syncs m_read_opts.snapshot with the snapshot
  // the rocksdb::Transaction ends up with after the savepoint rollback.
  void rollback_stmt() override {
    /* TODO: here we must release the locks taken since the start_stmt() call */
    if (m_rocksdb_tx) {
      const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
      rollback_to_stmt_savepoint();

      const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
      // Rolling back the savepoint may have changed or dropped the snapshot.
      if (org_snapshot != cur_snapshot) {
        if (org_snapshot != nullptr) m_snapshot_timestamp = 0;

        m_read_opts.snapshot = cur_snapshot;
        if (cur_snapshot != nullptr) {
          rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
        } else {
          // Snapshot was dropped: fall back to delayed (lazy) acquisition.
          m_is_delayed_snapshot = true;
        }
      }
    }
  }
3510
  // The rocksdb::Transaction is created later by start_tx(); only the
  // snapshot notifier is set up here.
  explicit Rdb_transaction_impl(THD *const thd)
      : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
    // Create a notifier that can be called when a snapshot gets generated.
    m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
  }
3516
  virtual ~Rdb_transaction_impl() override {
    // Roll back anything still pending; this also parks the transaction
    // object in m_rocksdb_reuse_tx via release_tx() and nulls m_rocksdb_tx.
    rollback();

    // Theoretically the notifier could outlive the Rdb_transaction_impl
    // (because of the shared_ptr), so let it know it can't reference
    // the transaction anymore.
    m_notifier->detach();

    // Free any transaction memory that is still hanging around.
    delete m_rocksdb_reuse_tx;
    DBUG_ASSERT(m_rocksdb_tx == nullptr);
  }
3529 };
3530
3531 /* This is a rocksdb write batch. This class doesn't hold or wait on any
3532 transaction locks (skips rocksdb transaction API) thus giving better
3533 performance.
3534
   Currently this is only used for replication threads which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through thoroughly.
3538 */
class Rdb_writebatch_impl : public Rdb_transaction {
  // The batch itself; owned by this object, allocated in the ctor.
  rocksdb::WriteBatchWithIndex *m_batch;
  rocksdb::WriteOptions write_opts;

  // Called after commit/rollback: clear the batch and per-tx read state.
  void reset() {
    m_batch->Clear();
    m_read_opts = rocksdb::ReadOptions();
    m_ddl_transaction = false;
  }

 private:
  // A raw write batch has no 2PC prepare phase; always succeeds.
  bool prepare(const rocksdb::TransactionName &name) override { return true; }

  /*
    Write the batch straight to the DB (no binlog interaction, concurrency
    control skipped).  @return false on success, true on error.
  */
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;
    rocksdb::TransactionDBWriteOptimizations optimize;
    // Safe because users of this class are guaranteed non-conflicting
    // (see the class-level comment above).
    optimize.skip_concurrency_control = true;

    s = merge_auto_incr_map(m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();

    s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }
    on_commit();
  error:
    // NOTE(review): runs on the success path too (fall-through); presumed
    // harmless after on_commit() -- confirm before changing.
    on_rollback();
    reset();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

  /* Implementations of do_*savepoint based on rocksdb::WriteBatch savepoints */
  void do_set_savepoint() override { m_batch->SetSavePoint(); }

  void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); }


 public:
  bool is_writebatch_trx() const override { return true; }

  void set_lock_timeout(int timeout_sec_arg) override {
    // Nothing to do here.
  }

  void set_sync(bool sync) override { write_opts.sync = sync; }

  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    // Nothing to do here since we don't hold any row locks.
  }

  // Discard everything buffered in the batch and reset counters.
  void rollback() override {
    on_rollback();
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    release_snapshot();

    reset();
    set_tx_read_only(false);
    m_rollback_only = false;
  }

  // Always takes the snapshot immediately; acquire_now is ignored because
  // there is no rocksdb::Transaction to defer through.
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) snapshot_created(rdb->GetSnapshot());
  }

  void release_snapshot() override {
    if (m_read_opts.snapshot != nullptr) {
      rdb->ReleaseSnapshot(m_read_opts.snapshot);
      m_read_opts.snapshot = nullptr;
    }
  }

  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key, const rocksdb::Slice &value,
                      const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Put(column_family, key, value);
    // Note Put/Delete in write batch doesn't return any error code. We simply
    // return OK here.
    return rocksdb::Status::OK();
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key,
                             const bool assume_tracked) override {
    ++m_write_count;
    m_batch->Delete(column_family, key);
    return rocksdb::Status::OK();
  }

  rocksdb::Status single_delete(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, const bool /* assume_tracked */) override {
    ++m_write_count;
    m_batch->SingleDelete(column_family, key);
    return rocksdb::Status::OK();
  }

  bool has_modifications() const override {
    return m_batch->GetWriteBatch()->Count() > 0;
  }

  rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }

  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_batch;
  }

  // Read through the batch first, then the DB, so this transaction's own
  // uncommitted writes are visible.
  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    value->Reset();
    return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
                                      value);
  }

  // No row locking in this class: get_for_update degenerates to a plain get.
  rocksdb::Status get_for_update(
      rocksdb::ColumnFamilyHandle *const column_family,
      const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
      bool /* exclusive */, const bool /* do_validate */) override {
    if (value == nullptr) {
      rocksdb::PinnableSlice pin_val;
      rocksdb::Status s = get(column_family, key, &pin_val);
      pin_val.Reset();
      return s;
    }

    return get(column_family, key, value);
  }

  rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle *const /* column_family */) override {
    const auto it = rdb->NewIterator(options);
    return m_batch->NewIteratorWithBase(it);
  }

  bool is_tx_started() const override { return (m_batch != nullptr); }

  void start_tx() override {
    commit_ordered_done= false; // Do we need this here?
    reset();
    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);

    set_initial_savepoint();
  }

  void start_stmt() override {}

  void rollback_stmt() override {
    if (m_batch) rollback_to_stmt_savepoint();
  }

  explicit Rdb_writebatch_impl(THD *const thd)
      : Rdb_transaction(thd), m_batch(nullptr) {
    m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
                                               true);
  }

  virtual ~Rdb_writebatch_impl() override {
    rollback();
    delete m_batch;
  }
};
3728
SnapshotCreated(const rocksdb::Snapshot * const snapshot)3729 void Rdb_snapshot_notifier::SnapshotCreated(
3730 const rocksdb::Snapshot *const snapshot) {
3731 if (m_owning_tx != nullptr) {
3732 m_owning_tx->snapshot_created(snapshot);
3733 }
3734 }
3735
// Registry of all live transactions; presumably guarded by s_tx_list_mutex
// (verify against the accessors elsewhere in this file).
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3738
get_tx_from_thd(THD * const thd)3739 static Rdb_transaction *get_tx_from_thd(THD *const thd) {
3740 return reinterpret_cast<Rdb_transaction *>(
3741 my_core::thd_get_ha_data(thd, rocksdb_hton));
3742 }
3743
3744 namespace {
3745
3746 class Rdb_perf_context_guard {
3747 Rdb_io_perf m_io_perf;
3748 Rdb_io_perf *m_io_perf_ptr;
3749 Rdb_transaction *m_tx;
3750 uint m_level;
3751
3752 public:
3753 Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
3754 Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;
3755
Rdb_perf_context_guard(Rdb_io_perf * io_perf,uint level)3756 explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
3757 : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
3758 m_io_perf_ptr->start(m_level);
3759 }
3760
Rdb_perf_context_guard(Rdb_transaction * tx,uint level)3761 explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
3762 : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
3763 /*
3764 if perf_context information is already being recorded, this becomes a
3765 no-op
3766 */
3767 if (tx != nullptr) {
3768 tx->io_perf_start(&m_io_perf);
3769 }
3770 }
3771
~Rdb_perf_context_guard()3772 ~Rdb_perf_context_guard() {
3773 if (m_tx != nullptr) {
3774 m_tx->io_perf_end_and_record();
3775 } else if (m_io_perf_ptr != nullptr) {
3776 m_io_perf_ptr->end_and_record(m_level);
3777 }
3778 }
3779 };
3780
3781 } // anonymous namespace
3782
3783 /*
3784 TODO: maybe, call this in external_lock() and store in ha_rocksdb..
3785 */
3786
get_or_create_tx(THD * const thd)3787 static Rdb_transaction *get_or_create_tx(THD *const thd) {
3788 Rdb_transaction *tx = get_tx_from_thd(thd);
3789 // TODO: this is called too many times.. O(#rows)
3790 if (tx == nullptr) {
3791 bool rpl_skip_tx_api= false; // MARIAROCKS_NOT_YET.
3792 if ((rpl_skip_tx_api && thd->rgi_slave) ||
3793 (THDVAR(thd, master_skip_tx_api) && !thd->rgi_slave))
3794 {
3795 tx = new Rdb_writebatch_impl(thd);
3796 } else {
3797 tx = new Rdb_transaction_impl(thd);
3798 }
3799 tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3800 tx->start_tx();
3801 my_core::thd_set_ha_data(thd, rocksdb_hton, tx);
3802 } else {
3803 tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3804 if (!tx->is_tx_started()) {
3805 tx->start_tx();
3806 }
3807 }
3808
3809 return tx;
3810 }
3811
rocksdb_close_connection(handlerton * const hton,THD * const thd)3812 static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
3813 Rdb_transaction *tx = get_tx_from_thd(thd);
3814 if (tx != nullptr) {
3815 bool is_critical_error;
3816 int rc = tx->finish_bulk_load(&is_critical_error, false);
3817 if (rc != 0 && is_critical_error) {
3818 // NO_LINT_DEBUG
3819 sql_print_error(
3820 "RocksDB: Error %d finalizing last SST file while "
3821 "disconnecting",
3822 rc);
3823 }
3824 if (tx->is_prepared())
3825 tx->detach_prepared_tx();
3826 delete tx;
3827 }
3828 return HA_EXIT_SUCCESS;
3829 }
3830
3831 /*
3832 * Serializes an xid to a string so that it can
3833 * be used as a rocksdb transaction name
3834 */
/*
 * Serializes an xid to a string so that it can
 * be used as a rocksdb transaction name
 *
 * Layout: 8-byte formatID (network order, as unsigned bits), 1 byte
 * gtrid_length, 1 byte bqual_length, then gtrid+bqual payload.
 * rdb_xid_from_string() is the inverse.
 */
static std::string rdb_xid_to_string(const XID &src) {
  DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);

  std::string buf;
  buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);

  /*
   * expand formatID to fill 8 bytes if it doesn't already
   * then reinterpret bit pattern as unsigned and store in network order
   */
  uchar fidbuf[RDB_FORMATID_SZ];
  int64 signed_fid8 = src.formatID;
  const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
  rdb_netbuf_store_uint64(fidbuf, raw_fid8);
  buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);

  buf.push_back(src.gtrid_length);
  buf.push_back(src.bqual_length);
  buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
  return buf;
}
3857
3858 #if 0
3859 // MARIAROCKS: MariaDB doesn't have flush_wal method
3860 /**
3861 Called by hton->flush_logs after MySQL group commit prepares a set of
3862 transactions.
3863 */
3864 static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__)))
3865 DBUG_ASSERT(rdb != nullptr);
3866
3867 rocksdb::Status s;
3868 /*
3869 target_lsn is set to 0 when MySQL wants to sync the wal files
3870 */
3871 if ((target_lsn == 0 && !rocksdb_db_options->allow_mmap_writes) ||
3872 rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
3873 rocksdb_wal_group_syncs++;
3874 s = rdb->FlushWAL(target_lsn == 0 ||
3875 rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3876 }
3877
3878 if (!s.ok()) {
3879 rdb_log_status_error(s);
3880 return HA_EXIT_FAILURE;
3881 }
3882 return HA_EXIT_SUCCESS;
3883 }
3884 #endif
3885
3886 /**
3887 For a slave, prepare() updates the slave_gtid_info table which tracks the
3888 replication progress.
3889 */
static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
{
  bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql

  // NOTE(review): tx is dereferenced without a null check; presumably the
  // server only calls prepare() for sessions that have a transaction.
  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (!tx->can_prepare()) {
    return HA_EXIT_FAILURE;
  }
  if (prepare_tx ||
      (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
    /* We were instructed to prepare the whole transaction, or
    this is an SQL statement end and autocommit is on */

#ifdef MARIAROCKS_NOT_YET
    /*
      Storing binlog position inside MyRocks is needed only for restoring
      MyRocks from backups. This feature is not supported yet.
    */
    std::vector<st_slave_gtid_info> slave_gtid_info;
    my_core::thd_slave_gtid_info(thd, &slave_gtid_info);
    for (const auto &it : slave_gtid_info) {
      rocksdb::WriteBatchBase *const write_batch = tx->get_blind_write_batch();
      binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch);
    }
#endif

    if (tx->is_two_phase()) {

      /*
        MariaDB: the following branch is never taken.
        We always flush at Prepare and rely on RocksDB's internal Group Commit
        to do some grouping.
      */
      if (thd->durability_property == HA_IGNORE_DURABILITY || async) {
        tx->set_sync(false);
      }

      /*
        MariaDB: do not flush logs if we are running in a non-crash-safe mode.
      */
      if (!rocksdb_flush_log_at_trx_commit)
        tx->set_sync(false);

      // Name the RocksDB transaction after the serialized XID so it can be
      // found again by rocksdb_commit_by_xid()/rocksdb_recover().
      XID xid;
      thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
      if (!tx->prepare(rdb_xid_to_string(xid))) {
        return HA_EXIT_FAILURE;
      }

      /*
        MariaDB: our Group Commit implementation does not use the
        hton->flush_logs call (at least currently) so the following is not
        needed (TODO: will we need this for binlog rotation?)
      */
#ifdef MARIAROCKS_NOT_YET
      if (thd->durability_property == HA_IGNORE_DURABILITY )
          (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER))
          &&
          THDVAR(thd, flush_log_at_trx_commit))
#endif
#ifdef MARIAROCKS_NOT_YET
      {
        // MariaRocks: disable the
        // "write/sync redo log before flushing binlog cache to file"
        // feature. See a869c56d361bb44f46c0efeb11a8f03561676247
        /**
          we set the log sequence as '1' just to trigger hton->flush_logs
        */
        thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB);
      }
#endif
    }

    DEBUG_SYNC(thd, "rocksdb.prepared");
  } else {
    // Statement end inside a bigger transaction: just pin the statement
    // savepoint so rollback_stmt() can no longer undo this statement.
    tx->make_stmt_savepoint_permanent();
  }
  return HA_EXIT_SUCCESS;
}
3969
3970 /**
3971 do nothing for prepare/commit by xid
3972 this is needed to avoid crashes in XA scenarios
3973 */
rocksdb_commit_by_xid(handlerton * const hton,XID * const xid)3974 static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
3975 DBUG_ENTER_FUNC();
3976
3977 DBUG_ASSERT(hton != nullptr);
3978 DBUG_ASSERT(xid != nullptr);
3979 DBUG_ASSERT(commit_latency_stats != nullptr);
3980
3981 rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
3982
3983 const auto name = rdb_xid_to_string(*xid);
3984 DBUG_ASSERT(!name.empty());
3985
3986 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
3987
3988 if (trx == nullptr) {
3989 DBUG_RETURN(HA_EXIT_FAILURE);
3990 }
3991
3992 const rocksdb::Status s = trx->Commit();
3993
3994 if (!s.ok()) {
3995 rdb_log_status_error(s);
3996 DBUG_RETURN(HA_EXIT_FAILURE);
3997 }
3998
3999 delete trx;
4000
4001 // `Add()` is implemented in a thread-safe manner.
4002 commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
4003
4004 DBUG_RETURN(HA_EXIT_SUCCESS);
4005 }
4006
rocksdb_rollback_by_xid(handlerton * const hton MY_ATTRIBUTE ((__unused__)),XID * const xid)4007 static int rocksdb_rollback_by_xid(
4008 handlerton *const hton MY_ATTRIBUTE((__unused__)), XID *const xid) {
4009 DBUG_ENTER_FUNC();
4010
4011 DBUG_ASSERT(hton != nullptr);
4012 DBUG_ASSERT(xid != nullptr);
4013 DBUG_ASSERT(rdb != nullptr);
4014
4015 const auto name = rdb_xid_to_string(*xid);
4016
4017 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
4018
4019 if (trx == nullptr) {
4020 DBUG_RETURN(HA_EXIT_FAILURE);
4021 }
4022
4023 const rocksdb::Status s = trx->Rollback();
4024
4025 if (!s.ok()) {
4026 rdb_log_status_error(s);
4027 DBUG_RETURN(HA_EXIT_FAILURE);
4028 }
4029
4030 delete trx;
4031
4032 DBUG_RETURN(HA_EXIT_SUCCESS);
4033 }
4034
4035 /**
4036 Rebuilds an XID from a serialized version stored in a string.
4037 */
/**
  Rebuilds an XID from a serialized version stored in a string.
  Inverse of rdb_xid_to_string(): 8-byte formatID (network order),
  1 byte gtrid_length, 1 byte bqual_length, then the gtrid+bqual payload.
*/
static void rdb_xid_from_string(const std::string &src, XID *const dst) {
  DBUG_ASSERT(dst != nullptr);
  uint offset = 0;
  // Decode the formatID: unsigned bits in network order, reinterpreted
  // back to the original signed value.
  uint64 raw_fid8 =
      rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
  const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
  dst->formatID = signed_fid8;
  offset += RDB_FORMATID_SZ;
  dst->gtrid_length = src.at(offset);
  offset += RDB_GTRID_SZ;
  dst->bqual_length = src.at(offset);
  offset += RDB_BQUAL_SZ;

  DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);

  // Zero-fill, then copy the gtrid+bqual payload that follows the header.
  memset(dst->data, 0, XIDDATASIZE);
  src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length),
           RDB_XIDHDR_LEN);
}
4058
4059 /**
4060 Reading last committed binary log info from RocksDB system row.
4061 The info is needed for crash safe slave/master to work.
4062 */
static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len)
#ifdef MARIAROCKS_NOT_YET
                           char* const binlog_file,
                           my_off_t *const binlog_pos,
                           Gtid *const binlog_max_gtid) {
#endif
{
#ifdef MARIAROCKS_NOT_YET
  if (binlog_file && binlog_pos) {
    char file_buf[FN_REFLEN + 1] = {0};
    my_off_t pos;
    char gtid_buf[FN_REFLEN + 1] = {0};
    if (binlog_manager.read(file_buf, &pos, gtid_buf)) {
      if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) {
        memcpy(binlog_file, file_buf, FN_REFLEN + 1);
        *binlog_pos = pos;
        // NO_LINT_DEBUG
        fprintf(stderr,
                "RocksDB: Last binlog file position %llu,"
                " file name %s\n",
                pos, file_buf);
        if (*gtid_buf) {
          global_sid_lock->rdlock();
          binlog_max_gtid->parse(global_sid_map, gtid_buf);
          global_sid_lock->unlock();
          // NO_LINT_DEBUG
          fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n", gtid_buf);
        }
      }
    }
  }
#endif

  if (len == 0 || xid_list == nullptr) {
    return HA_EXIT_SUCCESS;
  }

  // Collect the XIDs of all transactions left in the prepared state so the
  // server can decide to commit or roll them back during crash recovery.
  std::vector<rocksdb::Transaction *> trans_list;
  rdb->GetAllPreparedTransactions(&trans_list);

  uint count = 0;
  for (auto &trans : trans_list) {
    // Report at most `len` XIDs, as requested by the caller.
    if (count >= len) {
      break;
    }
    // Transaction names were produced by rdb_xid_to_string() at prepare time.
    auto name = trans->GetName();
    rdb_xid_from_string(name, &xid_list[count]);
    count++;
  }
  return count;
}
4114
4115
4116 /*
4117 Handle a commit checkpoint request from server layer.
4118
4119 InnoDB does this:
4120 We put the request in a queue, so that we can notify upper layer about
4121 checkpoint complete when we have flushed the redo log.
4122 If we have already flushed all relevant redo log, we notify immediately.
4123
4124 MariaRocks just flushes everything right away ATM
4125 */
4126
4127 static void rocksdb_checkpoint_request(void *cookie)
4128 {
4129 const rocksdb::Status s= rdb->FlushWAL(true);
4130 //TODO: what to do on error?
4131 if (s.ok())
4132 {
4133 rocksdb_wal_group_syncs++;
4134 commit_checkpoint_notify_ha(cookie);
4135 }
4136 }
4137
4138 /*
4139 @param all: TRUE - commit the transaction
4140 FALSE - SQL statement ended
4141 */
/*
  Commit in the server-defined commit order (group commit stage).

  @param all: TRUE - commit the transaction
              FALSE - SQL statement ended
*/
static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all)
{
  // Same assert as InnoDB has
  DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
                                        OPTION_BEGIN)));
  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (!tx->is_two_phase()) {
    /*
      ordered_commit is supposedly slower as it is done sequentially
      in order to preserve commit order.

      if we are not required do 2-phase commit with the binlog, do not do
      anything here.
    */
    return;
  }

  // Commit without syncing the WAL here; this stage only establishes order.
  tx->set_sync(false);

  /* This will note the master position also */
  // The result/flag pair is picked up later by rocksdb_commit().
  tx->commit_ordered_res= tx->commit();
  tx->commit_ordered_done= true;

}
4166
4167
/*
  Commit hook: commits the transaction (or just the statement), honoring any
  commit already performed by rocksdb_commit_ordered().
*/
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
{
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(hton != nullptr);
  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(commit_latency_stats != nullptr);

  rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);

  /* note: h->external_lock(F_UNLCK) is called after this function is called) */
  Rdb_transaction *tx = get_tx_from_thd(thd);

  /* this will trigger saving of perf_context information */
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (commit_tx || (!my_core::thd_test_options(
                         thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
      /*
        This will not add anything to commit_latency_stats, and this is correct
        right?
      */
      if (tx->commit_ordered_done)
      {
        // The actual commit already happened in rocksdb_commit_ordered();
        // only propagate its result.
        thd_wakeup_subsequent_commits(thd, 0);
        DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
      }

      /*
        We get here
        - For a COMMIT statement that finishes a multi-statement transaction
        - For a statement that has its own transaction
      */
      if (thd->slave_thread)
      {
        // An attempt to make parallel slave performant (not fully successful,
        // see MDEV-15372):

        // First, commit without syncing. This establishes the commit order
        tx->set_sync(false);
        bool tx_had_writes = tx->get_write_count()? true : false ;
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
        thd_wakeup_subsequent_commits(thd, 0);

        // Then sync the WAL separately, but only when the transaction
        // actually wrote something and sync-on-commit is configured.
        if (tx_had_writes && rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC)
        {
          rocksdb::Status s= rdb->FlushWAL(true);
          if (!s.ok())
            DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
        }
      }
      else
      {
        /* Not a slave thread */
        if (tx->commit()) {
          DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
        }
      }
    } else {
      /*
        We get here when committing a statement within a transaction.
      */
      tx->make_stmt_savepoint_permanent();
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }

  // `Add()` is implemented in a thread-safe manner.
  commit_latency_stats->Add(timer.ElapsedNanos() / 1000);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
4248
4249
4250 static int rocksdb_rollback(handlerton *const hton, THD *const thd,
4251 bool rollback_tx) {
4252 Rdb_transaction *tx = get_tx_from_thd(thd);
4253 Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4254
4255 if (tx != nullptr) {
4256 if (rollback_tx) {
4257 /*
4258 We get here, when
4259 - ROLLBACK statement is issued.
4260
4261 Discard the changes made by the transaction
4262 */
4263 tx->rollback();
4264 } else {
4265 /*
4266 We get here when
4267 - a statement with AUTOCOMMIT=1 is being rolled back (because of some
4268 error)
4269 - a statement inside a transaction is rolled back
4270 */
4271
4272 tx->rollback_stmt();
4273 tx->set_tx_failed(true);
4274 }
4275
4276 if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
4277 // For READ_COMMITTED, we release any existing snapshot so that we will
4278 // see any changes that occurred since the last statement.
4279 tx->release_snapshot();
4280 }
4281 }
4282 return HA_EXIT_SUCCESS;
4283 }
4284
4285 static bool print_stats(THD *const thd, std::string const &type,
4286 std::string const &name, std::string const &status,
4287 stat_print_fn *stat_print) {
4288 return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
4289 status.c_str(), status.size());
4290 }
4291
4292 static std::string format_string(const char *const format, ...) {
4293 std::string res;
4294 va_list args;
4295 va_list args_copy;
4296 char static_buff[256];
4297
4298 DBUG_ASSERT(format != nullptr);
4299
4300 va_start(args, format);
4301 va_copy(args_copy, args);
4302
4303 // Calculate how much space we will need
4304 int len = vsnprintf(nullptr, 0, format, args);
4305 va_end(args);
4306
4307 if (len < 0) {
4308 res = std::string("<format error>");
4309 } else if (len == 0) {
4310 // Shortcut for an empty string
4311 res = std::string("");
4312 } else {
4313 // For short enough output use a static buffer
4314 char *buff = static_buff;
4315 std::unique_ptr<char[]> dynamic_buff = nullptr;
4316
4317 len++; // Add one for null terminator
4318
4319 // for longer output use an allocated buffer
4320 if (static_cast<uint>(len) > sizeof(static_buff)) {
4321 dynamic_buff.reset(new char[len]);
4322 buff = dynamic_buff.get();
4323 }
4324
4325 // Now re-do the vsnprintf with the buffer which is now large enough
4326 (void)vsnprintf(buff, len, format, args_copy);
4327
4328 // Convert to a std::string. Note we could have created a std::string
4329 // large enough and then converted the buffer to a 'char*' and created
4330 // the output in place. This would probably work but feels like a hack.
4331 // Since this isn't code that needs to be super-performant we are going
4332 // with this 'safer' method.
4333 res = std::string(buff);
4334 }
4335
4336 va_end(args_copy);
4337
4338 return res;
4339 }
4340
4341 class Rdb_snapshot_status : public Rdb_tx_list_walker {
4342 private:
4343 std::string m_data;
4344
4345 static std::string current_timestamp(void) {
4346 static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
4347 time_t currtime;
4348 struct tm currtm;
4349
4350 time(&currtime);
4351
4352 localtime_r(&currtime, &currtm);
4353
4354 return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
4355 currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
4356 currtm.tm_sec);
4357 }
4358
4359 static std::string get_header(void) {
4360 return "\n============================================================\n" +
4361 current_timestamp() +
4362 " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4363 "============================================================\n"
4364 "---------\n"
4365 "SNAPSHOTS\n"
4366 "---------\n"
4367 "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
4368 }
4369
4370 static std::string get_footer(void) {
4371 return "-----------------------------------------\n"
4372 "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
4373 "=========================================\n";
4374 }
4375
4376 static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
4377 const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
4378 Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
4379
4380 txn_data.trx_id = txn.m_txn_id;
4381
4382 txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id);
4383 if (txn_data.table_name.empty()) {
4384 txn_data.table_name =
4385 "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4386 }
4387
4388 auto kd = ddl_manager.safe_find(gl_index_id);
4389 txn_data.index_name =
4390 (kd) ? kd->get_name()
4391 : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
4392
4393 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id);
4394 txn_data.cf_name = cfh->GetName();
4395
4396 txn_data.waiting_key =
4397 rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
4398
4399 txn_data.exclusive_lock = txn.m_exclusive;
4400
4401 return txn_data;
4402 }
4403
4404 static Rdb_deadlock_info get_dl_path_trx_info(
4405 const rocksdb::DeadlockPath &path_entry) {
4406 Rdb_deadlock_info deadlock_info;
4407
4408 for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
4409 const auto &txn = *it;
4410 const GL_INDEX_ID gl_index_id = {
4411 txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
4412 txn.m_waiting_key.c_str()))};
4413 deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
4414 }
4415 DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
4416 /* print the first txn in the path to display the full deadlock cycle */
4417 if (!path_entry.path.empty() && !path_entry.limit_exceeded) {
4418 const auto &deadlocking_txn = *(path_entry.path.end() - 1);
4419 deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id;
4420 deadlock_info.deadlock_time = path_entry.deadlock_time;
4421 }
4422 return deadlock_info;
4423 }
4424
4425 public:
4426 Rdb_snapshot_status() : m_data(get_header()) {}
4427
4428 std::string getResult() { return m_data + get_footer(); }
4429
4430 /* Implement Rdb_transaction interface */
4431 /* Create one row in the snapshot status table */
4432 void process_tran(const Rdb_transaction *const tx) override {
4433 DBUG_ASSERT(tx != nullptr);
4434
4435 /* Calculate the duration the snapshot has existed */
4436 int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
4437 if (snapshot_timestamp != 0) {
4438 int64_t curr_time;
4439 rdb->GetEnv()->GetCurrentTime(&curr_time);
4440
4441 char buffer[1024];
4442 #ifdef MARIAROCKS_NOT_YET
4443 thd_security_context(tx->get_thd(), buffer, sizeof buffer, 0);
4444 #endif
4445 m_data += format_string(
4446 "---SNAPSHOT, ACTIVE %lld sec\n"
4447 "%s\n"
4448 "lock count %llu, write count %llu\n"
4449 "insert count %llu, update count %llu, delete count %llu\n",
4450 (longlong)(curr_time - snapshot_timestamp), buffer, tx->get_lock_count(),
4451 tx->get_write_count(), tx->get_insert_count(), tx->get_update_count(),
4452 tx->get_delete_count());
4453 }
4454 }
4455
4456 void populate_deadlock_buffer() {
4457 auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4458 m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
4459
4460 for (const auto &path_entry : dlock_buffer) {
4461 std::string path_data;
4462 if (path_entry.limit_exceeded) {
4463 path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n";
4464 } else {
4465 path_data +=
4466 "\n*** DEADLOCK PATH\n"
4467 "=========================================\n";
4468 const auto dl_info = get_dl_path_trx_info(path_entry);
4469 const auto deadlock_time = dl_info.deadlock_time;
4470 for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) {
4471 const auto &trx_info = *it;
4472 path_data += format_string(
4473 "TIMESTAMP: %" PRId64
4474 "\n"
4475 "TRANSACTION ID: %u\n"
4476 "COLUMN FAMILY NAME: %s\n"
4477 "WAITING KEY: %s\n"
4478 "LOCK TYPE: %s\n"
4479 "INDEX NAME: %s\n"
4480 "TABLE NAME: %s\n",
4481 deadlock_time, trx_info.trx_id, trx_info.cf_name.c_str(),
4482 trx_info.waiting_key.c_str(),
4483 trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED",
4484 trx_info.index_name.c_str(), trx_info.table_name.c_str());
4485 if (it != dl_info.path.end() - 1) {
4486 path_data += "---------------WAITING FOR---------------\n";
4487 }
4488 }
4489 path_data += format_string(
4490 "\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n",
4491 dl_info.victim_trx_id);
4492 }
4493 m_data += path_data;
4494 }
4495 }
4496
4497 std::vector<Rdb_deadlock_info> get_deadlock_info() {
4498 std::vector<Rdb_deadlock_info> deadlock_info;
4499 auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
4500 for (const auto &path_entry : dlock_buffer) {
4501 if (!path_entry.limit_exceeded) {
4502 deadlock_info.push_back(get_dl_path_trx_info(path_entry));
4503 }
4504 }
4505 return deadlock_info;
4506 }
4507 };
4508
4509 /**
4510 * @brief
4511 * walks through all non-replication transactions and copies
4512 * out relevant information for information_schema.rocksdb_trx
4513 */
4514 class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
4515 private:
4516 std::vector<Rdb_trx_info> *m_trx_info;
4517
4518 public:
4519 explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
4520 : m_trx_info(trx_info) {}
4521
4522 void process_tran(const Rdb_transaction *const tx) override {
4523 static const std::map<int, std::string> state_map = {
4524 {rocksdb::Transaction::STARTED, "STARTED"},
4525 {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
4526 {rocksdb::Transaction::PREPARED, "PREPARED"},
4527 {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
4528 {rocksdb::Transaction::COMMITED, "COMMITED"},
4529 {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
4530 {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
4531 };
4532
4533 DBUG_ASSERT(tx != nullptr);
4534
4535 THD *const thd = tx->get_thd();
4536 ulong thread_id = thd_get_thread_id(thd);
4537
4538 if (tx->is_writebatch_trx()) {
4539 const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
4540 DBUG_ASSERT(wb_impl);
4541 m_trx_info->push_back(
4542 {"", /* name */
4543 0, /* trx_id */
4544 wb_impl->get_write_count(), 0, /* lock_count */
4545 0, /* timeout_sec */
4546 "", /* state */
4547 "", /* waiting_key */
4548 0, /* waiting_cf_id */
4549 1, /*is_replication */
4550 1, /* skip_trx_api */
4551 wb_impl->is_tx_read_only(), 0, /* deadlock detection */
4552 wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
4553 } else {
4554 const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
4555 DBUG_ASSERT(tx_impl);
4556 const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
4557
4558 if (rdb_trx == nullptr) {
4559 return;
4560 }
4561
4562 char query_buf[NAME_LEN+1];
4563 thd_query_safe(thd, query_buf, sizeof(query_buf));
4564 std::string query_str(query_buf);
4565
4566 const auto state_it = state_map.find(rdb_trx->GetState());
4567 DBUG_ASSERT(state_it != state_map.end());
4568 const int is_replication = (thd->rgi_slave != nullptr);
4569 uint32_t waiting_cf_id;
4570 std::string waiting_key;
4571 rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key),
4572
4573 m_trx_info->push_back(
4574 {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
4575 tx_impl->get_lock_count(), tx_impl->get_timeout_sec(),
4576 state_it->second, waiting_key, waiting_cf_id, is_replication,
4577 0, /* skip_trx_api */
4578 tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
4579 tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
4580 }
4581 }
4582 };
4583
4584 /*
4585 returns a vector of info for all non-replication threads
4586 for use by information_schema.rocksdb_trx
4587 */
4588 std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
4589 std::vector<Rdb_trx_info> trx_info;
4590 Rdb_trx_info_aggregator trx_info_agg(&trx_info);
4591 Rdb_transaction::walk_tx_list(&trx_info_agg);
4592 return trx_info;
4593 }
4594
4595
4596 /*
4597 returns a vector of info of recent deadlocks
4598 for use by information_schema.rocksdb_deadlock
4599 */
4600 std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() {
4601 Rdb_snapshot_status showStatus;
4602 Rdb_transaction::walk_tx_list(&showStatus);
4603 return showStatus.get_deadlock_info();
4604 }
4605
4606 #ifdef MARIAROCKS_NOT_YET
4607 /* Generate the snapshot status table */
4608 static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd,
4609 stat_print_fn *const stat_print) {
4610 Rdb_snapshot_status showStatus;
4611
4612 Rdb_transaction::walk_tx_list(&showStatus);
4613 showStatus.populate_deadlock_buffer();
4614
4615 /* Send the result data back to MySQL */
4616 return print_stats(thd, "rocksdb", "", showStatus.getResult(), stat_print);
4617 }
4618 #endif
4619
4620 /*
4621 This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc.
4622
4623 For now, produce info about live files (which gives an imprecise idea about
4624 what column families are there).
4625 */
4626 static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
4627 stat_print_fn *const stat_print,
4628 enum ha_stat_type stat_type) {
4629 DBUG_ASSERT(hton != nullptr);
4630 DBUG_ASSERT(thd != nullptr);
4631 DBUG_ASSERT(stat_print != nullptr);
4632
4633 bool res = false;
4634 char buf[100] = {'\0'};
4635
4636 if (stat_type == HA_ENGINE_STATUS) {
4637 DBUG_ASSERT(rdb != nullptr);
4638
4639 std::string str;
4640
4641 /* Global DB Statistics */
4642 if (rocksdb_stats) {
4643 str = rocksdb_stats->ToString();
4644
4645 // Use the same format as internal RocksDB statistics entries to make
4646 // sure that output will look unified.
4647 DBUG_ASSERT(commit_latency_stats != nullptr);
4648
4649 snprintf(buf, sizeof(buf),
4650 "rocksdb.commit_latency statistics "
4651 "Percentiles :=> 50 : %.2f 95 : %.2f "
4652 "99 : %.2f 100 : %.2f\n",
4653 commit_latency_stats->Percentile(50),
4654 commit_latency_stats->Percentile(95),
4655 commit_latency_stats->Percentile(99),
4656 commit_latency_stats->Percentile(100));
4657 str.append(buf);
4658
4659 uint64_t v = 0;
4660
4661 // Retrieve additional stalling related numbers from RocksDB and append
4662 // them to the buffer meant for displaying detailed statistics. The intent
4663 // here is to avoid adding another row to the query output because of
4664 // just two numbers.
4665 //
4666 // NB! We're replacing hyphens with underscores in output to better match
4667 // the existing naming convention.
4668 if (rdb->GetIntProperty("rocksdb.is-write-stopped", &v)) {
4669 snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %llu\n", (ulonglong)v);
4670 str.append(buf);
4671 }
4672
4673 if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) {
4674 snprintf(buf, sizeof(buf),
4675 "COUNT : %llu\n",
4676 (ulonglong)v);
4677 str.append(buf);
4678 }
4679
4680 res |= print_stats(thd, "STATISTICS", "rocksdb", str, stat_print);
4681 }
4682
4683 /* Per DB stats */
4684 if (rdb->GetProperty("rocksdb.dbstats", &str)) {
4685 res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
4686 }
4687
4688 /* Per column family stats */
4689 for (const auto &cf_name : cf_manager.get_cf_names()) {
4690 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
4691 if (cfh == nullptr) {
4692 continue;
4693 }
4694
4695 if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str)) {
4696 continue;
4697 }
4698
4699 res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
4700 }
4701
4702 /* Memory Statistics */
4703 std::vector<rocksdb::DB *> dbs;
4704 std::unordered_set<const rocksdb::Cache *> cache_set;
4705 size_t internal_cache_count = 0;
4706 size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
4707
4708 dbs.push_back(rdb);
4709 cache_set.insert(rocksdb_tbl_options->block_cache.get());
4710
4711 for (const auto &cf_handle : cf_manager.get_all_cf()) {
4712 rocksdb::ColumnFamilyDescriptor cf_desc;
4713 cf_handle->GetDescriptor(&cf_desc);
4714 auto *const table_factory = cf_desc.options.table_factory.get();
4715
4716 if (table_factory != nullptr) {
4717 std::string tf_name = table_factory->Name();
4718
4719 if (tf_name.find("BlockBasedTable") != std::string::npos) {
4720 const rocksdb::BlockBasedTableOptions *const bbt_opt =
4721 reinterpret_cast<rocksdb::BlockBasedTableOptions *>(
4722 table_factory->GetOptions());
4723
4724 if (bbt_opt != nullptr) {
4725 if (bbt_opt->block_cache.get() != nullptr) {
4726 cache_set.insert(bbt_opt->block_cache.get());
4727 } else {
4728 internal_cache_count++;
4729 }
4730 cache_set.insert(bbt_opt->block_cache_compressed.get());
4731 }
4732 }
4733 }
4734 }
4735
4736 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
4737 str.clear();
4738 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
4739 &temp_usage_by_type);
4740 snprintf(buf, sizeof(buf), "\nMemTable Total: %llu",
4741 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
4742 str.append(buf);
4743 snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %llu",
4744 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
4745 str.append(buf);
4746 snprintf(buf, sizeof(buf), "\nTable Readers Total: %llu",
4747 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
4748 str.append(buf);
4749 snprintf(buf, sizeof(buf), "\nCache Total: %llu",
4750 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
4751 str.append(buf);
4752 snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %llu",
4753 (ulonglong)internal_cache_count * kDefaultInternalCacheSize);
4754 str.append(buf);
4755 res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print);
4756
4757 /* Show the background thread status */
4758 std::vector<rocksdb::ThreadStatus> thread_list;
4759 rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list);
4760
4761 if (!s.ok()) {
4762 // NO_LINT_DEBUG
4763 sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n",
4764 s.ToString().c_str());
4765 res |= true;
4766 } else {
4767 /* For each background thread retrieved, print out its information */
4768 for (auto &it : thread_list) {
4769 /* Only look at background threads. Ignore user threads, if any. */
4770 if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) {
4771 continue;
4772 }
4773
4774 str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) +
4775 "\ncf_name: " + it.cf_name +
4776 "\noperation_type: " + it.GetOperationName(it.operation_type) +
4777 "\noperation_stage: " +
4778 it.GetOperationStageName(it.operation_stage) +
4779 "\nelapsed_time_ms: " + it.MicrosToString(it.op_elapsed_micros);
4780
4781 for (auto &it_props : it.InterpretOperationProperties(
4782 it.operation_type, it.op_properties)) {
4783 str += "\n" + it_props.first + ": " + std::to_string(it_props.second);
4784 }
4785
4786 str += "\nstate_type: " + it.GetStateName(it.state_type);
4787
4788 res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), str,
4789 stat_print);
4790 }
4791 }
4792
4793 #ifdef MARIAROCKS_NOT_YET
4794 /* Explicit snapshot information */
4795 str = Rdb_explicit_snapshot::dump_snapshots();
4796 #endif
4797
4798 if (!str.empty()) {
4799 res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print);
4800 }
4801 #ifdef MARIAROCKS_NOT_YET
4802 } else if (stat_type == HA_ENGINE_TRX) {
4803 /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */
4804 res |= rocksdb_show_snapshot_status(hton, thd, stat_print);
4805 #endif
4806 }
4807 return res;
4808 }
4809
4810 static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
4811 Rdb_transaction *const tx) {
4812 DBUG_ASSERT(tx != nullptr);
4813
4814 trans_register_ha(thd, FALSE, rocksdb_hton, 0);
4815 if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
4816 tx->start_stmt();
4817 trans_register_ha(thd, TRUE, rocksdb_hton, 0);
4818 }
4819 }
4820
/* RocksDB tables have no auxiliary per-table files, so the handler's file
   extension list is empty (just the NullS terminator). */
static const char *ha_rocksdb_exts[] = {NullS};
4822
4823 #ifdef MARIAROCKS_NOT_YET
/*
  Create, attach, or release an explicit snapshot for this session
  (Facebook extension; currently compiled out in MariaRocks).
  Returns true on failure, false on success.
*/
static bool rocksdb_explicit_snapshot(
    handlerton *const /* hton */, /*!< in: RocksDB handlerton */
    THD *const thd, /*!< in: MySQL thread handle */
    snapshot_info_st *ss_info) /*!< out: Snapshot information */
{
  switch (ss_info->op) {
    case snapshot_operation::SNAPSHOT_CREATE: {
      /*
        Lock binlog commits while the snapshot is taken, so the binlog
        position recorded in ss_info is consistent with the snapshot.
      */
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_lock_commits(ss_info);
      }
      auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot());
      if (mysql_bin_log_is_open()) {
        mysql_bin_log_unlock_commits(ss_info);
      }

      thd->set_explicit_snapshot(s);
      return s == nullptr;  /* failure if creation returned no snapshot */
    }
    case snapshot_operation::SNAPSHOT_ATTACH: {
      /* Attach the session to an existing snapshot, looked up by id. */
      auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
      if (!s) {
        return true;
      }
      *ss_info = s->ss_info;
      thd->set_explicit_snapshot(s);
      return false;
    }
    case snapshot_operation::SNAPSHOT_RELEASE: {
      /* Detach the session's snapshot; fails if none is attached. */
      if (!thd->get_explicit_snapshot()) {
        return true;
      }
      *ss_info = thd->get_explicit_snapshot()->ss_info;
      thd->set_explicit_snapshot(nullptr);
      return false;
    }
    default:
      DBUG_ASSERT(false);
      return true;
  }
  return true;
}
4865 #endif
4866
4867 /*
4868 Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT
4869
4870 Features:
4871 1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
4872 2. Getting current binlog position in addition to #1.
4873
4874 The second feature is done by START TRANSACTION WITH
4875 CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and
4876 it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT.
4877
4878 - When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT
4879 takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
4880 participate in transaction. When executing COMMIT, both InnoDB and
4881 RocksDB modifications are committed. Remember that XA is not supported yet,
4882 so mixing engines is not recommended anyway.
4883
4884 - When setting engine, START TRANSACTION WITH CONSISTENT.. takes
4885 snapshot for the specified engine only. But it starts both
4886 InnoDB and RocksDB transactions.
4887 */
static int rocksdb_start_tx_and_assign_read_view(
    handlerton *const hton, /*!< in: RocksDB handlerton */
    THD *const thd /*!< in: MySQL thread handle of the
                   user for whom the transaction should
                   be committed */
)
#ifdef MARIAROCKS_NOT_YET
    snapshot_info_st *ss_info) /*!< in/out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  /* A consistent snapshot is only defined under REPEATABLE READ. */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);

  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

#ifdef MARIADB_NOT_YET
  if (ss_info) {
    if (mysql_bin_log_is_open()) {
      mysql_bin_log_lock_commits(ss_info);
    } else {
      return HA_EXIT_FAILURE;
    }
#endif

  /*
    MariaDB: there is no need to call mysql_bin_log_lock_commits and then
    unlock back.
    SQL layer calls start_consistent_snapshot() for all engines, including the
    binlog under LOCK_commit_ordered mutex.

    The mutex prevents binlog commits from happening (right?) while the storage
    engine(s) allocate read snapshots. That way, each storage engine is
    synchronized with current binlog position.
  */
  mysql_mutex_assert_owner(&LOCK_commit_ordered);

  Rdb_transaction *const tx = get_or_create_tx(thd);
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  /* Mark the transaction read-only and take the snapshot immediately. */
  DBUG_ASSERT(!tx->has_snapshot());
  tx->set_tx_read_only(true);
  rocksdb_register_tx(hton, thd, tx);
  tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
  if (ss_info) {
    mysql_bin_log_unlock_commits(ss_info);
  }
#endif
  return HA_EXIT_SUCCESS;
}
4942
4943 #ifdef MARIADB_NOT_YET
/*
  Start a transaction sharing an explicit snapshot (create or attach).
  The whole function is currently compiled out (MARIADB_NOT_YET at the
  enclosing #ifdef); the nested #ifdefs guard the FB-extension pieces.
*/
static int rocksdb_start_tx_with_shared_read_view(
    handlerton *const hton, /*!< in: RocksDB handlerton */
    THD *const thd) /*!< in: MySQL thread handle of the
                    user for whom the transaction should
                    be committed */
#ifdef MARIADB_NOT_YET
    snapshot_info_st *ss_info) /*!< out: Snapshot info like binlog file, pos,
                               gtid executed and snapshot ID */
#endif
{
  DBUG_ASSERT(thd != nullptr);

  int error = HA_EXIT_SUCCESS;

  /* Shared read views are only defined under REPEATABLE READ. */
  ulong const tx_isolation = my_core::thd_tx_isolation(thd);
  if (tx_isolation != ISO_REPEATABLE_READ) {
    my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
    return HA_EXIT_FAILURE;
  }

  Rdb_transaction *tx = nullptr;
#ifdef MARIADB_NOT_YET
  std::shared_ptr<Rdb_explicit_snapshot> explicit_snapshot;
  const auto op = ss_info->op;

  DBUG_ASSERT(op == snapshot_operation::SNAPSHOT_CREATE ||
              op == snapshot_operation::SNAPSHOT_ATTACH);

  // case: if binlogs are available get binlog file/pos and gtid info
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_lock_commits(ss_info);
  }

  if (op == snapshot_operation::SNAPSHOT_ATTACH) {
    explicit_snapshot = Rdb_explicit_snapshot::get(ss_info->snapshot_id);
    if (!explicit_snapshot) {
      my_printf_error(ER_UNKNOWN_ERROR, "Snapshot %llu does not exist", MYF(0),
                      ss_info->snapshot_id);
      error = HA_EXIT_FAILURE;
    }
  }
#endif

  // case: all good till now
  if (error == HA_EXIT_SUCCESS) {
    tx = get_or_create_tx(thd);
    Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

#ifdef MARIADB_NOT_YET
    if (explicit_snapshot) {
      tx->m_explicit_snapshot = explicit_snapshot;
    }
#endif

    /* Mark the transaction read-only and take the snapshot immediately. */
    DBUG_ASSERT(!tx->has_snapshot());
    tx->set_tx_read_only(true);
    rocksdb_register_tx(hton, thd, tx);
    tx->acquire_snapshot(true);

#ifdef MARIADB_NOT_YET
    // case: an explicit snapshot was not assigned to this transaction
    if (!tx->m_explicit_snapshot) {
      tx->m_explicit_snapshot =
          Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot);
      if (!tx->m_explicit_snapshot) {
        my_printf_error(ER_UNKNOWN_ERROR, "Could not create snapshot", MYF(0));
        error = HA_EXIT_FAILURE;
      }
    }
#endif
  }

#ifdef MARIADB_NOT_YET
  // case: unlock the binlog
  if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) {
    mysql_bin_log_unlock_commits(ss_info);
  }

  DBUG_ASSERT(error == HA_EXIT_FAILURE || tx->m_explicit_snapshot);

  // copy over the snapshot details to pass to the upper layers
  if (tx->m_explicit_snapshot) {
    *ss_info = tx->m_explicit_snapshot->ss_info;
    ss_info->op = op;
  }
#endif

  return error;
}
5033 #endif
5034
/* Dummy SAVEPOINT support. This is needed for long running transactions
 * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
 * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
 * errors. This needs to be addressed in future versions (Issue#96).
 */
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
                             void *const savepoint) {
  /* Intentionally a no-op; see the "Dummy SAVEPOINT" note above. */
  return HA_EXIT_SUCCESS;
}
5044
/* Roll the session's transaction back to a savepoint (dummy SAVEPOINT
   scheme — see the note above rocksdb_savepoint()). */
static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
                                         void *const savepoint) {
  /* NOTE(review): tx is not null-checked here, unlike rocksdb_commit()/
     rocksdb_rollback(); presumably the server only calls this when a
     transaction exists — confirm. */
  Rdb_transaction *tx = get_tx_from_thd(thd);
  return tx->rollback_to_savepoint(savepoint);
}
5050
/* Always permit the server to release metadata locks after a rollback to
   savepoint. */
static bool rocksdb_rollback_to_savepoint_can_release_mdl(
    handlerton *const /* hton */, THD *const /* thd */) {
  return true;
}
5055
5056 #ifdef MARIAROCKS_NOT_YET
5057 /*
5058 This is called for INFORMATION_SCHEMA
5059 */
/*
  Report per-table I/O and lock statistics through the server-supplied
  callback, one invocation per open RocksDB table.
  (Currently compiled out under MARIAROCKS_NOT_YET.)
*/
static void rocksdb_update_table_stats(
    /* per-table stats callback */
    void (*cb)(const char *db, const char *tbl, bool is_partition,
               my_io_perf_t *r, my_io_perf_t *w, my_io_perf_t *r_blob,
               my_io_perf_t *r_primary, my_io_perf_t *r_secondary,
               page_stats_t *page_stats, comp_stats_t *comp_stats,
               int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock,
               const char *engine)) {
  my_io_perf_t io_perf_read;
  my_io_perf_t io_perf_write;
  my_io_perf_t io_perf;
  page_stats_t page_stats;
  comp_stats_t comp_stats;
  uint lock_wait_timeout_stats;
  uint deadlock_stats;
  uint lock_wait_stats;
  std::vector<std::string> tablenames;

  /*
    Most of these are for innodb, so setting them to 0.
    TODO: possibly separate out primary vs. secondary index reads
    NOTE(review): io_perf_read is not zeroed here; this appears to rely on
    every my_io_perf_t field being assigned in the loop below — confirm.
  */
  memset(&io_perf, 0, sizeof(io_perf));
  memset(&page_stats, 0, sizeof(page_stats));
  memset(&comp_stats, 0, sizeof(comp_stats));
  memset(&io_perf_write, 0, sizeof(io_perf_write));

  tablenames = rdb_open_tables.get_table_names();

  /* For each open table: normalize its name, pull per-table counters from
     the table handler, and invoke the callback. */
  for (const auto &it : tablenames) {
    Rdb_table_handler *table_handler;
    std::string str, dbname, tablename, partname;
    char dbname_sys[NAME_LEN + 1];
    char tablename_sys[NAME_LEN + 1];
    bool is_partition;

    if (rdb_normalize_tablename(it, &str) != HA_EXIT_SUCCESS) {
      /* Function needs to return void because of the interface and we've
       * detected an error which shouldn't happen. There's no way to let
       * caller know that something failed.
       */
      SHIP_ASSERT(false);
      return;
    }

    if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname)) {
      continue;
    }

    is_partition = (partname.size() != 0);

    table_handler = rdb_open_tables.get_table_handler(it.c_str());
    if (table_handler == nullptr) {
      continue;
    }

    io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load();
    io_perf_read.requests = table_handler->m_io_perf_read.requests.load();
    io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load();
    io_perf_write.requests = table_handler->m_io_perf_write.requests.load();
    lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load();
    deadlock_stats = table_handler->m_deadlock_counter.load();
    lock_wait_stats =
        table_handler->m_table_perf_context.m_value[PC_KEY_LOCK_WAIT_COUNT]
            .load();

    /*
      Convert from rocksdb timer to mysql timer. RocksDB values are
      in nanoseconds, but table statistics expect the value to be
      in my_timer format.
    */
    io_perf_read.svc_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time.load() / 1000);
    io_perf_read.svc_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time_max.load() / 1000);
    io_perf_read.wait_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time.load() / 1000);
    io_perf_read.wait_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time_max.load() / 1000);
    io_perf_read.slow_ios = table_handler->m_io_perf_read.slow_ios.load();
    rdb_open_tables.release_table_handler(table_handler);

    /*
      Table stats expects our database and table name to be in system encoding,
      not filename format. Convert before calling callback.
    */
    my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
                                   sizeof(dbname_sys));
    my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
                                   sizeof(tablename_sys));
    (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read,
          &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats,
          &comp_stats, lock_wait_stats, lock_wait_timeout_stats, deadlock_stats,
          rocksdb_hton_name);
  }
}
5156 #endif
5157 static rocksdb::Status check_rocksdb_options_compatibility(
5158 const char *const dbpath, const rocksdb::Options &main_opts,
5159 const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
5160 DBUG_ASSERT(rocksdb_datadir != nullptr);
5161
5162 rocksdb::DBOptions loaded_db_opt;
5163 std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
5164 rocksdb::Status status =
5165 LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt,
5166 &loaded_cf_descs, rocksdb_ignore_unknown_options);
5167
5168 // If we're starting from scratch and there are no options saved yet then this
5169 // is a valid case. Therefore we can't compare the current set of options to
5170 // anything.
5171 if (status.IsNotFound()) {
5172 return rocksdb::Status::OK();
5173 }
5174
5175 if (!status.ok()) {
5176 return status;
5177 }
5178
5179 if (loaded_cf_descs.size() != cf_descr.size()) {
5180 return rocksdb::Status::NotSupported(
5181 "Mismatched size of column family "
5182 "descriptors.");
5183 }
5184
5185 // Please see RocksDB documentation for more context about why we need to set
5186 // user-defined functions and pointer-typed options manually.
5187 for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
5188 loaded_cf_descs[i].options.compaction_filter =
5189 cf_descr[i].options.compaction_filter;
5190 loaded_cf_descs[i].options.compaction_filter_factory =
5191 cf_descr[i].options.compaction_filter_factory;
5192 loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
5193 loaded_cf_descs[i].options.memtable_factory =
5194 cf_descr[i].options.memtable_factory;
5195 loaded_cf_descs[i].options.merge_operator =
5196 cf_descr[i].options.merge_operator;
5197 loaded_cf_descs[i].options.prefix_extractor =
5198 cf_descr[i].options.prefix_extractor;
5199 loaded_cf_descs[i].options.table_factory =
5200 cf_descr[i].options.table_factory;
5201 }
5202
5203 // This is the essence of the function - determine if it's safe to open the
5204 // database or not.
5205 status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
5206 loaded_cf_descs,
5207 rocksdb_ignore_unknown_options);
5208
5209 return status;
5210 }
5211
// Set to true once the plugin has been unloaded (see rocksdb_done_func());
// re-loading MyRocks within the same mysqld process is not supported and is
// rejected by rocksdb_init_func().
bool prevent_myrocks_loading= false;
5213
5214
5215 /*
5216 Storage Engine initialization function, invoked when plugin is loaded.
5217 */
5218
/*
  @param p  the handlerton allocated by the server for this engine
  @return HA_EXIT_SUCCESS on success; HA_EXIT_FAILURE (or 1) on error,
          in which case the plugin fails to load.
*/
static int rocksdb_init_func(void *const p) {

  DBUG_ENTER_FUNC();

  // Re-loading after an unload is not supported; the flag is set in
  // rocksdb_done_func().
  if (prevent_myrocks_loading)
  {
    my_error(ER_INTERNAL_ERROR, MYF(0),
             "Loading MyRocks plugin after it has been unloaded is not "
             "supported. Please restart mysqld");
    DBUG_RETURN(1);
  }

  if (rocksdb_ignore_datadic_errors)
  {
    sql_print_information(
        "CAUTION: Running with rocksdb_ignore_datadic_errors=1. "
        " This should only be used to perform repairs");
  }

  // A corruption marker file persisted by a previous run means we must not
  // start unless the user explicitly allowed it.
  if (rdb_check_rocksdb_corruption()) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: There was a corruption detected in RockDB files. "
        "Check error log emitted earlier for more details.");
    if (rocksdb_allow_to_start_after_corruption) {
      // NO_LINT_DEBUG
      sql_print_information(
          "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent "
          "server operating if RocksDB corruption is detected.");
    } else {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: The server will exit normally and stop restart "
          "attempts. Remove %s file from data directory and "
          "start mysqld manually.",
          rdb_corruption_marker_file_name().c_str());
      exit(0);
    }
  }

  // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
  static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");

  init_rocksdb_psi_keys();

  rocksdb_hton = (handlerton *)p;

  rdb_open_tables.init();
  // Ensure the open-tables map is freed on any early error return below;
  // skipped at the end on success.
  Ensure_cleanup rdb_open_tables_cleanup([]() { rdb_open_tables.free(); });

#ifdef HAVE_PSI_INTERFACE
  rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
  rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
                           rdb_signal_drop_idx_psi_cond_key);
  rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key);
#else
  rdb_bg_thread.init();
  rdb_drop_idx_thread.init();
  rdb_mc_thread.init();
#endif
  mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
                   MY_MUTEX_INIT_FAST);
  mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
                   MY_MUTEX_INIT_FAST);

  // Hide the RocksDB datadir from database discovery; strip a leading "./"
  // to match how ignore_db_dirs stores relative paths.
  const char* initial_rocksdb_datadir_for_ignore_dirs= rocksdb_datadir;
  if (!strncmp(rocksdb_datadir, "./", 2))
    initial_rocksdb_datadir_for_ignore_dirs += 2;
  ignore_db_dirs_append(initial_rocksdb_datadir_for_ignore_dirs);

#if defined(HAVE_PSI_INTERFACE)
  rdb_collation_exceptions =
      new Regex_list_handler(key_rwlock_collation_exception_list);
#else
  rdb_collation_exceptions = new Regex_list_handler();
#endif

  mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
                   MY_MUTEX_INIT_FAST);
  mysql_mutex_init(rdb_block_cache_resize_mutex_key,
                   &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST);
  Rdb_transaction::init_mutex();

  // Wire up the handlerton entry points the server will call.
  rocksdb_hton->create = rocksdb_create_handler;
  rocksdb_hton->close_connection = rocksdb_close_connection;

  rocksdb_hton->prepare = rocksdb_prepare;
  rocksdb_hton->prepare_ordered = NULL; // Do not need it

  rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
  rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
  rocksdb_hton->recover = rocksdb_recover;

  rocksdb_hton->commit_ordered= rocksdb_commit_ordered;
  rocksdb_hton->commit = rocksdb_commit;

  rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request;

  rocksdb_hton->rollback = rocksdb_rollback;
  rocksdb_hton->show_status = rocksdb_show_status;
#ifdef MARIADB_NOT_YET
  rocksdb_hton->explicit_snapshot = rocksdb_explicit_snapshot;
#endif
  rocksdb_hton->start_consistent_snapshot =
      rocksdb_start_tx_and_assign_read_view;
#ifdef MARIADB_NOT_YET
  rocksdb_hton->start_shared_snapshot = rocksdb_start_tx_with_shared_read_view;
#endif
  rocksdb_hton->savepoint_set = rocksdb_savepoint;
  rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
  rocksdb_hton->savepoint_rollback_can_release_mdl =
      rocksdb_rollback_to_savepoint_can_release_mdl;
#ifdef MARIAROCKS_NOT_YET
  rocksdb_hton->update_table_stats = rocksdb_update_table_stats;
#endif // MARIAROCKS_NOT_YET

  /*
    Not needed in MariaDB:
    rocksdb_hton->flush_logs = rocksdb_flush_wal;
    rocksdb_hton->handle_single_table_select = rocksdb_handle_single_table_select;

  */

  rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
                        HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;

  rocksdb_hton->tablefile_extensions= ha_rocksdb_exts;
  DBUG_ASSERT(!mysqld_embedded);

  // Keep RocksDB's open-file budget within the server-wide limit;
  // max_open_files == -2 means "derive from open_files_limit".
  if (rocksdb_db_options->max_open_files > (long)open_files_limit) {
    // NO_LINT_DEBUG
    sql_print_information(
        "RocksDB: rocksdb_max_open_files should not be "
        "greater than the open_files_limit, effective value "
        "of rocksdb_max_open_files is being set to "
        "open_files_limit / 2.");
    rocksdb_db_options->max_open_files = open_files_limit / 2;
  } else if (rocksdb_db_options->max_open_files == -2) {
    rocksdb_db_options->max_open_files = open_files_limit / 2;
  }

#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
  rdb_read_free_regex_handler.set_patterns(DEFAULT_READ_FREE_RPL_TABLES);
#endif

  rocksdb_stats = rocksdb::CreateDBStatistics();
  rocksdb_stats->set_stats_level(
      static_cast<rocksdb::StatsLevel>(rocksdb_stats_level));
  // Reflect the level RocksDB actually accepted back into the sysvar.
  rocksdb_stats_level = rocksdb_stats->get_stats_level();
  rocksdb_db_options->statistics = rocksdb_stats;

  if (rocksdb_rate_limiter_bytes_per_sec != 0) {
    rocksdb_rate_limiter.reset(
        rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
    rocksdb_db_options->rate_limiter = rocksdb_rate_limiter;
  }

  rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate;

  // Route RocksDB's info log through the MyRocks logger; if RocksDB's own
  // logger could be created, chain it behind ours.
  std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
  rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
      rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log);
  if (s.ok()) {
    myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log);
  }

  rocksdb_db_options->info_log = myrocks_logger;
  myrocks_logger->SetInfoLogLevel(
      static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  rocksdb_db_options->wal_dir = rocksdb_wal_dir;

  rocksdb_db_options->wal_recovery_mode =
      static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);

  rocksdb_db_options->access_hint_on_compaction_start =
      static_cast<rocksdb::Options::AccessHint>(
          rocksdb_access_hint_on_compaction_start);

  if (rocksdb_db_options->allow_mmap_reads &&
      rocksdb_db_options->use_direct_reads) {
    // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
    // mmap_reads and direct_reads are both on. (NO_LINT_DEBUG)
    sql_print_error(
        "RocksDB: Can't enable both use_direct_reads "
        "and allow_mmap_reads\n");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT
  if (rocksdb_db_options->use_direct_reads ||
      rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
    rocksdb::EnvOptions soptions;
    rocksdb::Status check_status;
    rocksdb::Env *const env = rocksdb_db_options->env;

    // Probe with a scratch file: open it with O_DIRECT reads if it already
    // exists, otherwise create it with O_DIRECT writes and clean it up.
    std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir);
    if (env->FileExists(fname).ok()) {
      std::unique_ptr<rocksdb::SequentialFile> file;
      soptions.use_direct_reads = true;
      check_status = env->NewSequentialFile(fname, &file, soptions);
    } else {
      std::unique_ptr<rocksdb::WritableFile> file;
      soptions.use_direct_writes = true;
      check_status = env->ReopenWritableFile(fname, &file, soptions);
      if (file != nullptr) {
        file->Close();
      }
      env->DeleteFile(fname);
    }

    if (!check_status.ok()) {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: Unable to use direct io in rocksdb-datadir:"
          "(%s)",
          check_status.getState());
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  if (rocksdb_db_options->allow_mmap_writes &&
      rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
    // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
    sql_print_error(
        "RocksDB: Can't enable both "
        "use_direct_io_for_flush_and_compaction and "
        "allow_mmap_writes\n");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (rocksdb_db_options->allow_mmap_writes &&
      rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 "
        "to use allow_mmap_writes");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // sst_file_manager will move deleted rocksdb sst files to trash_dir
  // to be deleted in a background thread.
  std::string trash_dir = std::string(rocksdb_datadir) + "/trash";
  rocksdb_db_options->sst_file_manager.reset(NewSstFileManager(
      rocksdb_db_options->env, myrocks_logger, trash_dir,
      rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */));

  std::vector<std::string> cf_names;
  rocksdb::Status status;
  status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir,
                                           &cf_names);
  if (!status.ok()) {
    /*
      When we start on an empty datadir, ListColumnFamilies returns IOError,
      and RocksDB doesn't provide any way to check what kind of error it was.
      Checking system errno happens to work right now.
    */
    if (status.IsIOError()
#ifndef _WIN32
        && errno == ENOENT
#endif
    ) {
      sql_print_information("RocksDB: Got ENOENT when listing column families");

      // NO_LINT_DEBUG
      sql_print_information(
          "RocksDB: assuming that we're creating a new database");
    } else {
      rdb_log_status_error(status, "Error listing column families");
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  } else {
    // NO_LINT_DEBUG
    sql_print_information("RocksDB: %ld column families found",
                          cf_names.size());
  }

  std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
  std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;

  rocksdb_tbl_options->index_type =
      (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;

  if (!rocksdb_tbl_options->no_block_cache) {
    std::shared_ptr<rocksdb::MemoryAllocator> memory_allocator;
    if (!rocksdb_cache_dump) {
      // rocksdb_cache_dump=0: back the block cache with a jemalloc "nodump"
      // allocator so its memory is excluded from core dumps.
      size_t block_size = rocksdb_tbl_options->block_size;
      rocksdb::JemallocAllocatorOptions alloc_opt;
      // Limit jemalloc tcache memory usage. The range
      // [block_size/4, block_size] should be enough to cover most of
      // block cache allocation sizes.
      alloc_opt.limit_tcache_size = true;
      alloc_opt.tcache_size_lower_bound = block_size / 4;
      alloc_opt.tcache_size_upper_bound = block_size;
      rocksdb::Status new_alloc_status =
          rocksdb::NewJemallocNodumpAllocator(alloc_opt, &memory_allocator);
      if (!new_alloc_status.ok()) {
        // Fallback to use default malloc/free.
        rdb_log_status_error(new_alloc_status,
                             "Error excluding block cache from core dump");
        memory_allocator = nullptr;
        DBUG_RETURN(HA_EXIT_FAILURE);
      }
    }
    std::shared_ptr<rocksdb::Cache> block_cache =
        rocksdb_use_clock_cache
            ? rocksdb::NewClockCache(rocksdb_block_cache_size)
            : rocksdb::NewLRUCache(
                  rocksdb_block_cache_size, -1 /*num_shard_bits*/,
                  false /*strict_capacity_limit*/,
                  rocksdb_cache_high_pri_pool_ratio, memory_allocator);
    if (rocksdb_sim_cache_size > 0) {
      // Simulated cache enabled
      // Wrap block cache inside a simulated cache and pass it to RocksDB
      rocksdb_tbl_options->block_cache =
          rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6);
    } else {
      // Pass block cache to RocksDB
      rocksdb_tbl_options->block_cache = block_cache;
    }
  }
  // Using newer BlockBasedTable format version for better compression
  // and better memory allocation.
  // See:
  // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
  rocksdb_tbl_options->format_version = 2;

  if (rocksdb_collect_sst_properties) {
    properties_collector_factory =
        std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);

    rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);

    RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

    DBUG_ASSERT(rocksdb_table_stats_sampling_pct <=
                RDB_TBL_STATS_SAMPLE_PCT_MAX);
    properties_collector_factory->SetTableStatsSamplingPct(
        rocksdb_table_stats_sampling_pct);

    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
  }

  if (rocksdb_persistent_cache_size_mb > 0) {
    std::shared_ptr<rocksdb::PersistentCache> pcache;
    uint64_t cache_size_bytes = rocksdb_persistent_cache_size_mb * 1024 * 1024;
    status = rocksdb::NewPersistentCache(
        rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
        cache_size_bytes, myrocks_logger, true, &pcache);
    if (!status.ok()) {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Persistent cache returned error: (%s)",
                      status.getState());
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
    rocksdb_tbl_options->persistent_cache = pcache;
  } else if (strlen(rocksdb_persistent_cache_path)) {
    // A persistent-cache path without a size is a configuration error.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options());
  if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory,
                            rocksdb_default_cf_options,
                            rocksdb_override_cf_options)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize CF options map.");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  /*
    If there are no column families, we're creating the new database.
    Create one column family named "default".
  */
  if (cf_names.size() == 0) cf_names.push_back(DEFAULT_CF_NAME);

  std::vector<int> compaction_enabled_cf_indices;

  // NO_LINT_DEBUG
  sql_print_information("RocksDB: Column Families at start:");
  for (size_t i = 0; i < cf_names.size(); ++i) {
    rocksdb::ColumnFamilyOptions opts;
    cf_options_map->get_cf_options(cf_names[i], &opts);

    // NO_LINT_DEBUG
    sql_print_information(" cf=%s", cf_names[i].c_str());

    // NO_LINT_DEBUG
    sql_print_information(" write_buffer_size=%ld", opts.write_buffer_size);

    // NO_LINT_DEBUG
    sql_print_information(" target_file_size_base=%" PRIu64,
                          opts.target_file_size_base);

    /*
      Temporarily disable compactions to prevent a race condition where
      compaction starts before compaction filter is ready.
    */
    if (!opts.disable_auto_compactions) {
      compaction_enabled_cf_indices.push_back(i);
      opts.disable_auto_compactions = true;
    }
    cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
  }

  rocksdb::Options main_opts(*rocksdb_db_options,
                             cf_options_map->get_defaults());

  rocksdb::TransactionDBOptions tx_db_options;
  tx_db_options.transaction_lock_timeout = 2000; // 2 seconds
  tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
  tx_db_options.write_policy =
      static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);

  status =
      check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);

  // We won't start if we'll determine that there's a chance of data corruption
  // because of incompatible options.
  if (!status.ok()) {
    rdb_log_status_error(
        status, "Compatibility check against existing database options failed");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  status = rocksdb::TransactionDB::Open(
      main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);

  if (!status.ok()) {
    rdb_log_status_error(status, "Error opening instance");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }
  cf_manager.init(std::move(cf_options_map), &cf_handles);

  if (dict_manager.init(rdb, &cf_manager)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize data dictionary.");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (binlog_manager.init(&dict_manager)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize binlog manager.");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize DDL manager.");

    // With rocksdb_ignore_datadic_errors=1 we soldier on for repair
    // purposes; otherwise a broken data dictionary is fatal.
    if (rocksdb_ignore_datadic_errors)
    {
      sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
                      "trying to continue");
    }
    else
      DBUG_RETURN(HA_EXIT_FAILURE);
  }

  Rdb_sst_info::init(rdb);

  /*
    Enable auto compaction, things needed for compaction filter are finished
    initializing
  */
  std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
  for (const auto &index : compaction_enabled_cf_indices) {
    compaction_enabled_cf_handles.push_back(cf_handles[index]);
  }

  status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);

  if (!status.ok()) {
    rdb_log_status_error(status, "Error enabling compaction");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

#ifndef HAVE_PSI_INTERFACE
  auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME);
#else
  auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME,
                                         rdb_background_psi_thread_key);
#endif
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
                    err);
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

#ifndef HAVE_PSI_INTERFACE
  err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME);
#else
  err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME,
                                          rdb_drop_idx_psi_thread_key);
#endif
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
                    err);
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME
#ifdef HAVE_PSI_INTERFACE
                                    ,
                                    rdb_mc_psi_thread_key
#endif
  );
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't start the manual compaction thread: (errno=%d)",
        err);
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);

  if (rocksdb_pause_background_work) {
    rdb->PauseBackgroundWork();
  }

  // NO_LINT_DEBUG
  sql_print_information("RocksDB: global statistics using %s indexer",
                        STRINGIFY_ARG(RDB_INDEXER));
#if defined(HAVE_SCHED_GETCPU)
  if (sched_getcpu() == -1) {
    // NO_LINT_DEBUG
    sql_print_information(
        "RocksDB: sched_getcpu() failed - "
        "global statistics will use thread_id_indexer_t instead");
  }
#endif

  err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST,
                          HA_ERR_ROCKSDB_LAST);
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't initialize error messages");
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  // Creating an instance of HistogramImpl should only happen after RocksDB
  // has been successfully initialized.
  commit_latency_stats = new rocksdb::HistogramImpl();

  // Construct a list of directories which will be monitored by I/O watchdog
  // to make sure that we won't lose write access to them.
  std::vector<std::string> directories;

  // 1. Data directory.
  directories.push_back(mysql_real_data_home);

  // 2. Transaction logs.
  if (myrocks::rocksdb_wal_dir && *myrocks::rocksdb_wal_dir) {
    directories.push_back(myrocks::rocksdb_wal_dir);
  }

#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog = new Rdb_io_watchdog(std::move(directories));
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif

  // NO_LINT_DEBUG
  sql_print_information(
      "MyRocks storage engine plugin has been successfully "
      "initialized.");

  // Skip cleaning up rdb_open_tables as we've succeeded
  rdb_open_tables_cleanup.skip();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
5796
5797 /*
5798 Storage Engine deinitialization function, invoked when plugin is unloaded.
5799 */
5800
/*
  @param p  unused handlerton pointer supplied by the plugin framework
  @return 0 on success, 1 if tables were still open at unload time
*/
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  int error = 0;

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
  }

  // signal the manual compaction thread to stop
  rdb_mc_thread.signal(true);
  // Wait for the manual compaction thread to finish.
  err = rdb_mc_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error(
        "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err);
  }

  if (rdb_open_tables.count()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind.
    error = 1;
  }

  rdb_open_tables.free();
  /*
    destructors for static objects can be called at _exit(),
    but we want to free the memory at dlclose()
  */
  // MARIADB_MERGE_2019: rdb_open_tables.m_hash.~Rdb_table_set();
  mysql_mutex_destroy(&rdb_sysvars_mutex);
  mysql_mutex_destroy(&rdb_block_cache_resize_mutex);

  delete rdb_collation_exceptions;

  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  // Free the cached per-collation data and null out the slots.
  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  // Tear down the managers in reverse dependency order.
  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

#if !defined(_WIN32) && !defined(__APPLE__)
  delete io_watchdog;
  io_watchdog = nullptr;
#endif

  // Disown the cache data since we're shutting down.
  // This results in memory leaks but it improved the shutdown time.
  // Don't disown when running under valgrind
#ifndef HAVE_valgrind
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif /* HAVE_valgrind */

  /*
    MariaDB: don't clear rocksdb_db_options and rocksdb_tbl_options.
    MyRocks' plugin variables refer to them.

    The plugin cannot be loaded again (see prevent_myrocks_loading) but plugin
    variables are processed before myrocks::rocksdb_init_func is invoked, so
    they must point to valid memory.
  */
  //rocksdb_db_options = nullptr;
  rocksdb_db_options->statistics = nullptr;
  //rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_free(rocksdb_update_cf_options);
  rocksdb_update_cf_options = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  /*
    Prevent loading the plugin after it has been loaded and then unloaded. This
    doesn't work currently.
  */
  prevent_myrocks_loading= true;

  DBUG_RETURN(error);
}
5928
5929 static inline void rocksdb_smart_seek(bool seek_backward,
5930 rocksdb::Iterator *const iter,
5931 const rocksdb::Slice &key_slice) {
5932 if (seek_backward) {
5933 iter->SeekForPrev(key_slice);
5934 } else {
5935 iter->Seek(key_slice);
5936 }
5937 }
5938
5939 static inline void rocksdb_smart_next(bool seek_backward,
5940 rocksdb::Iterator *const iter) {
5941 if (seek_backward) {
5942 iter->Prev();
5943 } else {
5944 iter->Next();
5945 }
5946 }
5947
#ifndef DBUG_OFF
// simulate that RocksDB has reported corrupted data; used by the
// "rocksdb_return_status_corrupted" debug injection point in is_valid()
static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
  *status = rocksdb::Status::Corruption();
}
#endif
5954
5955 // If the iterator is not valid it might be because of EOF but might be due
5956 // to IOError or corruption. The good practice is always check it.
5957 // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
5958 static inline bool is_valid(rocksdb::Iterator *scan_it) {
5959 if (scan_it->Valid()) {
5960 return true;
5961 } else {
5962 rocksdb::Status s = scan_it->status();
5963 DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
5964 dbug_change_status_to_corrupted(&s););
5965 if (s.IsIOError() || s.IsCorruption()) {
5966 if (s.IsCorruption()) {
5967 rdb_persist_corruption_marker();
5968 }
5969 rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
5970 }
5971 return false;
5972 }
5973 }
5974
5975 /**
5976 @brief
5977 Example of simple lock controls. The "table_handler" it creates is a
5978 structure we will pass to each ha_rocksdb handler. Do you have to have
5979 one of these? Well, you have pieces that are used for locking, and
5980 they are needed to function.
5981 */
5982
5983 Rdb_table_handler *Rdb_open_tables_map::get_table_handler(
5984 const char *const table_name) {
5985 DBUG_ASSERT(table_name != nullptr);
5986
5987 Rdb_table_handler *table_handler;
5988
5989 std::string table_name_str(table_name);
5990
5991 // First, look up the table in the hash map.
5992 RDB_MUTEX_LOCK_CHECK(m_mutex);
5993 const auto it = m_table_map.find(table_name_str);
5994 if (it != m_table_map.end()) {
5995 // Found it
5996 table_handler = it->second;
5997 } else {
5998 char *tmp_name;
5999
6000 // Since we did not find it in the hash map, attempt to create and add it
6001 // to the hash map.
6002 if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
6003 PSI_INSTRUMENT_ME,
6004 MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
6005 &tmp_name, table_name_str.length() + 1, NullS)))) {
6006 // Allocating a new Rdb_table_handler and a new table name failed.
6007 RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6008 return nullptr;
6009 }
6010
6011 table_handler->m_ref_count = 0;
6012 table_handler->m_table_name_length = table_name_str.length();
6013 table_handler->m_table_name = tmp_name;
6014 strmov(table_handler->m_table_name, table_name);
6015
6016 m_table_map.emplace(table_name_str, table_handler);
6017
6018 thr_lock_init(&table_handler->m_thr_lock);
6019 #ifdef MARIAROCKS_NOT_YET
6020 table_handler->m_io_perf_read.init();
6021 table_handler->m_io_perf_write.init();
6022 #endif
6023 }
6024 DBUG_ASSERT(table_handler->m_ref_count >= 0);
6025 table_handler->m_ref_count++;
6026
6027 RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6028
6029 return table_handler;
6030 }
6031
6032 std::vector<std::string> rdb_get_open_table_names(void) {
6033 return rdb_open_tables.get_table_names();
6034 }
6035
6036 std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
6037 const Rdb_table_handler *table_handler;
6038 std::vector<std::string> names;
6039
6040 RDB_MUTEX_LOCK_CHECK(m_mutex);
6041 for (const auto &kv : m_table_map) {
6042 table_handler = kv.second;
6043 DBUG_ASSERT(table_handler != nullptr);
6044 names.push_back(table_handler->m_table_name);
6045 }
6046 RDB_MUTEX_UNLOCK_CHECK(m_mutex);
6047
6048 return names;
6049 }
6050
6051 /*
6052 Inspired by innobase_get_int_col_max_value from InnoDB. This returns the
6053 maximum value a type can take on.
6054 */
6055 static ulonglong rdb_get_int_col_max_value(const Field *field) {
6056 ulonglong max_value = 0;
6057 switch (field->key_type()) {
6058 case HA_KEYTYPE_BINARY:
6059 max_value = 0xFFULL;
6060 break;
6061 case HA_KEYTYPE_INT8:
6062 max_value = 0x7FULL;
6063 break;
6064 case HA_KEYTYPE_USHORT_INT:
6065 max_value = 0xFFFFULL;
6066 break;
6067 case HA_KEYTYPE_SHORT_INT:
6068 max_value = 0x7FFFULL;
6069 break;
6070 case HA_KEYTYPE_UINT24:
6071 max_value = 0xFFFFFFULL;
6072 break;
6073 case HA_KEYTYPE_INT24:
6074 max_value = 0x7FFFFFULL;
6075 break;
6076 case HA_KEYTYPE_ULONG_INT:
6077 max_value = 0xFFFFFFFFULL;
6078 break;
6079 case HA_KEYTYPE_LONG_INT:
6080 max_value = 0x7FFFFFFFULL;
6081 break;
6082 case HA_KEYTYPE_ULONGLONG:
6083 max_value = 0xFFFFFFFFFFFFFFFFULL;
6084 break;
6085 case HA_KEYTYPE_LONGLONG:
6086 max_value = 0x7FFFFFFFFFFFFFFFULL;
6087 break;
6088 case HA_KEYTYPE_FLOAT:
6089 max_value = 0x1000000ULL;
6090 break;
6091 case HA_KEYTYPE_DOUBLE:
6092 max_value = 0x20000000000000ULL;
6093 break;
6094 default:
6095 abort();
6096 }
6097
6098 return max_value;
6099 }
6100
/*
  Initialize m_tbl_def->m_auto_incr_val on first use of the table.

  Order of preference:
  1. The persisted value in the MyRocks data dictionary.
  2. If nothing is persisted (or in debug builds, always), the max value
     found by scanning the auto-increment index (index_last).
  3. If both yield nothing (empty table), fall back to 1.
*/
void ha_rocksdb::load_auto_incr_value() {
  ulonglong auto_incr = 0;
  bool validate_last = false, use_datadic = true;
#ifndef DBUG_OFF
  // Debug builds always cross-check the dictionary value against the index.
  DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;);
  validate_last = true;
#endif

  if (use_datadic && dict_manager.get_auto_incr_val(
                         m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
    update_auto_incr_val(auto_incr);
  }

  // If we find nothing in the data dictionary, or if we are in debug mode,
  // then call index_last to get the last value.
  //
  // This is needed when upgrading from a server that did not support
  // persistent auto_increment, of if the table is empty.
  //
  // For debug mode, we are just verifying that the data dictionary value is
  // greater than or equal to the maximum value in the table.
  if (auto_incr == 0 || validate_last) {
    auto_incr = load_auto_incr_value_from_index();
    update_auto_incr_val(auto_incr);
  }

  // If we failed to find anything from the data dictionary and index, then
  // initialize auto_increment to 1.
  if (m_tbl_def->m_auto_incr_val == 0) {
    update_auto_incr_val(1);
  }
}
6133
/*
  Scan the auto-increment index backwards to find the largest stored value
  and return that value plus one (unless it would wrap past the column's
  maximum). Returns 0 if the index is empty.

  Temporarily switches active_index / keyread mode and restores them before
  returning; also releases the scan iterator and any snapshot it created.
*/
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  // Save handler state that index_last() will clobber.
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_converter->set_is_key_requested(true);

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    // Temporarily mark all columns readable so val_int() passes the
    // debug bitmap checks.
    MY_BITMAP *const old_map = dbug_tmp_use_all_columns(table, &table->read_set);
    last_val = field->val_int();
    if (last_val != max_val) {
      last_val++;
    }
#ifndef DBUG_OFF
    // Debug-only check: the persisted dictionary value must never lag
    // behind the maximum value actually present in the index.
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        DBUG_ASSERT(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(&table->read_set, old_map);
  }

  // Restore the saved handler state.
  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();

  return last_val;
}
6192
/*
  Atomically raise m_tbl_def->m_auto_incr_val to at least val.
  Lock-free monotonic-max: the CAS loop retries until either the stored
  value is already >= val or our value was successfully installed, so
  concurrent callers can never lower the counter.
*/
void ha_rocksdb::update_auto_incr_val(ulonglong val) {
  ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
  while (
      auto_incr_val < val &&
      !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
    // Do nothing - just loop until auto_incr_val is >= val or we successfully
    // set it
  }
}
6202
/*
  After a row write, bump the auto-increment counter based on the value the
  row actually stored in the auto-increment column: the next value is
  field-value + 1, clamped so it never wraps past the column type's maximum.
  Persists the new value into the transaction and into m_tbl_def.
*/
void ha_rocksdb::update_auto_incr_val_from_field() {
  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Temporarily mark all columns readable so val_int() passes the debug
  // bitmap checks.
  MY_BITMAP *const old_map =
      dbug_tmp_use_all_columns(table, &table->read_set);
  new_val = field->val_int();
  // don't increment if we would wrap around
  if (new_val != max_val) {
    new_val++;
  }

  dbug_tmp_restore_column_map(&table->read_set, old_map);

  // Only update if positive value was set for auto_incr column.
  if (new_val <= max_val) {
    Rdb_transaction *const tx = get_or_create_tx(table->in_use);
    tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val);

    // Update the in memory auto_incr value in m_tbl_def.
    update_auto_incr_val(new_val);
  }
}
6228
/*
  Initialize m_tbl_def->m_hidden_pk_val on first use of a table with a
  hidden PK: scan the hidden PK index backwards, decode the largest stored
  id and set the counter to id + 1 (or 1 for an empty table).

  @return HA_EXIT_SUCCESS, or an error from decoding the row key.
*/
int ha_rocksdb::load_hidden_pk_value() {
  // Save handler state that index_last() will clobber; the hidden PK is
  // always the last key in m_key_descr_arr.
  const int save_active_index = active_index;
  active_index = m_tbl_def->m_key_count - 1;
  const uint8 save_table_status = table->status;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();

  longlong hidden_pk_id = 1;
  // Do a lookup.
  if (!index_last(table->record[0])) {
    /*
      Decode PK field from the key
    */
    auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      // Release the snapshot we may have created before bailing out.
      if (is_new_snapshot) {
        tx->release_snapshot();
      }
      return err;
    }

    hidden_pk_id++;
  }

  // Lock-free monotonic-max update, same pattern as update_auto_incr_val().
  longlong old = m_tbl_def->m_hidden_pk_val;
  while (old < hidden_pk_id &&
         !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
  }

  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  // Restore the saved handler state.
  table->status = save_table_status;
  active_index = save_active_index;

  release_scan_iterator();

  return HA_EXIT_SUCCESS;
}
6270
6271 /* Get PK value from m_tbl_def->m_hidden_pk_info. */
6272 longlong ha_rocksdb::update_hidden_pk_val() {
6273 DBUG_ASSERT(has_hidden_pk(table));
6274 const longlong new_val = m_tbl_def->m_hidden_pk_val++;
6275 return new_val;
6276 }
6277
6278 /* Get the id of the hidden pk id from m_last_rowkey */
/*
  Decode the hidden-PK id from m_last_rowkey.

  The rowkey layout is: 4-byte index number followed by the 8-byte
  big-endian (mem-comparable) hidden PK value.

  @param[out] hidden_pk_id  decoded id
  @return HA_EXIT_SUCCESS, or HA_ERR_ROCKSDB_CORRUPT_DATA if the key is
          too short to contain the expected fields.
*/
int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
  DBUG_ASSERT(table != nullptr);
  DBUG_ASSERT(has_hidden_pk(table));

  rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());

  // Get hidden primary key from old key slice
  Rdb_string_reader reader(&rowkey_slice);
  // Skip over the leading index number.
  if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) {
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  const int length= 8; /* was Field_longlong::PACK_LENGTH in FB MySQL tree */
  const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
  if (from == nullptr) {
    /* Mem-comparable image doesn't have enough bytes */
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  // Network (big-endian) byte order matches the mem-comparable encoding.
  *hidden_pk_id = rdb_netbuf_read_uint64(&from);
  return HA_EXIT_SUCCESS;
}
6301
6302 /**
6303 @brief
6304 Free lock controls. We call this whenever we close a table. If the table had
6305 the last reference to the table_handler, then we free the memory associated
6306 with it.
6307 */
6308
/*
  Drop one reference to a shared Rdb_table_handler (pairs with
  get_table_handler()). When the last reference is released, the handler is
  removed from the map, its THR_LOCK is destroyed, and the single allocation
  holding both the struct and its name copy is freed.
*/
void Rdb_open_tables_map::release_table_handler(
    Rdb_table_handler *const table_handler) {
  RDB_MUTEX_LOCK_CHECK(m_mutex);

  DBUG_ASSERT(table_handler != nullptr);
  DBUG_ASSERT(table_handler->m_ref_count > 0);
  if (!--table_handler->m_ref_count) {
    // Last reference was released. Tear down the hash entry.
    const auto ret MY_ATTRIBUTE((__unused__)) =
        m_table_map.erase(std::string(table_handler->m_table_name));
    DBUG_ASSERT(ret == 1);  // the hash entry must actually be found and deleted
    my_core::thr_lock_delete(&table_handler->m_thr_lock);
    // One my_free releases both the struct and the name (my_multi_malloc).
    my_free(table_handler);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);
}
6326
6327 static handler *rocksdb_create_handler(my_core::handlerton *const hton,
6328 my_core::TABLE_SHARE *const table_arg,
6329 my_core::MEM_ROOT *const mem_root) {
6330 return new (mem_root) ha_rocksdb(hton, table_arg);
6331 }
6332
/*
  Constructor. Only zero/initialize members here; all real setup (buffers,
  table definition lookup, converter) happens in open().
*/
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg),
      m_table_handler(nullptr),
      m_scan_it(nullptr),
      m_scan_it_skips_bloom(false),
      m_scan_it_snapshot(nullptr),
      m_scan_it_lower_bound(nullptr),
      m_scan_it_upper_bound(nullptr),
      m_tbl_def(nullptr),
      m_pk_descr(nullptr),
      m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false),
      m_pk_tuple(nullptr),
      m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr),
      m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr),
      m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr),
      m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr),
      m_pack_buffer(nullptr),
      m_record_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE),
      m_keyread_only(false),
      m_insert_with_update(false),
      m_dup_pk_found(false),
      m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false),
      m_force_skip_unique_check(false) {}
6364
6365
/* Return the table's base name as stored in the MyRocks data-dictionary
   object (m_tbl_def). Only valid after open() has set m_tbl_def. */
const std::string &ha_rocksdb::get_table_basename() const {
  return m_tbl_def->base_tablename();
}
6369
6370 /**
6371 @return
6372 false OK
6373 other Error inpacking the data
6374 */
6375 bool ha_rocksdb::init_with_fields() {
6376 DBUG_ENTER_FUNC();
6377
6378 const uint pk = table_share->primary_key;
6379 if (pk != MAX_KEY) {
6380 const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
6381 check_keyread_allowed(pk /*PK*/, key_parts - 1, true);
6382 } else {
6383 m_pk_can_be_decoded = false;
6384 }
6385 cached_table_flags = table_flags();
6386
6387 DBUG_RETURN(false); /* Ok */
6388 }
6389
6390 /*
6391 If the key is a TTL key, we may need to filter it out.
6392
6393 The purpose of read filtering for tables with TTL is to ensure that
6394 during a transaction a key which has expired already but not removed by
6395 compaction yet is not returned to the user.
6396
6397 Without this the user might be hit with problems such as disappearing
6398 rows within a transaction, etc, because the compaction filter ignores
6399 snapshots when filtering keys.
6400 */
/*
  Decide whether a record in a TTL index has expired relative to curr_ts
  and should therefore be hidden from the reader.

  @param kd           key definition (must have TTL)
  @param ttl_rec_val  the record's value slice, containing the 8-byte TTL
                      timestamp at kd.m_ttl_rec_offset
  @param curr_ts      snapshot timestamp (or current time on the write path)
  @return true if the record is expired and must be skipped.
*/
bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
                                     const rocksdb::Slice &ttl_rec_val,
                                     const int64_t curr_ts) {
  DBUG_ASSERT(kd.has_ttl());
  DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);

  /*
    Curr_ts can only be 0 if there are no snapshots open.
    should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
    we are filtering on the write path (single INSERT/UPDATE) in which case
    we are passed in the current time as curr_ts.

    In the event curr_ts is 0, we always decide not to filter the record. We
    also log a warning and increment a diagnostic counter.
  */
  if (curr_ts == 0) {
    update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
    return false;
  }

  // Read filtering can be turned off entirely via system variables.
  if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
    return false;
  }

  Rdb_string_reader reader(&ttl_rec_val);

  /*
    Find where the 8-byte ttl is for each record in this index.
  */
  uint64 ts;
  if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
    /*
      This condition should never be reached since all TTL records have an
      8 byte ttl field in front. Don't filter the record out, and log an error.
    */
    std::string buf;
    buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
                      RDB_MAX_HEXDUMP_LEN);
    const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
    // NO_LINT_DEBUG
    sql_print_error(
        "Decoding ttl from PK value failed, "
        "for index (%u,%u), val: %s",
        gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
    DBUG_ASSERT(0);
    return false;
  }

  /* Hide record if it has expired before the current snapshot time. */
  uint64 read_filter_ts = 0;
#ifndef DBUG_OFF
  // Debug hook: artificially advance the filter timestamp in tests.
  read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
#endif
  bool is_hide_ttl =
      ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
  if (is_hide_ttl) {
    update_row_stats(ROWS_FILTERED);

    /* increment examined row count when rows are skipped */
    THD *thd = ha_thd();
    thd->inc_examined_row_count(1);
    DEBUG_SYNC(thd, "rocksdb.ttl_rows_examined");
  }
  return is_hide_ttl;
}
6466
/*
  Advance the iterator past any TTL-expired records so the caller lands on
  the first live row (no-op for indexes without TTL).

  @param kd             key definition being scanned
  @param iter           positioned iterator; advanced in place
  @param seek_backward  scan direction for rocksdb_smart_next()
  @return HA_EXIT_SUCCESS, or HA_ERR_QUERY_INTERRUPTED if the query was
          killed while skipping.
*/
int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
                                             rocksdb::Iterator *const iter,
                                             bool seek_backward) {
  if (kd.has_ttl()) {
    THD *thd = ha_thd();
    while (iter->Valid() &&
           should_hide_ttl_rec(
               kd, iter->value(),
               get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
      DEBUG_SYNC(thd, "rocksdb.check_flags_ser");
      // A long run of expired records must stay interruptible.
      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }
      rocksdb_smart_next(seek_backward, iter);
    }
  }
  return HA_EXIT_SUCCESS;
}
6485
6486 #ifndef DBUG_OFF
6487 void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
6488 std::string str(on_disk_rec->data(), on_disk_rec->size());
6489 on_disk_rec->Reset();
6490 str.append("abc");
6491 on_disk_rec->PinSelf(rocksdb::Slice(str));
6492 }
6493
/* Debug-only fault injection: truncate a stored record image to zero
   length (removes all bytes rather than Reset(), keeping the slice
   pinned) to simulate a corrupt/short record. */
void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) {
  on_disk_rec->remove_suffix(on_disk_rec->size());
}
6497
6498 void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
6499 std::string res;
6500 // The record is NULL-byte followed by VARCHAR(10).
6501 // Put the NULL-byte
6502 res.append("\0", 1);
6503 // Then, add a valid VARCHAR(12) value.
6504 res.append("\xC", 1);
6505 res.append("123456789ab", 12);
6506
6507 on_disk_rec->Reset();
6508 on_disk_rec->PinSelf(rocksdb::Slice(res));
6509 }
6510
/* Debug-only fault injection: raise a generic error to simulate a failure
   during inplace ALTER TABLE. */
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred.", MYF(0));
}
6515 #endif
6516
/*
  Convenience overload: decode this->m_retrieved_record (optionally
  corrupted first by the myrocks_simulate_bad_row_read* debug hooks) into
  buf. Delegates to the three-argument overload below.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
6528
6529 /*
6530 @brief
6531 Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
6532 storage format into buf (which can be table->record[0] or table->record[1]).
6533
6534 @param key Table record's key in mem-comparable form.
6535 @param buf Store record in table->record[0] format here
6536
6537 @detail
6538 If the table has blobs, the unpacked data in buf may keep pointers to the
6539 data in this->m_retrieved_record.
6540
6541 The key is only needed to check its checksum value (the checksum is in
6542 m_retrieved_record).
6543
6544 @seealso
6545 rdb_converter::setup_read_decoders() Sets up data structures which tell
6546 which columns to decode.
6547
6548 @return
6549 0 OK
6550 other Error inpacking the data
6551 */
6552
/* See the @detail block above: delegate the actual decoding of (key, value)
   into buf to the Rdb_converter, using the PK's key definition. */
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, const rocksdb::Slice *const value,
    uchar *const buf) {
  return m_converter->decode(m_pk_descr, buf, key, value);
}
6558
/*
  Allocate all per-handler key/record packing buffers, sized from the key
  definitions in tbl_def_arg. Also runs setup() on every key descriptor.

  @param table_arg            table definition to size buffers for
  @param tbl_def_arg          MyRocks data-dictionary object for the table
  @param alloc_alter_buffers  also allocate the duplicate-check buffers used
                              by inplace ALTER of unique secondary indexes
  @return HA_EXIT_SUCCESS, or HA_ERR_OUT_OF_MEM (all partial allocations
          are freed before returning the error).
*/
int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
                                  const Rdb_tbl_def *const tbl_def_arg,
                                  bool alloc_alter_buffers) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(m_pk_tuple == nullptr);

  std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;

  uint key_len = 0;
  uint max_packed_sk_len = 0;
  uint pack_key_len = 0;
  // NOTE(review): this uses `table` while most of the function uses
  // table_arg — appears to assume both refer to the same TABLE; confirm.
  uint record_len = table->s->reclength + table->s->null_bytes;

  m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
  if (has_hidden_pk(table_arg)) {
    // Hidden PK is a single internally-generated 8-byte counter column.
    m_pk_key_parts = 1;
  } else {
    m_pk_key_parts =
        table->key_info[table->s->primary_key].user_defined_key_parts;
    key_len = table->key_info[table->s->primary_key].key_length;
  }

  // move this into get_table_handler() ??
  m_pk_descr->setup(table_arg, tbl_def_arg);

  m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, key_len, MYF(0)));

  pack_key_len = m_pk_descr->max_storage_fmt_length();
  m_pk_packed_tuple =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, pack_key_len, MYF(0)));

  /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
  max_packed_sk_len = pack_key_len;
  for (uint i = 0; i < table_arg->s->keys; i++) {
    /* Primary key was processed above */
    if (i == table_arg->s->primary_key) continue;

    // TODO: move this into get_table_handler() ??
    kd_arr[i]->setup(table_arg, tbl_def_arg);

    // Size the shared SK buffers for the widest secondary key.
    const uint packed_len = kd_arr[i]->max_storage_fmt_length();
    if (packed_len > max_packed_sk_len) {
      max_packed_sk_len = packed_len;
    }
  }

  m_sk_packed_tuple =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
  m_sk_match_prefix_buf =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
  m_sk_packed_tuple_old =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
  m_end_key_packed_tuple =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
  m_pack_buffer =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
  m_record_buffer =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, record_len, MYF(0)));

  m_scan_it_lower_bound =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
  m_scan_it_upper_bound =
      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));

  /*
    If inplace alter is happening, allocate special buffers for unique
    secondary index duplicate checking.
  */
  if (alloc_alter_buffers) {
    m_dup_sk_packed_tuple =
        reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
    m_dup_sk_packed_tuple_old =
        reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
  }

  // Check all allocations at once; on any failure free everything so the
  // handler is left in a clean state.
  if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr ||
      m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr ||
      m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr ||
      m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr ||
      m_record_buffer == nullptr ||
      (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr ||
                               m_dup_sk_packed_tuple_old == nullptr))) {
    // One or more of the above allocations failed. Clean up and exit
    free_key_buffers();

    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6650
6651 void ha_rocksdb::free_key_buffers() {
6652 my_free(m_pk_tuple);
6653 m_pk_tuple = nullptr;
6654
6655 my_free(m_pk_packed_tuple);
6656 m_pk_packed_tuple = nullptr;
6657
6658 my_free(m_sk_packed_tuple);
6659 m_sk_packed_tuple = nullptr;
6660
6661 my_free(m_sk_match_prefix_buf);
6662 m_sk_match_prefix_buf = nullptr;
6663
6664 my_free(m_sk_packed_tuple_old);
6665 m_sk_packed_tuple_old = nullptr;
6666
6667 my_free(m_end_key_packed_tuple);
6668 m_end_key_packed_tuple = nullptr;
6669
6670 my_free(m_pack_buffer);
6671 m_pack_buffer = nullptr;
6672
6673 my_free(m_record_buffer);
6674 m_record_buffer = nullptr;
6675
6676 my_free(m_dup_sk_packed_tuple);
6677 m_dup_sk_packed_tuple = nullptr;
6678
6679 my_free(m_dup_sk_packed_tuple_old);
6680 m_dup_sk_packed_tuple_old = nullptr;
6681
6682 my_free(m_scan_it_lower_bound);
6683 m_scan_it_lower_bound = nullptr;
6684
6685 my_free(m_scan_it_upper_bound);
6686 m_scan_it_upper_bound = nullptr;
6687 }
6688
6689 #ifdef MARIAROCKS_NOT_YET
6690 #endif
/*
  Decide whether unique-key checks should be skipped for this table by
  matching its base name against the regex whitelist from the
  rocksdb_skip_unique_check_tables system variable.

  @param whitelist  pattern list, or nullptr to use the built-in default.
*/
void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) {
  const char *const wl =
      whitelist ? whitelist : DEFAULT_SKIP_UNIQUE_CHECK_TABLES;

#if defined(HAVE_PSI_INTERFACE)
  Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables);
#else
  Regex_list_handler regex_handler;
#endif

  // Invalid patterns produce a warning but do not fail the open.
  if (!regex_handler.set_patterns(wl)) {
    warn_about_bad_patterns(&regex_handler, "skip_unique_check_tables");
  }

  m_skip_unique_check = regex_handler.matches(m_tbl_def->base_tablename());
}
6707
6708 /**
6709 @return
6710 HA_EXIT_SUCCESS OK
6711 other HA_ERR error code (can be SE-specific)
6712 */
6713 int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
6714 DBUG_ENTER_FUNC();
6715
6716 int err = close();
6717 if (err) {
6718 DBUG_RETURN(err);
6719 }
6720
6721 m_table_handler = rdb_open_tables.get_table_handler(name);
6722
6723 if (m_table_handler == nullptr) {
6724 DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6725 }
6726
6727 my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
6728 nullptr);
6729 m_io_perf.init(&m_table_handler->m_table_perf_context,
6730 &m_table_handler->m_io_perf_read,
6731 &m_table_handler->m_io_perf_write, &stats);
6732 Rdb_perf_context_guard guard(&m_io_perf,
6733 rocksdb_perf_context_level(ha_thd()));
6734
6735 std::string fullname;
6736 err = rdb_normalize_tablename(name, &fullname);
6737 if (err != HA_EXIT_SUCCESS) {
6738 DBUG_RETURN(err);
6739 }
6740
6741 m_tbl_def = ddl_manager.find(fullname);
6742 if (m_tbl_def == nullptr) {
6743 my_error(ER_INTERNAL_ERROR, MYF(0),
6744 "Attempt to open a table that is not present in RocksDB-SE data "
6745 "dictionary");
6746 DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6747 }
6748 if (m_tbl_def->m_key_count != table->s->keys + has_hidden_pk(table)? 1:0)
6749 {
6750 sql_print_error("MyRocks: DDL mismatch: .frm file has %u indexes, "
6751 "MyRocks has %u (%s hidden pk)",
6752 table->s->keys, m_tbl_def->m_key_count,
6753 has_hidden_pk(table)? "1" : "no");
6754
6755 if (rocksdb_ignore_datadic_errors)
6756 {
6757 sql_print_error("MyRocks: rocksdb_ignore_datadic_errors=1, "
6758 "trying to continue");
6759 }
6760 else
6761 {
6762 my_error(ER_INTERNAL_ERROR, MYF(0),
6763 "MyRocks: DDL mismatch. Check the error log for details");
6764 DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6765 }
6766 }
6767
6768
6769 m_lock_rows = RDB_LOCK_NONE;
6770 m_key_descr_arr = m_tbl_def->m_key_descr_arr;
6771
6772 /*
6773 Full table scan actually uses primary key
6774 (UPDATE needs to know this, otherwise it will go into infinite loop on
6775 queries like "UPDATE tbl SET pk=pk+100")
6776 */
6777 key_used_on_scan = table->s->primary_key;
6778
6779 // close() above has already called free_key_buffers(). No need to do it here.
6780 err = alloc_key_buffers(table, m_tbl_def);
6781
6782 if (err) {
6783 DBUG_RETURN(err);
6784 }
6785
6786 /*
6787 init_with_fields() is used to initialize table flags based on the field
6788 definitions in table->field[].
6789 It is called by open_binary_frm(), but that function calls the method for
6790 a temporary ha_rocksdb object which is later destroyed.
6791
6792 If we are here in ::open(), then init_with_fields() has not been called
6793 for this object. Call it ourselves, we want all member variables to be
6794 properly initialized.
6795 */
6796 init_with_fields();
6797
6798 /* Initialize decoder */
6799 m_converter = std::make_shared<Rdb_converter>(ha_thd(), m_tbl_def, table);
6800
6801 /*
6802 Update m_ttl_bytes address to same as Rdb_converter's m_ttl_bytes.
6803 Remove this code after moving convert_record_to_storage_format() into
6804 Rdb_converter class.
6805 */
6806 m_ttl_bytes = m_converter->get_ttl_bytes_buffer();
6807
6808 /*
6809 MariaDB: adjust field->part_of_key for PK columns. We can only do it here
6810 because SE API is just relying on the HA_PRIMARY_KEY_IN_READ_INDEX which
6811 does not allow to distinguish between unpack'able and non-unpack'able
6812 columns.
6813 Upstream uses handler->init_with_fields() but we don't have that call.
6814 */
6815 {
6816 if (!has_hidden_pk(table)) {
6817 KEY *const pk_info = &table->key_info[table->s->primary_key];
6818 for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
6819 if (!m_pk_descr->can_unpack(kp)) {
6820 //
6821 uint field_index= pk_info->key_part[kp].field->field_index;
6822 table->field[field_index]->part_of_key.clear_all();
6823 table->field[field_index]->part_of_key.set_bit(table->s->primary_key);
6824 }
6825 }
6826 }
6827
6828 for (uint key= 0; key < table->s->keys; key++) {
6829 KEY *const key_info = &table->key_info[key];
6830 if (key == table->s->primary_key)
6831 continue;
6832 for (uint kp = 0; kp < key_info->usable_key_parts; kp++) {
6833 uint field_index= key_info->key_part[kp].field->field_index;
6834 if (m_key_descr_arr[key]->can_unpack(kp)) {
6835 table->field[field_index]->part_of_key.set_bit(key);
6836 } else {
6837 table->field[field_index]->part_of_key.clear_bit(key);
6838 }
6839 }
6840 }
6841 }
6842
6843 info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
6844
6845 /*
6846 The following load_XXX code calls row decode functions, and they do
6847 that without having done ::external_lock() or index_init()/rnd_init().
6848 (Note: this also means we're doing a read when there was no
6849 rdb_converter::setup_field_encoders() call)
6850
6851 Initialize the necessary variables for them:
6852 */
6853
6854 /* Load auto_increment value only once on first use. */
6855 if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
6856 load_auto_incr_value();
6857 }
6858
6859 /* Load hidden pk only once on first use. */
6860 if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
6861 (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
6862 free_key_buffers();
6863 DBUG_RETURN(err);
6864 }
6865
6866 /* Index block size in MyRocks: used by MySQL in query optimization */
6867 stats.block_size = rocksdb_tbl_options->block_size;
6868
6869 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
6870 #endif
6871 /* Determine at open whether we should skip unique checks for this table */
6872 set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));
6873
6874 DBUG_RETURN(HA_EXIT_SUCCESS);
6875 }
6876
/*
  Close the table: drop references to shared key descriptors, free the
  packing buffers, release our reference on the shared table handler and
  free per-handler string buffers. Safe to call on a never-opened handler.
*/
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;
  // shared_ptr reset releases the converter.
  m_converter = nullptr;
  free_key_buffers();

  if (m_table_handler != nullptr) {
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_last_rowkey.free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6898
/*
  Messages for the MyRocks-specific handler error codes, indexed by
  (error - HA_ERR_ROCKSDB_FIRST). The entry ORDER must match the numeric
  order of the HA_ERR_ROCKSDB_* codes; the static_assert below verifies
  only the count, not the ordering.
*/
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY.",
    "Specifying DATA DIRECTORY for an individual table is not supported.",
    "Specifying INDEX DIRECTORY for an individual table is not supported.",
    "RocksDB commit failed.",
    "Failure during bulk load operation.",
    "Found data corruption.",
    "CRC checksum mismatch.",
    "Invalid table.",
    "Could not access RocksDB properties.",
    "File I/O error during merge/sort operation.",
    "RocksDB status: not found.",
    "RocksDB status: corruption.",
    "RocksDB status: invalid argument.",
    "RocksDB status: io error.",
    "RocksDB status: no space.",
    "RocksDB status: merge in progress.",
    "RocksDB status: incomplete.",
    "RocksDB status: shutdown in progress.",
    "RocksDB status: timed out.",
    "RocksDB status: aborted.",
    "RocksDB status: lock limit reached.",
    "RocksDB status: busy.",
    "RocksDB status: deadlock.",
    "RocksDB status: expired.",
    "RocksDB status: try again.",
};

static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes");
6930
6931 //psergey-merge: do we need this in MariaDB: we have get_error_messages
6932 //below...
6933 #if 0
6934 static const char *rdb_get_error_message(int nr) {
6935 return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST];
6936 }
6937 #endif
6938
6939 static const char **rdb_get_error_messages(int nr) { return rdb_error_messages; }
6940
/*
  Produce a human-readable message for a handler error code.

  For lock-wait/deadlock/busy errors the transaction's detailed diagnostic
  is returned (and `true` signals a temporary error). For MyRocks-specific
  codes the static message table is used. Other codes leave buf untouched.

  @param error  handler error code
  @param buf    [out] message text
  @return true if the error is temporary, false otherwise.
*/
bool ha_rocksdb::get_error_message(const int error, String *const buf) {
  DBUG_ENTER_FUNC();

  // MyRocks error codes must live outside the server's reserved ranges.
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");

  if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK ||
      error == HA_ERR_ROCKSDB_STATUS_BUSY) {
    Rdb_transaction *const tx = get_tx_from_thd(ha_thd());
    DBUG_ASSERT(tx != nullptr);
    buf->append(tx->m_detailed_error);
    DBUG_RETURN(true);
  }

  if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
    buf->append(rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST]);
  }

  // We can be called with the values which are < HA_ERR_FIRST because most
  // MySQL internal functions will just return HA_EXIT_FAILURE in case of
  // an error.

  DBUG_RETURN(false);
}
6967
6968 /*
6969 Generalized way to convert RocksDB status errors into MySQL error code, and
6970 print error message.
6971
6972 Each error code below maps to a RocksDB status code found in:
6973 rocksdb/include/rocksdb/status.h
6974 */
/*
  Map a (non-OK) rocksdb::Status to the corresponding HA_ERR_ROCKSDB_*
  code and raise a MySQL error (ER_GET_ERRMSG) carrying the RocksDB
  message, optionally suffixed with opt_msg.

  @param s        RocksDB status; must not be OK
  @param opt_msg  optional extra context appended to the message
  @return the mapped handler error code, or -1 for an unknown status code.
*/
int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s,
                                   const char *opt_msg) {
  DBUG_ASSERT(!s.ok());

  int err;
  switch (s.code()) {
    case rocksdb::Status::Code::kOk:
      // Unreachable given the assert above, kept for switch completeness.
      err = HA_EXIT_SUCCESS;
      break;
    case rocksdb::Status::Code::kNotFound:
      err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND;
      break;
    case rocksdb::Status::Code::kCorruption:
      err = HA_ERR_ROCKSDB_STATUS_CORRUPTION;
      break;
    case rocksdb::Status::Code::kNotSupported:
      err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED;
      break;
    case rocksdb::Status::Code::kInvalidArgument:
      err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT;
      break;
    case rocksdb::Status::Code::kIOError:
      // Distinguish out-of-space from generic I/O failure via the subcode.
      err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE
                            : HA_ERR_ROCKSDB_STATUS_IO_ERROR;
      break;
    case rocksdb::Status::Code::kMergeInProgress:
      err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS;
      break;
    case rocksdb::Status::Code::kIncomplete:
      err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE;
      break;
    case rocksdb::Status::Code::kShutdownInProgress:
      err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS;
      break;
    case rocksdb::Status::Code::kTimedOut:
      err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT;
      break;
    case rocksdb::Status::Code::kAborted:
      // kAborted with the lock-limit subcode means rocksdb_max_row_locks hit.
      err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT
                              : HA_ERR_ROCKSDB_STATUS_ABORTED;
      break;
    case rocksdb::Status::Code::kBusy:
      err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK
                             : HA_ERR_ROCKSDB_STATUS_BUSY;
      break;
    case rocksdb::Status::Code::kExpired:
      err = HA_ERR_ROCKSDB_STATUS_EXPIRED;
      break;
    case rocksdb::Status::Code::kTryAgain:
      err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN;
      break;
    default:
      DBUG_ASSERT(0);
      return -1;
  }

  std::string errMsg;
  if (s.IsLockLimit()) {
    errMsg =
        "Operation aborted: Failed to acquire lock due to "
        "rocksdb_max_row_locks limit";
  } else {
    errMsg = s.ToString();
  }

  if (opt_msg) {
    std::string concatenated_error = errMsg + " (" + std::string(opt_msg) + ")";
    my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(),
             rocksdb_hton_name);
  } else {
    my_error(ER_GET_ERRMSG, MYF(0), s.code(), errMsg.c_str(),
             rocksdb_hton_name);
  }

  return err;
}
7051
/* MyRocks supports only the following collations for indexed columns */
// All three are binary collations; rdb_field_uses_nopad_collation() below
// treats members of this set as binary collations it can handle.
static const std::set<uint> RDB_INDEX_COLLATIONS = {
    COLLATION_BINARY, COLLATION_UTF8_BIN, COLLATION_LATIN1_BIN};
7055
7056 static bool rdb_is_index_collation_supported(
7057 const my_core::Field *const field) {
7058 const my_core::enum_field_types type = field->real_type();
7059 /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7060 if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7061 type == MYSQL_TYPE_BLOB) {
7062
7063 return (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7064 RDB_INDEX_COLLATIONS.end()) ||
7065 rdb_is_collation_supported(field->charset());
7066 }
7067 return true;
7068 }
7069
7070
7071 static bool
7072 rdb_field_uses_nopad_collation(const my_core::Field *const field) {
7073 const my_core::enum_field_types type = field->real_type();
7074 /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
7075 if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
7076 type == MYSQL_TYPE_BLOB) {
7077
7078 /*
7079 This is technically a NOPAD collation but it's a binary collation
7080 that we can handle.
7081 */
7082 if (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
7083 RDB_INDEX_COLLATIONS.end())
7084 return false;
7085
7086 return (field->charset()->state & MY_CS_NOPAD);
7087 }
7088 return false;
7089 }
7090
7091
7092 /*
7093 Create structures needed for storing data in rocksdb. This is called when the
7094 table is created. The structures will be shared by all TABLE* objects.
7095
7096 @param
7097 table_arg Table with definition
7098 db_table "dbname.tablename"
7099 len strlen of the above
7100 tbl_def_arg tbl_def whose key_descr is being created/populated
7101 old_tbl_def_arg tbl_def from which keys are being copied over from
7102 (for use during inplace alter)
7103
7104 @return
7105 0 - Ok
7106 other - error, either given table ddl is not supported by rocksdb or OOM.
7107 */
int ha_rocksdb::create_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg /* = nullptr */,
    const Rdb_tbl_def *const old_tbl_def_arg
    /* = nullptr */) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg->s != nullptr);

  /*
    These need to be one greater than MAX_INDEXES since the user can create
    MAX_INDEXES secondary keys and no primary key which would cause us
    to generate a hidden one.
  */
  std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;

  /*
    NOTE: All new column families must be created before new index numbers are
    allocated to each key definition. See below for more details.
    http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
  */
  if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  uint64 ttl_duration = 0;
  std::string ttl_column;
  uint ttl_field_offset;

  /* Extract and validate any TTL settings before building key defs. */
  uint err;
  if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg,
                                               &ttl_duration))) {
    DBUG_RETURN(err);
  }

  if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column,
                                          &ttl_field_offset))) {
    DBUG_RETURN(err);
  }

  /* We don't currently support TTL on tables with hidden primary keys. */
  if (ttl_duration > 0 && has_hidden_pk(table_arg)) {
    my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0));
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  /*
    If TTL duration is not specified but TTL column was specified, throw an
    error because TTL column requires duration.
  */
  if (ttl_duration == 0 && !ttl_column.empty()) {
    my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str());
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (!old_tbl_def_arg) {
    /*
      old_tbl_def doesn't exist. this means we are in the process of creating
      a new table.

      Get the index numbers (this will update the next_index_number)
      and create Rdb_key_def structures.
    */
    for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
      if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], cfs[i],
                         ttl_duration, ttl_column)) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }
    }
  } else {
    /*
      old_tbl_def exists. This means we are creating a new tbl_def as part of
      in-place alter table. Copy over existing keys from the old_tbl_def and
      generate the necessary new key definitions if any.
    */
    if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
                                old_tbl_def_arg, cfs, ttl_duration,
                                ttl_column)) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7192
7193 /*
7194 Checks index parameters and creates column families needed for storing data
7195 in rocksdb if necessary.
7196
7197 @param in
7198 table_arg Table with definition
7199 db_table Table name
7200 tbl_def_arg Table def structure being populated
7201
7202 @param out
7203 cfs CF info for each key definition in 'key_info' order
7204
7205 @return
7206 0 - Ok
7207 other - error
7208 */
int ha_rocksdb::create_cfs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg->s != nullptr);

  // Filesystem-decoded table name; filled lazily on first use below.
  char tablename_sys[NAME_LEN + 1];
  bool tsys_set= false;

  /*
    The first loop checks the index parameters and creates
    column families if necessary.
  */
  for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
    rocksdb::ColumnFamilyHandle *cf_handle;

    // Collation checks apply only to user-defined keys on non-temporary
    // tables (tmp_file_prefix marks internal #sql-* tables).
    if (!is_hidden_pk(i, table_arg, tbl_def_arg) &&
        tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
      if (!tsys_set)
      {
        tsys_set= true;
        my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
                                       tablename_sys, sizeof(tablename_sys));
      }

      // Validate the collation of every column in the (extended) key.
      for (uint part = 0; part < table_arg->key_info[i].ext_key_parts;
           part++)
      {
        /* MariaDB: disallow NOPAD collations */
        if (rdb_field_uses_nopad_collation(
              table_arg->key_info[i].key_part[part].field))
        {
          my_error(ER_MYROCKS_CANT_NOPAD_COLLATION, MYF(0));
          DBUG_RETURN(HA_EXIT_FAILURE);
        }

        // Unsupported collations are a warning (not an error) unless the
        // table is listed in rdb_collation_exceptions.
        if (rocksdb_strict_collation_check &&
            !rdb_is_index_collation_supported(
                table_arg->key_info[i].key_part[part].field) &&
            !rdb_collation_exceptions->matches(tablename_sys)) {

          char buf[1024];
          my_snprintf(buf, sizeof(buf),
                      "Indexed column %s.%s uses a collation that does not "
                      "allow index-only access in secondary key and has "
                      "reduced disk space efficiency in primary key.",
                      tbl_def_arg->full_tablename().c_str(),
                      table_arg->key_info[i].key_part[part].field->field_name.str);

          my_error(ER_INTERNAL_ERROR, MYF(ME_WARNING), buf);
        }
      }
    }

    // Internal consistency check to make sure that data in TABLE and
    // Rdb_tbl_def structures matches. Either both are missing or both are
    // specified. Yes, this is critical enough to make it into SHIP_ASSERT.
    SHIP_ASSERT(IF_PARTITIONING(!table_arg->part_info,true) == tbl_def_arg->base_partition().empty());

    // Generate the name for the column family to use.
    bool per_part_match_found = false;
    std::string cf_name =
        generate_cf_name(i, table_arg, tbl_def_arg, &per_part_match_found);

    // Prevent create from using the system column family.
    if (cf_name == DEFAULT_SYSTEM_CF_NAME) {
      my_error(ER_WRONG_ARGUMENTS, MYF(0),
               "column family not valid for storing index data.");
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    // Here's how `get_or_create_cf` will use the input parameters:
    //
    // `cf_name` - will be used as a CF name.
    cf_handle = cf_manager.get_or_create_cf(rdb, cf_name);

    if (!cf_handle) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    // Record the CF choice for this key so create_key_defs() can use it.
    auto &cf = (*cfs)[i];

    cf.cf_handle = cf_handle;
    cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str());
    cf.is_per_partition_cf = per_part_match_found;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7299
7300 /*
7301 Create key definition needed for storing data in rocksdb during ADD index
7302 inplace operations.
7303
7304 @param in
7305 table_arg Table with definition
7306 tbl_def_arg New table def structure being populated
7307 old_tbl_def_arg Old(current) table def structure
7308 cfs Struct array which contains column family information
7309
7310 @return
7311 0 - Ok
7312 other - error, either given table ddl is not supported by rocksdb or OOM.
7313 */
int ha_rocksdb::create_inplace_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
    const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs,
    uint64 ttl_duration, const std::string &ttl_column) const {
  DBUG_ENTER_FUNC();

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::shared_ptr<Rdb_key_def> *const new_key_descr =
      tbl_def_arg->m_key_descr_arr;
  // Map of key name -> position in the old table, for keys that survive
  // the alter unchanged (see get_old_key_positions()).
  const std::unordered_map<std::string, uint> old_key_pos =
      get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
                            old_tbl_def_arg);

  uint i;
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));

    if (it != old_key_pos.end()) {
      /*
        Found matching index in old table definition, so copy it over to the
        new one created.
      */
      const Rdb_key_def &okd = *old_key_descr[it->second];

      // Re-read the persisted index metadata from the data dictionary; it
      // must exist for an index we are carrying over.
      const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
      struct Rdb_index_info index_info;
      if (!dict_manager.get_index_info(gl_index_id, &index_info)) {
        // NO_LINT_DEBUG
        sql_print_error(
            "RocksDB: Could not get index information "
            "for Index Number (%u,%u), table %s",
            gl_index_id.cf_id, gl_index_id.index_id,
            old_tbl_def_arg->full_tablename().c_str());
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      // Recompute where the TTL bytes live in the record, if TTL is on.
      uint32 ttl_rec_offset =
          Rdb_key_def::has_index_flag(index_info.m_index_flags,
                                      Rdb_key_def::TTL_FLAG)
              ? Rdb_key_def::calculate_index_flag_offset(
                    index_info.m_index_flags, Rdb_key_def::TTL_FLAG)
              : UINT_MAX;

      /*
        We can't use the copy constructor because we need to update the
        keynr within the pack_info for each field and the keyno of the keydef
        itself.
      */
      new_key_descr[i] = std::make_shared<Rdb_key_def>(
          okd.get_index_number(), i, okd.get_cf(),
          index_info.m_index_dict_version, index_info.m_index_type,
          index_info.m_kv_version, okd.m_is_reverse_cf,
          okd.m_is_per_partition_cf, okd.m_name.c_str(),
          dict_manager.get_stats(gl_index_id), index_info.m_index_flags,
          ttl_rec_offset, index_info.m_ttl_duration);
    } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
                              cfs[i], ttl_duration, ttl_column)) {
      // Genuinely new index: build it from scratch.
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    DBUG_ASSERT(new_key_descr[i] != nullptr);
    new_key_descr[i]->setup(table_arg, tbl_def_arg);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
7382
/*
  Build a map of key name -> position in the old table definition, for keys
  that can be carried over unchanged by an inplace ALTER. A key qualifies
  only if its definition (flags, algorithm, comment, key parts) matches the
  new table's key of the same name, with one exception noted below.
*/
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  // Index the new table's keys by name for O(1) lookup below.
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    // A hidden PK has no TABLE-side KEY entry to compare; carry it over.
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name.str);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    // Key is reusable: remember its position in the old definition.
    old_key_pos[old_key->name.str] = i;
  }

  DBUG_RETURN(old_key_pos);
}
7448
7449 /* Check to see if two keys are identical. */
7450 int ha_rocksdb::compare_keys(const KEY *const old_key,
7451 const KEY *const new_key) const {
7452 DBUG_ENTER_FUNC();
7453
7454 /* Check index name. */
7455 if (strcmp(old_key->name.str, new_key->name.str) != 0) {
7456 DBUG_RETURN(HA_EXIT_FAILURE);
7457 }
7458
7459 /* If index algorithms are different then keys are different. */
7460 if (old_key->algorithm != new_key->algorithm) {
7461 DBUG_RETURN(HA_EXIT_FAILURE);
7462 }
7463
7464 /* Check that the key is identical between old and new tables. */
7465 if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) {
7466 DBUG_RETURN(HA_EXIT_FAILURE);
7467 }
7468
7469 /* Check index comment. (for column family changes) */
7470 std::string old_comment(old_key->comment.str, old_key->comment.length);
7471 std::string new_comment(new_key->comment.str, new_key->comment.length);
7472 if (old_comment.compare(new_comment) != 0) {
7473 DBUG_RETURN(HA_EXIT_FAILURE);
7474 }
7475
7476 DBUG_RETURN(HA_EXIT_SUCCESS);
7477 }
7478
7479 /* Check two keys to ensure that key parts within keys match */
7480 int ha_rocksdb::compare_key_parts(const KEY *const old_key,
7481 const KEY *const new_key) const {
7482 DBUG_ENTER_FUNC();
7483
7484 /* Skip if key parts do not match, as it is a different key */
7485 if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
7486 DBUG_RETURN(HA_EXIT_FAILURE);
7487 }
7488
7489 /* Check to see that key parts themselves match */
7490 for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
7491 if (strcmp(old_key->key_part[i].field->field_name.str,
7492 new_key->key_part[i].field->field_name.str) != 0) {
7493 DBUG_RETURN(HA_EXIT_FAILURE);
7494 }
7495
7496 /* Check if prefix index key part length has changed */
7497 if (old_key->key_part[i].length != new_key->key_part[i].length) {
7498 DBUG_RETURN(HA_EXIT_FAILURE);
7499 }
7500 }
7501
7502 DBUG_RETURN(HA_EXIT_SUCCESS);
7503 }
7504
7505 /*
7506 Create key definition needed for storing data in rocksdb.
7507 This can be called either during CREATE table or doing ADD index operations.
7508
7509 @param in
7510 table_arg Table with definition
7511 i Position of index being created inside table_arg->key_info
7512 tbl_def_arg Table def structure being populated
7513 cf_info Struct which contains column family information
7514
7515 @param out
7516 new_key_def Newly created index definition.
7517
7518 @return
7519 0 - Ok
7520 other - error, either given table ddl is not supported by rocksdb or OOM.
7521 */
7522 int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint i,
7523 const Rdb_tbl_def *const tbl_def_arg,
7524 std::shared_ptr<Rdb_key_def> *const new_key_def,
7525 const struct key_def_cf_info &cf_info,
7526 uint64 ttl_duration,
7527 const std::string &ttl_column) const {
7528 DBUG_ENTER_FUNC();
7529
7530 DBUG_ASSERT(*new_key_def == nullptr);
7531
7532 const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
7533 const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
7534 uchar index_type;
7535 uint16_t kv_version;
7536
7537 if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
7538 index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
7539 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7540 } else if (i == table_arg->s->primary_key) {
7541 index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
7542 uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7543 kv_version = pk_latest_version;
7544 } else {
7545 index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
7546 uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
7547 kv_version = sk_latest_version;
7548 }
7549
7550 // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as
7551 // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a
7552 // primary key or secondary key.
7553 DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT", {
7554 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1;
7555 });
7556
7557 DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", {
7558 if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) {
7559 kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2;
7560 }
7561 });
7562
7563 uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0);
7564
7565 uint32 ttl_rec_offset =
7566 Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG)
7567 ? Rdb_key_def::calculate_index_flag_offset(index_flags,
7568 Rdb_key_def::TTL_FLAG)
7569 : UINT_MAX;
7570
7571 const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
7572 *new_key_def = std::make_shared<Rdb_key_def>(
7573 index_id, i, cf_info.cf_handle, index_dict_version, index_type,
7574 kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name,
7575 Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration);
7576
7577 if (!ttl_column.empty()) {
7578 (*new_key_def)->m_ttl_column = ttl_column;
7579 }
7580 // initialize key_def
7581 (*new_key_def)->setup(table_arg, tbl_def_arg);
7582 DBUG_RETURN(HA_EXIT_SUCCESS);
7583 }
7584
7585 int rdb_normalize_tablename(const std::string &tablename,
7586 std::string *const strbuf) {
7587 if (tablename.size() < 2 || tablename[0] != '.' ||
7588 (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) {
7589 DBUG_ASSERT(0); // We were not passed table name?
7590 return HA_ERR_ROCKSDB_INVALID_TABLE;
7591 }
7592
7593 size_t pos = tablename.find_first_of(FN_LIBCHAR, 2);
7594 if (pos == std::string::npos) {
7595 pos = tablename.find_first_of(FN_LIBCHAR2, 2);
7596 }
7597
7598 if (pos == std::string::npos) {
7599 DBUG_ASSERT(0); // We were not passed table name?
7600 return HA_ERR_ROCKSDB_INVALID_TABLE;
7601 }
7602
7603 *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
7604
7605 return HA_EXIT_SUCCESS;
7606 }
7607
7608 /*
7609 Check to see if the user's original statement includes foreign key
7610 references
7611 */
7612 bool ha_rocksdb::contains_foreign_key(THD *const thd) {
7613 bool success;
7614 const char *str = thd_query_string(thd)->str;
7615
7616 DBUG_ASSERT(str != nullptr);
7617
7618 while (*str != '\0') {
7619 // Scan from our current pos looking for 'FOREIGN'
7620 str = rdb_find_in_string(str, "FOREIGN", &success);
7621 if (!success) {
7622 return false;
7623 }
7624
7625 // Skip past the found "FOREIGN'
7626 str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
7627 DBUG_ASSERT(success);
7628
7629 if (!my_isspace(&my_charset_bin, *str)) {
7630 return false;
7631 }
7632
7633 // See if the next token is 'KEY'
7634 str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
7635 if (!success) {
7636 continue;
7637 }
7638
7639 // See if the next token is '('
7640 str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7641 if (!success) {
7642 // There is an optional index id after 'FOREIGN KEY', skip it
7643 str = rdb_skip_id(&my_charset_bin, str);
7644
7645 // Now check for '(' again
7646 str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
7647 }
7648
7649 // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
7650 // a foreign key clause.
7651 return success;
7652 }
7653
7654 // We never found a valid foreign key clause
7655 return false;
7656 }
7657
7658 /**
7659 @brief
7660 splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
7661 the <dbname>, <tablename> and <part_no> components.
7662
7663 @param dbbuf returns database name/table_schema
7664 @param tablebuf returns tablename
7665 @param partitionbuf returns partition suffix if there is one
7666 @return HA_EXIT_SUCCESS on success, non-zero on failure to split
7667 */
7668 int rdb_split_normalized_tablename(const std::string &fullname,
7669 std::string *const db,
7670 std::string *const table,
7671 std::string *const partition) {
7672 DBUG_ASSERT(!fullname.empty());
7673
7674 #define RDB_PARTITION_STR "#P#"
7675
7676 /* Normalize returns dbname.tablename. */
7677 size_t dotpos = fullname.find('.');
7678
7679 /* Invalid table name? */
7680 if (dotpos == std::string::npos) {
7681 return HA_ERR_ROCKSDB_INVALID_TABLE;
7682 }
7683
7684 // Table must have a database name associated with it.
7685 DBUG_ASSERT(dotpos > 0);
7686
7687 if (db != nullptr) {
7688 *db = fullname.substr(0, dotpos);
7689 }
7690
7691 dotpos++;
7692
7693 const size_t partpos =
7694 fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
7695
7696 if (partpos != std::string::npos) {
7697 DBUG_ASSERT(partpos >= dotpos);
7698
7699 if (table != nullptr) {
7700 *table = fullname.substr(dotpos, partpos - dotpos);
7701 }
7702
7703 if (partition != nullptr) {
7704 *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
7705 }
7706 } else if (table != nullptr) {
7707 *table = fullname.substr(dotpos);
7708 }
7709
7710 return HA_EXIT_SUCCESS;
7711 }
7712
7713 /*
7714 Create a table's Rdb_tbl_def and its Rdb_key_defs and store table information
7715 into MyRocks Data Dictionary
7716 The method is called during create table/partition, truncate table/partition
7717
  @param table_name          IN      table's name formatted as
7719 'dbname.tablename'
7720 @param table_arg IN sql table
7721 @param auto_increment_value IN specified table's auto increment value
7722
7723 @return
7724 HA_EXIT_SUCCESS OK
7725 other HA_ERR error code (can be SE-specific)
7726 */
int ha_rocksdb::create_table(const std::string &table_name,
                             const TABLE *table_arg,
                             ulonglong auto_increment_value) {
  DBUG_ENTER_FUNC();

  int err;

  // All dictionary writes for this table go into one write batch so they
  // are committed atomically below.
  const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
  rocksdb::WriteBatch *const batch = wb.get();

  /* Create table/key descriptions and put them into the data dictionary */
  m_tbl_def = new Rdb_tbl_def(table_name);

  uint n_keys = table_arg->s->keys;

  /*
    If no primary key found, create a hidden PK and place it inside table
    definition
  */
  if (has_hidden_pk(table_arg)) {
    n_keys += 1;
    // reset hidden pk id
    // the starting valid value for hidden pk is 1
    m_tbl_def->m_hidden_pk_val = 1;
  }

  // m_tbl_def takes ownership of the key-descriptor array (see error path).
  m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
  m_tbl_def->m_key_count = n_keys;
  m_tbl_def->m_key_descr_arr = m_key_descr_arr;

  err = create_key_defs(table_arg, m_tbl_def);
  if (err != HA_EXIT_SUCCESS) {
    goto error;
  }

  m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];

  if (auto_increment_value) {
    // Persist the starting auto-increment value with the rest of the
    // dictionary data (skipped under the upgrade-test debug hook).
    bool autoinc_upgrade_test = false;
    m_tbl_def->m_auto_incr_val = auto_increment_value;
    DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;);
    if (!autoinc_upgrade_test) {
      auto s = dict_manager.put_auto_incr_val(
          batch, m_tbl_def->get_autoincr_gl_index_id(),
          m_tbl_def->m_auto_incr_val);
      if (!s.ok()) {
        goto error;
      }
    }
  }

  // Publish the new table definition and commit the batch while holding
  // the dictionary lock; unlock on every exit path.
  dict_manager.lock();
  err = ddl_manager.put_and_write(m_tbl_def, batch);
  if (err != HA_EXIT_SUCCESS) {
    dict_manager.unlock();
    goto error;
  }

  err = dict_manager.commit(batch);
  if (err != HA_EXIT_SUCCESS) {
    dict_manager.unlock();
    goto error;
  }

  dict_manager.unlock();

  DBUG_RETURN(HA_EXIT_SUCCESS);

error:
  /* Delete what we have allocated so far */
  delete m_tbl_def;
  m_tbl_def = nullptr;
  m_key_descr_arr = nullptr;

  DBUG_RETURN(err);
}
7805
7806 /**
7807 @brief
7808 create() is called to create a table. The variable name will have the name
7809 of the table.
7810
7811 @details
7812 When create() is called you do not need to worry about
7813 opening the table. Also, the .frm file will have already been
7814 created so adjusting create_info is not necessary. You can overwrite
7815 the .frm file at this point if you wish to change the table
7816 definition, but there are no methods currently provided for doing
7817 so.
7818
7819 Called from handle.cc by ha_create_table().
7820
7821 @return
7822 HA_EXIT_SUCCESS OK
7823 other HA_ERR error code (can be SE-specific)
7824
7825 @see
7826 ha_create_table() in handle.cc
7827 */
7828
int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
                       HA_CREATE_INFO *const create_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(create_info != nullptr);

  if (create_info->data_file_name) {
    // DATA DIRECTORY is used to create tables under a specific location
    // outside the MySQL data directory. We don't support this for MyRocks.
    // The `rocksdb_datadir` setting should be used to configure RocksDB data
    // directory.
    DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED);
  }

  if (create_info->index_file_name) {
    // Similar check for INDEX DIRECTORY as well.
    DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED);
  }

  int err;
  /*
    Construct dbname.tablename ourselves, because partitioning
    passes strings like "./test/t14#P#p0" for individual partitions,
    while table_arg->s->table_name has none of that.
  */
  std::string str;
  err = rdb_normalize_tablename(name, &str);
  if (err != HA_EXIT_SUCCESS) {
    DBUG_RETURN(err);
  }

  // FOREIGN KEY isn't supported yet
  THD *const thd = my_core::thd_get_current_thd();
  if (contains_foreign_key(thd)) {
    my_error(ER_NOT_SUPPORTED_YET, MYF(0),
             "FOREIGN KEY for the RocksDB storage engine");
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }

  // Check whether Data Dictionary contain information
  Rdb_tbl_def *tbl = ddl_manager.find(str);
  if (tbl != nullptr) {
    if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
      // TRUNCATE is implemented as drop + re-create, so an existing
      // definition is expected here and is removed first.
      err = delete_table(tbl);
      if (err != HA_EXIT_SUCCESS) {
        DBUG_RETURN(err);
      }
    } else {
      // A leftover definition for a plain CREATE means the dictionary and
      // the server's metadata disagree.
      my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name);
      DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
    }
  }

  // The below adds/clears hooks in RocksDB sync points. There's no reason for
  // this code to be in ::create() but it needs to be somewhere where it is
  // away from any tight loops and where one can invoke it from mtr:
  DBUG_EXECUTE_IF("rocksdb_enable_delay_commits",
    {
      auto syncpoint= rocksdb::SyncPoint::GetInstance();
      syncpoint->SetCallBack("DBImpl::WriteImpl:BeforeLeaderEnters",
                             [&](void* /*arg*/) {my_sleep(500);} );
      syncpoint->EnableProcessing();
      push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
                         "enable_delay_commits_mode ON");

    });
  DBUG_EXECUTE_IF("rocksdb_disable_delay_commits",
    {
      auto syncpoint= rocksdb::SyncPoint::GetInstance();
      syncpoint->ClearCallBack("DBImpl::WriteImpl:BeforeLeaderEnters");
      syncpoint->DisableProcessing();
      push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
                         "enable_delay_commits_mode OFF");
    });

  DBUG_RETURN(create_table(str, table_arg, create_info->auto_increment_value));
}
7907
7908 /**
7909 @note
7910 This function is used only when the table has not yet been opened, and
7911 keyread_allowed bitmap doesn't have the correct values yet.
7912
7913 See comment in ha_rocksdb::index_flags() for details.
7914 */
7915
7916 bool ha_rocksdb::check_keyread_allowed(uint inx, uint part,
7917 bool all_parts) const {
7918 bool res = true;
7919 KEY *const key_info = &table_share->key_info[inx];
7920
7921 Rdb_field_packing dummy1;
7922 res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
7923 key_info->key_part[part].length);
7924
7925 if (res && all_parts) {
7926 for (uint i = 0; i < part; i++) {
7927 Field *field;
7928 if ((field = key_info->key_part[i].field)) {
7929 Rdb_field_packing dummy;
7930 if (!dummy.setup(nullptr, field, inx, i,
7931 key_info->key_part[i].length)) {
7932 /* Cannot do index-only reads for this column */
7933 res = false;
7934 break;
7935 }
7936 }
7937 }
7938 }
7939
7940 const uint pk = table_share->primary_key;
7941 if (inx == pk && all_parts &&
7942 part + 1 == table_share->key_info[pk].user_defined_key_parts) {
7943 m_pk_can_be_decoded = res;
7944 }
7945
7946 return res;
7947 }
7948
/*
  Position `iter` on the first record whose index tuple equals (or has as a
  prefix) `key_slice`, skipping records hidden by TTL.

  @return HA_EXIT_SUCCESS when positioned on a match,
          HA_ERR_KEY_NOT_FOUND when no matching record exists,
          HA_ERR_QUERY_INTERRUPTED when the statement was killed.
*/
int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
                               rocksdb::Iterator *const iter,
                               const bool /* unused */,
                               const rocksdb::Slice &key_slice,
                               const int64_t ttl_filter_ts) {
  THD *thd = ha_thd();
  /*
    We are looking for the first record such that
      index_tuple= lookup_tuple.
    lookup_tuple may be a prefix of the index.
  */
  rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);

  while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
    // Abort promptly if the statement was killed.
    if (thd && thd->killed) {
      return HA_ERR_QUERY_INTERRUPTED;
    }
    /*
      If TTL is enabled we need to check if the given key has already expired
      from the POV of the current transaction. If it has, try going to the next
      key.
    */
    if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
      rocksdb_smart_next(kd.m_is_reverse_cf, iter);
      continue;
    }

    return HA_EXIT_SUCCESS;
  }

  /*
    Got a record that is not equal to the lookup value, or even a record
    from another table.index.
  */
  return HA_ERR_KEY_NOT_FOUND;
}
7985
/**
  Position m_scan_it on the record with the biggest key that is smaller than
  the lookup tuple. Exact matches (when the full key was given) and
  TTL-expired records are stepped over.

  @return HA_EXIT_SUCCESS           iterator is positioned
          HA_ERR_KEY_NOT_FOUND      no such record exists
          HA_ERR_QUERY_INTERRUPTED  the statement was killed
*/
int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
                                const bool full_key_match,
                                const rocksdb::Slice &key_slice,
                                const int64_t ttl_filter_ts) {
  THD *thd = ha_thd();
  /*
    We are looking for record with the biggest t.key such that
    t.key < lookup_tuple.
  */
  rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);

  while (is_valid(m_scan_it)) {
    if (thd && thd->killed) {
      return HA_ERR_QUERY_INTERRUPTED;
    }
    /*
      We are using full key and we've hit an exact match, or...

      If TTL is enabled we need to check if the given key has already expired
      from the POV of the current transaction. If it has, try going to the next
      key.
    */
    if ((full_key_match &&
         kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
        (kd.has_ttl() &&
         should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
      // Note: the seek direction is inverted (!m_is_reverse_cf) because we
      // are walking towards smaller keys.
      rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
      continue;
    }

    return HA_EXIT_SUCCESS;
  }

  return HA_ERR_KEY_NOT_FOUND;
}
8021
/**
  Position m_scan_it on the first record at or after the lookup tuple
  (the caller encodes the strict ">" case by pre-computing the successor of
  the lookup key), skipping TTL-expired records.

  @return HA_EXIT_SUCCESS           iterator is positioned
          HA_ERR_KEY_NOT_FOUND      no such record exists
          HA_ERR_QUERY_INTERRUPTED  the statement was killed
*/
int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
                               const rocksdb::Slice &key_slice,
                               const int64_t ttl_filter_ts) {
  THD *thd = ha_thd();
  /*
    We are looking for the first record such that

      index_tuple $GT lookup_tuple

    with HA_READ_AFTER_KEY, $GT = '>',
    with HA_READ_KEY_OR_NEXT, $GT = '>='
  */
  rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);

  /*
    If TTL is enabled we need to check if the given key has already expired
    from the POV of the current transaction. If it has, try going to the next
    key.
  */
  while (is_valid(m_scan_it) && kd.has_ttl() &&
         should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
    if (thd && thd->killed) {
      return HA_ERR_QUERY_INTERRUPTED;
    }
    rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
  }

  return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
}
8051
/**
  Given that m_scan_it was seeked with key_slice, move it to the record the
  SQL-layer find_flag actually asks for, and report the direction in which
  the caller should continue scanning.

  @param[out] move_forward  true  => fetch subsequent rows with index_next()
                            false => fetch subsequent rows with index_prev()

  @return HA_EXIT_SUCCESS, HA_ERR_KEY_NOT_FOUND, HA_ERR_UNSUPPORTED, or
          HA_ERR_QUERY_INTERRUPTED (propagated from the read_* helpers)
*/
int ha_rocksdb::position_to_correct_key(
    const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
    const bool full_key_match, const uchar *const key,
    const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
    bool *const move_forward, const int64_t ttl_filter_ts) {
  int rc = 0;

  *move_forward = true;

  switch (find_flag) {
    case HA_READ_KEY_EXACT:
      rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice,
                          ttl_filter_ts);
      break;
    case HA_READ_BEFORE_KEY:
      *move_forward = false;
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_AFTER_KEY:
    case HA_READ_KEY_OR_NEXT:
      rc = read_after_key(kd, key_slice, ttl_filter_ts);
      if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      }
      break;
    case HA_READ_KEY_OR_PREV:
    case HA_READ_PREFIX:
      /* This flag is not used by the SQL layer, so we don't support it yet. */
      rc = HA_ERR_UNSUPPORTED;
      break;
    case HA_READ_PREFIX_LAST:
    case HA_READ_PREFIX_LAST_OR_PREV:
      *move_forward = false;
      /*
        Find the last record with the specified index prefix lookup.
        - HA_READ_PREFIX_LAST requires that the record has the
          prefix=lookup (if there are no such records,
          HA_ERR_KEY_NOT_FOUND should be returned).
        - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
          records with prefix=lookup, we should return the last record
          before that.
      */
      rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
      if (rc == 0) {
        const rocksdb::Slice &rkey = m_scan_it->key();
        if (!kd.covers_key(rkey)) {
          /* The record we've got is not from this index */
          rc = HA_ERR_KEY_NOT_FOUND;
        } else if (find_flag == HA_READ_PREFIX_LAST) {
          // Re-pack the original lookup tuple: key_slice was mutated by the
          // caller (kd.successor()), so it cannot be compared against.
          uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_sk_packed_tuple, m_record_buffer,
                                          key, keypart_map);
          rocksdb::Slice lookup_tuple(
              reinterpret_cast<char *>(m_sk_packed_tuple), size);

          // We need to compare the key we've got with the original search
          // prefix.
          if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
            rc = HA_ERR_KEY_NOT_FOUND;
          }
        }
      }
      break;
    default:
      DBUG_ASSERT(0);
      break;
  }

  return rc;
}
8127
/**
  Compute the length in bytes (including the 4-byte index number) of the
  leading part of 'slice' that is constrained by equality conditions. The
  result is used to decide whether the prefix bloom filter can be used.

  @param bytes_changed_by_succ  how many bytes kd.successor() changed in the
                                search tuple (relevant for HA_READ_PREFIX_LAST)
  @param[out] end_key_packed_size  set to the packed length of end_key when
                                   an end key is supplied
  @return equal-condition length in bytes
*/
int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
                                 const enum ha_rkey_function &find_flag,
                                 const rocksdb::Slice &slice,
                                 const int bytes_changed_by_succ,
                                 const key_range *const end_key,
                                 uint *const end_key_packed_size) {
  // On an exact lookup the whole search tuple is an equality condition.
  if (find_flag == HA_READ_KEY_EXACT) return slice.size();

  if (find_flag == HA_READ_PREFIX_LAST) {
    /*
      We have made the kd.successor(m_sk_packed_tuple) call above.

      The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
    */
    return slice.size() - bytes_changed_by_succ;
  }

  if (end_key) {
    *end_key_packed_size =
        kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
                            m_record_buffer, end_key->key, end_key->keypart_map);

    /*
      Calculating length of the equal conditions here. 4 byte index id is
      included.
      Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
       WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
       WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
      Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
       WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
    */
    rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
                             *end_key_packed_size);
    // The equal prefix is the longest common prefix of start and end keys.
    return slice.difference_offset(end_slice);
  }

  /*
    On range scan without any end key condition, there is no
    eq cond, and eq cond length is the same as index_id size (4 bytes).
    Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
     WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
  */
  return Rdb_key_def::INDEX_NUMBER_SIZE;
}
8172
8173 int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
8174 int rc;
8175 const rocksdb::Slice &rkey = m_scan_it->key();
8176 const uint pk_size = rkey.size();
8177 const char *pk_data = rkey.data();
8178
8179 memcpy(m_pk_packed_tuple, pk_data, pk_size);
8180 m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
8181
8182 if (m_lock_rows != RDB_LOCK_NONE) {
8183 /* We need to put a lock and re-read */
8184 rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
8185 } else {
8186 /* Unpack from the row we've read */
8187 const rocksdb::Slice &value = m_scan_it->value();
8188 rc = convert_record_from_storage_format(&rkey, &value, buf);
8189 }
8190
8191 return rc;
8192 }
8193
/**
  Read the table row for the secondary-index record m_scan_it is positioned
  on: either unpack the row directly from the index record (covered,
  index-only lookup) or extract the primary key and fetch the row by rowid.

  @param buf           OUT  record buffer (table->record[0] format)
  @param kd            secondary index being scanned
  @param move_forward  scan direction, used when ICP needs to skip records

  @return 0 on success, HA_ERR_* otherwise
*/
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  int rc = 0;
  uint pk_size;

  /* Get the key columns and primary key value */
  const rocksdb::Slice &rkey = m_scan_it->key();
  const rocksdb::Slice &value = m_scan_it->value();

#ifndef DBUG_OFF
  // Save m_keyread_only so the debug injection below can be undone.
  bool save_keyread_only = m_keyread_only;
#endif
  DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });

  // The lookup is "covered" if the index can satisfy all requested columns.
  bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) ||
                        kd.covers_lookup(&value, &m_lookup_bitmap);

#ifndef DBUG_OFF
  m_keyread_only = save_keyread_only;
#endif

  if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
    // Index-only read: unpack the row from the index record itself.
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    if (pk_size == RDB_INVALID_KEY_LEN) {
      rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
    } else {
      rc = kd.unpack_record(table, buf, &rkey, &value,
                            m_converter->get_verify_row_debug_checksums());
      global_stats.covered_secondary_key_lookups.inc();
    }
  } else {
    if (kd.m_is_reverse_cf) move_forward = !move_forward;

    // Skip index records rejected by the pushed index condition, then fetch
    // the full row via the primary key.
    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
      } else {
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    // Remember the rowkey for position()/rnd_pos() style re-reads.
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
8249
8250 /**
8251 @note
8252 The problem with this function is that SQL layer calls it, when
8253 - the table has not been yet opened (no ::open() call done)
8254 - this->table_share already exists, but it is in the process of being
8255 filled, so some of fields are still NULL.
8256 - In particular, table_share->key_info[inx].key_part[] is filled only up
8257 to part #part. Subsequent key parts are not yet filled.
8258
8259 To complicate things further, SQL layer will call index_flags() with
8260 all_parts=TRUE. Essentially, we're asked to provide flags for reading
8261 keyparts whose datatype is not yet known.
8262
  We work around this problem by using check_keyread_allowed(), which uses the
  table_share object and is careful not to step on uninitialized data.
8265
8266 When we get a call with all_parts=TRUE, we try to analyze all parts but
8267 ignore those that have key_part->field==nullptr (these are not initialized
8268 yet).
8269 */
8270
8271 ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
8272 DBUG_ENTER_FUNC();
8273
8274 ulong base_flags = HA_READ_NEXT | // doesn't seem to be used
8275 HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
8276
8277 if (check_keyread_allowed(inx, part, all_parts)) {
8278 base_flags |= HA_KEYREAD_ONLY;
8279 }
8280
8281 if (inx == table_share->primary_key) {
8282 /*
8283 Index-only reads on primary key are the same as table scan for us. Still,
8284 we need to explicitly "allow" them, otherwise SQL layer will miss some
8285 plans.
8286 */
8287 base_flags |= HA_KEYREAD_ONLY | HA_CLUSTERED_INDEX;
8288 } else {
8289 /*
8290 We can Index Condition Pushdown any key except the primary. With primary
8291 key, we get (pk, record) pair immediately, there is no place to put the
8292 ICP check.
8293 */
8294 base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
8295 }
8296
8297 DBUG_RETURN(base_flags);
8298 }
8299
8300 /**
8301 @brief
8302 Read next index tuple through the secondary index.
8303
8304 @details
8305 m_scan_it points at the index key-value pair that we should read the (pk,row)
8306 pair for.
8307 */
int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
  DBUG_ASSERT(table != nullptr);
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  /* Use STATUS_NOT_FOUND when record not found or some error occurred */
  table->status = STATUS_NOT_FOUND;

  if (is_valid(m_scan_it)) {
    rocksdb::Slice key = m_scan_it->key();

    /* Check if we've ran out of records of this index */
    if (m_key_descr_arr[keyno]->covers_key(key)) {
      int rc = 0;

      // TODO: We could here check if we have ran out of range we're scanning
      // Extract the primary key from the secondary index record.
      const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
          table, *m_pk_descr, &key, m_pk_packed_tuple);
      if (size == RDB_INVALID_KEY_LEN) {
        return HA_ERR_ROCKSDB_CORRUPT_DATA;
      }

      // Remember the rowkey for later position()/re-read calls.
      m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
                         &my_charset_bin);

      rocksdb::Slice value = m_scan_it->value();
      // Index-only read is possible when the index covers all needed columns.
      bool covered_lookup =
          (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
          m_key_descr_arr[keyno]->covers_lookup(&value, &m_lookup_bitmap);
      if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
        rc = m_key_descr_arr[keyno]->unpack_record(
            table, buf, &key, &value,
            m_converter->get_verify_row_debug_checksums());
        global_stats.covered_secondary_key_lookups.inc();
      } else {
        // Not covered (or a locking read): fetch the full row by rowid.
        DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
      }

      if (!rc) {
        table->status = 0;
#ifdef MARIAROCKS_NOT_YET
        stats.rows_read++;
        stats.rows_index_next++;
#endif
        update_row_stats(ROWS_READ);
      }
      return rc;
    }
  }
  return HA_ERR_END_OF_FILE;
}
8360
8361 bool ha_rocksdb::is_using_full_key(key_part_map keypart_map,
8362 uint actual_key_parts)
8363 {
8364 return (keypart_map == HA_WHOLE_KEY) ||
8365 (keypart_map == ((key_part_map(1) << actual_key_parts)
8366 - 1));
8367 }
8368
8369 /*
8370 ha_rocksdb::read_range_first overrides handler::read_range_first.
8371 The only difference from handler::read_range_first is that
8372 ha_rocksdb::read_range_first passes end_key to
8373 ha_rocksdb::index_read_map_impl function.
8374
8375 @return
8376 HA_EXIT_SUCCESS OK
8377 other HA_ERR error code (can be SE-specific)
8378 */
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  int result;

  eq_range = eq_range_arg;

  /* MariaDB: Pass RANGE_SCAN_ASC when we support ICP on backwards scans */
  set_end_range(end_key);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key) {
    // Read first record
    result = ha_index_first(table->record[0]);
  } else {
#ifdef MARIAROCKS_NOT_YET
    if (is_using_prohibited_gap_locks(
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }
#endif
    increment_statistics(&SSV::ha_read_key_count);

    // Unlike handler::read_range_first(), pass end_key down so the impl can
    // compute the equal-condition length for the prefix bloom filter.
    result =
        index_read_map_impl(table->record[0], start_key->key,
                            start_key->keypart_map, start_key->flag, end_key);
  }
  if (result) {
    // Map "no such key" to "end of range" as the SQL layer expects.
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
  }

  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
8425
8426 int ha_rocksdb::prepare_index_scan()
8427 {
8428 range_key_part= table->key_info[active_index].key_part;
8429 set_end_range(NULL);
8430 return 0;
8431 }
8432
8433
8434 int ha_rocksdb::prepare_range_scan(const key_range *start_key,
8435 const key_range *end_key)
8436 {
8437 range_key_part= table->key_info[active_index].key_part;
8438
8439 if (start_key)
8440 {
8441 m_save_start_range= *start_key;
8442 m_start_range= &m_save_start_range;
8443 }
8444 else
8445 m_start_range= NULL;
8446
8447 set_end_range(end_key);
8448 return 0;
8449 }
8450
8451
8452 /**
8453 @return
8454 HA_EXIT_SUCCESS OK
8455 other HA_ERR error code (can be SE-specific)
8456 */
8457
8458 int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
8459 key_part_map keypart_map,
8460 enum ha_rkey_function find_flag) {
8461 DBUG_ENTER_FUNC();
8462
8463 DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
8464 }
8465
8466 /*
8467 See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
8468 index navigation commands are converted into RocksDB lookup commands.
8469
8470 This function takes end_key as an argument, and it is set on range scan.
8471 MyRocks needs to decide whether prefix bloom filter can be used or not.
  To decide whether the prefix bloom filter can be used, the length of the
  equal condition must be calculated. On equal lookups (find_flag ==
  HA_READ_KEY_EXACT), the equal condition length is the same as
  rocksdb::Slice.size() of the start key.
8476 On range scan, equal condition length is MIN(start_key, end_key) of the
8477 rocksdb::Slice expression.
8478
8479 @return
8480 HA_EXIT_SUCCESS OK
8481 other HA_ERR error code (can be SE-specific)
8482 */
int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map,
                                    enum ha_rkey_function find_flag,
                                    const key_range *end_key) {
  DBUG_ENTER_FUNC();

  DBUG_EXECUTE_IF("myrocks_busy_loop_on_row_read", int debug_i = 0;
                  while (1) { debug_i++; });

  int rc = 0;

  THD *thd = ha_thd();
  DEBUG_SYNC(thd, "rocksdb.check_flags_rmi");
  if (thd && thd->killed) {
    rc = HA_ERR_QUERY_INTERRUPTED;
    DBUG_RETURN(rc);
  }

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  const uint actual_key_parts = kd.get_key_parts();
  bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);

  // If the caller did not pass an end key, fall back to the one stored by
  // set_end_range() (if any).
  if (!end_key) end_key = end_range;

  /* By default, we don't need the retrieved records to match the prefix */
  m_sk_match_prefix = nullptr;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
      using_full_key) {
    /*
      Equality lookup over primary key, using full tuple.
      This is a special case, use DB::Get.
    */
    const uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_pk_packed_tuple, m_record_buffer,
                                          key, keypart_map);
    bool skip_lookup = is_blind_delete_enabled();

    rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, skip_lookup, false);

    if (!rc && !skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
      stats.rows_read++;
      stats.rows_index_first++;
#endif
      update_row_stats(ROWS_READ);
    }
    DBUG_RETURN(rc);
  }

  /*
    Unique secondary index performs lookups without the extended key fields
  */
  uint packed_size;
  if (active_index != table->s->primary_key &&
      table->key_info[active_index].flags & HA_NOSAME &&
      find_flag == HA_READ_KEY_EXACT && using_full_key) {
    // Pack only the user-defined key parts, not the appended PK columns.
    key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
                                                   .user_defined_key_parts) -
                           1;
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      m_record_buffer, key, tmp_map);
    if (table->key_info[active_index].user_defined_key_parts !=
        kd.get_key_parts()) {
      using_full_key = false;
    }
  } else {
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      m_record_buffer, key, keypart_map);
  }

  if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
      (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
    /*
      We are doing a point index lookup, and ICP is enabled. It is possible
      that this call will be followed by ha_rocksdb->index_next_same() call.

      Do what InnoDB does: save the lookup tuple now. We will need it in
      index_next_same/find_icp_matching_index_rec in order to stop scanning
      as soon as index record doesn't match the lookup tuple.

      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
    m_sk_match_prefix = m_sk_match_prefix_buf;
    m_sk_match_length = packed_size;
    memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
  }

  int bytes_changed_by_succ = 0;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
    /* See below */
    bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
  }

  rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
                       packed_size);

  uint end_key_packed_size = 0;
  /*
    In MariaDB, the end_key is always the bigger end of the range.
    If we are doing a reverse-ordered scan (that is, walking from the bigger
    key values to smaller), we should use the smaller end of range as end_key.
  */
  const key_range *cur_end_key= end_key;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_BEFORE_KEY)
  {
    cur_end_key= m_start_range;
  }

  const uint eq_cond_len =
      calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, cur_end_key,
                       &end_key_packed_size);

  // The prefix bloom filter can use all key parts only when every part of
  // the key participates in the equality lookup.
  bool use_all_keys = false;
  if (find_flag == HA_READ_KEY_EXACT &&
      my_count_bits(keypart_map) == kd.get_key_parts()) {
    use_all_keys = true;
  }

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan");
    if (thd && thd->killed) {
      rc = HA_ERR_QUERY_INTERRUPTED;
      break;
    }
    /*
      This will open the iterator and position it at a record that's equal or
      greater than the lookup tuple.
    */
    setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);

    /*
      Once we are positioned on from above, move to the position we really
      want: See storage/rocksdb/rocksdb-range-access.txt
    */
    bool move_forward;
    rc =
        position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
                                slice, &move_forward, tx->m_snapshot_timestamp);

    if (rc) {
      break;
    }

    m_skip_scan_it_next_call = false;

    /*
      Now get the data for the row into 'buf'. If we were using a primary key
      then we have all the rows we need. For a secondary key we now need to
      lookup the primary key.
    */
    if (active_index == table->s->primary_key) {
      rc = read_row_from_primary_key(buf);
    } else {
      rc = read_row_from_secondary_key(buf, kd, move_forward);
    }

    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* Exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (rc) {
    /*
      This status is returned on any error
      the only possible error condition is record-not-found
    */
    table->status = STATUS_NOT_FOUND;
  } else {
    table->status = 0;
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_first++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
8676
8677 /*
8678 @brief
8679 Scan the secondary index until we find an index record that satisfies ICP
8680
8681 @param move_forward TRUE <=> move m_scan_it forward
8682 FALSE <=> move m_scan_it backward
8683 @param buf Record buffer (must be the same buffer that
8684 pushed index condition points to, in practice
8685 it is table->record[0])
8686
8687 @detail
8688 Move the current iterator m_scan_it until we get an index tuple that
8689 satisfies the pushed Index Condition.
8690 (if there is no pushed index condition, return right away)
8691
8692 @return
8693 0 - Index tuple satisfies ICP, can do index read.
8694 other - error code
8695 */
8696
int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward,
                                            uchar *const buf) {
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];
    THD *thd = ha_thd();

    while (1) {
      int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        return rc;
      }

      if (thd && thd->killed) {
        return HA_ERR_QUERY_INTERRUPTED;
      }

      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      // Stop if we have walked past the end of this index.
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      // Stop if the record no longer matches the saved point-lookup prefix
      // (set up by index_read_map_impl for ICP point lookups).
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      // The pushed condition reads columns from 'buf' (table->record[0]),
      // so the index record must be unpacked before evaluating it.
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_converter->get_verify_row_debug_checksums());
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const check_result_t icp_status= handler_index_cond_check(this);
      if (icp_status == CHECK_NEG) {
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      }
      else if (icp_status == CHECK_OUT_OF_RANGE ||
               icp_status == CHECK_ABORTED_BY_USER) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      else /* icp_status == CHECK_POS */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
    }
  }
  return HA_EXIT_SUCCESS;
}
8761
8762 /**
8763 @return
8764 HA_EXIT_SUCCESS OK
8765 other HA_ERR error code (can be SE-specific)
8766 */
8767
8768 /**
8769 @return
8770 HA_ADMIN_OK OK
8771 other HA_ADMIN error code
8772 */
int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(check_opt != nullptr);

  const uint pk = pk_index(table, m_tbl_def);
  String rowkey_copy;
  String sec_key_copy;
  const char *const table_name = table->s->table_name.str;

  // Only when debugging: don't use snapshot when reading
  // Rdb_transaction *tx= get_or_create_tx(table->in_use);
  // tx->snapshot= nullptr;

  // Temporarily force row checksum verification; restored before returning.
  bool save_verify_row_debug_checksums =
      m_converter->get_verify_row_debug_checksums();
  m_converter->set_verify_row_debug_checksums(true);
  /* For each secondary index, check that we can get a PK value from it */
  // NO_LINT_DEBUG
  sql_print_verbose_info("CHECKTABLE %s: Checking table %s", table_name,
                         table_name);
  ha_rows UNINIT_VAR(row_checksums_at_start);  // set/used iff first_index==true
  ha_rows row_checksums = ha_rows(-1);
  bool first_index = true;

  for (uint keyno = 0; keyno < table->s->keys; keyno++) {
    if (keyno != pk) {
      extra(HA_EXTRA_KEYREAD);
      ha_index_init(keyno, true);
      ha_rows rows = 0;
      ha_rows checksums = 0;
      if (first_index) {
        row_checksums_at_start = m_converter->get_row_checksums_checked();
      }
      int res;
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: Checking index %s", table_name,
                             table->key_info[keyno].name.str);
      // Walk every record of this secondary index and cross-check it
      // against the primary key.
      while (1) {
        if (!rows) {
          res = index_first(table->record[0]);
        } else {
          res = index_next(table->record[0]);
        }

        if (res == HA_ERR_END_OF_FILE) break;
        if (res) {
          // error
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
                          table_name, rows, res);
          goto error;
        }
        // Copy out key and rowkey: get_row_by_rowid() below repositions the
        // iterator, invalidating the slices it returns.
        rocksdb::Slice key = m_scan_it->key();
        sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
        rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
                         &my_charset_bin);

        if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
                m_scan_it->value())) {
          checksums++;
        }

        if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
                                    rowkey_copy.length()))) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s: .. row %lld: "
              "failed to fetch row by rowid",
              table_name, rows);
          goto error;
        }

        longlong hidden_pk_id = 0;
        if (has_hidden_pk(table) &&
            read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
          goto error;
        }

        /* Check if we get the same PK value */
        uint packed_size = m_pk_descr->pack_record(
            table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
            false, hidden_pk_id);
        if (packed_size != rowkey_copy.length() ||
            memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
                          table_name, rows);
          goto print_and_error;
        }

        /* Check if we get the same secondary key value */
        packed_size = m_key_descr_arr[keyno]->pack_record(
            table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
            &m_sk_tails, false, hidden_pk_id);
        if (packed_size != sec_key_copy.length() ||
            memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "CHECKTABLE %s: .. row %lld: "
              "secondary index value mismatch",
              table_name, rows);
          goto print_and_error;
        }
        rows++;
        continue;

        // Dump the mismatching rowkey/record/index entry in hex, then fail.
      print_and_error : {
        std::string buf;
        buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: rowkey: %s", table_name, buf.c_str());

        buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: record: %s", table_name, buf.c_str());

        buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: index: %s", table_name, buf.c_str());

        goto error;
      }
      }
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: ... %lld index entries checked "
                             "(%lld had checksums)",
                             table_name, rows, checksums);

      if (first_index) {
        row_checksums =
            m_converter->get_row_checksums_checked() - row_checksums_at_start;
        first_index = false;
      }
      ha_index_end();
    }
  }
  if (row_checksums != ha_rows(-1)) {
    // NO_LINT_DEBUG
    sql_print_verbose_info("CHECKTABLE %s: %lld table records had checksums",
                           table_name, row_checksums);
  }
  extra(HA_EXTRA_NO_KEYREAD);

  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  /*
    TODO(sergiy): we should check also for PK records that are missing in
    the secondary indexes.
    For that, need to walk through the PK and check that every PK record has a
    proper counterpart in each secondary index.
  */
  DBUG_RETURN(HA_ADMIN_OK);
error:
  m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums);
  ha_index_or_rnd_end();
  extra(HA_EXTRA_NO_KEYREAD);

  DBUG_RETURN(HA_ADMIN_CORRUPT);
}
8936
/*
  Debugging help: print a binary string into 'out' as a double-quoted
  literal. Printable ASCII characters (except space) are printed verbatim;
  all other bytes are escaped as a backslash followed by the byte value in
  decimal (0..255).
*/
static void dbug_dump_str(FILE *const out, const char *const str, int len) {
  fprintf(out, "\"");
  for (int i = 0; i < len; i++) {
    // Read the byte as unsigned: with a signed "char", bytes >= 0x80 would
    // compare as negative here and be dumped as negative escape codes
    // (e.g. "\-61" instead of "\195").
    const unsigned char c = static_cast<unsigned char>(str[i]);
    if (c > 32 && c < 127) {
      fprintf(out, "%c", c);
    } else {
      fprintf(out, "\\%d", c);
    }
  }
  fprintf(out, "\"");
}
8948
8949 /*
8950 Debugging help: dump the whole database into a human-readable file.
8951 Usage:
8952 dbug_dump_database(rdb);
8953 */
8954
8955 void dbug_dump_database(rocksdb::DB *const db) {
8956 FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
8957 if (!out) return;
8958
8959 rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
8960 for (it->SeekToFirst(); it->Valid(); it->Next()) {
8961 rocksdb::Slice key = it->key();
8962 rocksdb::Slice val = it->value();
8963 dbug_dump_str(out, key.data(), key.size());
8964 fprintf(out, " -> ");
8965 dbug_dump_str(out, val.data(), val.size());
8966 fprintf(out, "\n");
8967 }
8968
8969 delete it;
8970 fclose(out);
8971 }
8972
/*
  Read a row by key inside the given transaction and take a row lock on it.

  @param tx             transaction to read and lock under
  @param column_family  column family the key belongs to
  @param key            packed key to look up
  @param value          out: row value; callers pass nullptr when they only
                        need the lock, not the data
  @return status of the underlying transaction-level get_for_update() call
*/
rocksdb::Status ha_rocksdb::get_for_update(
    Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family,
    const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
  DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
  // RDB_LOCK_READ requests a shared lock; any other lock mode is exclusive.
  const bool exclusive = m_lock_rows != RDB_LOCK_READ;

  // Snapshot validation is only requested for isolation levels above
  // READ COMMITTED.
  const bool do_validate =
      my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED;
  rocksdb::Status s =
      tx->get_for_update(column_family, key, value, exclusive, do_validate);

#ifndef DBUG_OFF
  // Debug-build-only counter of get_for_update calls.
  ++rocksdb_num_get_for_update_calls;
#endif
  return s;
}
8989
8990 bool ha_rocksdb::is_blind_delete_enabled() {
8991 THD *thd = ha_thd();
8992 /*
8993 Note: in MariaDB, thd->lex->table_count is only set for multi-table DELETE,
8994 not for single-table DELETE. So we check thd->lex->query_tables instead.
8995 */
8996 return (THDVAR(thd, blind_delete_primary_key) &&
8997 thd->lex->sql_command == SQLCOM_DELETE &&
8998 thd->lex->query_tables && !thd->lex->query_tables->next_global &&
8999 table->s->keys == 1 &&
9000 !has_hidden_pk(table) && !thd->rgi_slave);
9001 }
9002
9003 /*
9004 Given a rowid (i.e. packed PK) as a parameter, get the record.
9005
9006 @return
9007 HA_EXIT_SUCCESS OK
9008 other HA_ERR error code (can be SE-specific)
9009 */
9010
int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size, const bool skip_lookup,
                                 const bool skip_ttl_check) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table != nullptr);

  int rc;

  // The rowid is the packed primary key.
  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

#ifdef ENABLED_DEBUG_SYNC
  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] =
        "now signal Reached "
        "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);
#endif /* ENABLED_DEBUG_SYNC */

  bool found;
  rocksdb::Status s;

  /* Pretend row found without looking up (blind-delete optimization). */
  if (skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_deleted_blind++;
#endif
    update_row_stats(ROWS_DELETED_BLIND);
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    table->status = 0;
    DBUG_RETURN(0);
  }

  if (m_lock_rows == RDB_LOCK_NONE) {
    // Non-locking read: read from the transaction's snapshot.
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else if (m_insert_with_update && m_dup_pk_found) {
    // INSERT ... ON DUPLICATE KEY UPDATE: the failed insert has already
    // read and locked this PK row; reuse the cached result.
    DBUG_ASSERT(m_pk_descr->get_keyno() == m_dupp_errkey);
    DBUG_ASSERT(m_dup_pk_retrieved_record.length() ==
                m_retrieved_record.size());
    DBUG_ASSERT(memcmp(m_dup_pk_retrieved_record.ptr(),
                       m_retrieved_record.data(),
                       m_retrieved_record.size()) == 0);

    // do nothing - we already have the result in m_retrieved_record and
    // already taken the lock
    s = rocksdb::Status::OK();
  } else {
    // Locking read.
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
                  dbug_change_status_to_corrupted(&s););

  // Any status other than OK / NotFound is a hard error.
  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    /* If we found the record, but it's expired, pretend we didn't find it. */
    if (!skip_ttl_check && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            tx->m_snapshot_timestamp)) {
      DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
    }

    // Remember the key and unpack the value into MySQL row format.
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);

    if (!rc) {
      table->status = 0;
    }
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
9104
9105 /**
9106 @return
9107 HA_EXIT_SUCCESS OK
9108 other HA_ERR error code (can be SE-specific)
9109 */
9110 int ha_rocksdb::index_next(uchar *const buf) {
9111 DBUG_ENTER_FUNC();
9112
9113 bool moves_forward = true;
9114 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9115 moves_forward = false;
9116 }
9117
9118 int rc = index_next_with_direction(buf, moves_forward);
9119 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9120
9121 DBUG_RETURN(rc);
9122 }
9123
9124 /**
9125 @return
9126 HA_EXIT_SUCCESS OK
9127 other HA_ERR error code (can be SE-specific)
9128 */
9129 int ha_rocksdb::index_prev(uchar *const buf) {
9130 DBUG_ENTER_FUNC();
9131
9132 bool moves_forward = false;
9133 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
9134 moves_forward = true;
9135 }
9136
9137 int rc = index_next_with_direction(buf, moves_forward);
9138 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9139
9140 DBUG_RETURN(rc);
9141 }
9142
/*
  Advance the scan iterator one step in the given physical direction and
  read the resulting row.

  @param buf           out: the row, in table->record[0] format
  @param move_forward  physical iterator direction (index_next/index_prev
                       have already translated logical direction for
                       reverse column families)
  @return HA_EXIT_SUCCESS, HA_ERR_QUERY_INTERRUPTED, or other HA_ERR code
*/
int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  if (active_index == pk_index(table, m_tbl_def)) {
    // PK scan reads full records; delegate to the rnd_* scan path.
    rc = rnd_next_with_direction(buf, move_forward);
  } else {
    THD *thd = ha_thd();
    for (;;) {
      DEBUG_SYNC(thd, "rocksdb.check_flags_inwd");
      if (thd && thd->killed) {
        rc = HA_ERR_QUERY_INTERRUPTED;
        break;
      }
      // The first call after a Seek must not advance the iterator:
      // the seek has already positioned it on the row to return.
      if (m_skip_scan_it_next_call) {
        m_skip_scan_it_next_call = false;
      } else {
        if (move_forward) {
          m_scan_it->Next(); /* this call cannot fail */
        } else {
          m_scan_it->Prev();
        }
      }
      // Step over TTL-expired entries before attempting to read.
      rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index],
                                        m_scan_it, !move_forward);
      if (rc != HA_EXIT_SUCCESS) {
        break;
      }
      rc = find_icp_matching_index_rec(move_forward, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
      // Retry the loop if the record was invalidated under us;
      // otherwise we are done (success or error).
      if (!should_skip_invalidated_record(rc)) {
        break;
      }
    }
  }

  DBUG_RETURN(rc);
}
9182
9183 /**
9184 @return
9185 HA_EXIT_SUCCESS OK
9186 other HA_ERR error code (can be SE-specific)
9187 */
9188 int ha_rocksdb::index_first(uchar *const buf) {
9189 DBUG_ENTER_FUNC();
9190
9191 m_sk_match_prefix = nullptr;
9192 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9193 ? index_last_intern(buf)
9194 : index_first_intern(buf);
9195 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9196
9197 DBUG_RETURN(rc);
9198 }
9199
9200 /**
9201 @return
9202 HA_EXIT_SUCCESS OK
9203 other HA_ERR error code (can be SE-specific)
9204 */
9205 int ha_rocksdb::index_last(uchar *const buf) {
9206 DBUG_ENTER_FUNC();
9207
9208 m_sk_match_prefix = nullptr;
9209 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
9210 ? index_first_intern(buf)
9211 : index_last_intern(buf);
9212 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
9213
9214 DBUG_RETURN(rc);
9215 }
9216
9217 /*
9218 Start scanning from the "first" value.
9219
9220 The 'first' here means "the first from start of the key space".
9221 For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
9222
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose the index we are at has number n)
9225
9226 (n-1) - ...
9227 ( n ) <--- 1. (n) doesn't exist in the db but it would be here.
9228 ( n ) - aaa <--- 2. Seek("n") will put us here on the first index
9229 ( n ) - bbb record.
9230 ( n ) - cc
9231
9232 So, need to do: Seek(n);
9233
9234 A backward-ordered keyspace:
9235
9236 (n+1) - bbb
9237 (n+1) - aaa
9238 (n+1) <--- (n+1) doesn't exist in the db but would be here.
9239 ( n ) - ccc <--- 1. We need to be here.
9240 ( n ) - bbb
9241 ( n ) - aaa
9242 ( n )
9243
9244 So, need to: Seek(n+1);
9245
9246 */
9247
int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer matching the kind of the active index.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // The seek target is the index-number prefix; see the comment above this
  // function for why seeking to it lands on the first index record.
  int key_start_matching_bytes = kd.get_first_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
    m_scan_it->Seek(index_key);
    // The Seek already positioned the iterator on the first candidate;
    // tell index_next_with_direction not to advance past it.
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9302
9303 /**
9304 @details
9305 Start scanning from the "last" value
9306
9307 The 'last' here means "the last from start of the key space".
9308 For reverse-ordered key spaces, we will actually read the smallest value.
9309
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose we are at a key that has number n)
9312
9313 (n-1)-something
9314 ( n )-aaa
9315 ( n )-bbb
9316 ( n )-ccc <----------- Need to seek to here.
9317 (n+1) <---- Doesn't exist, but would be here.
9318 (n+1)-smth, or no value at all
9319
9320 RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's
9321 smaller". We can't seek to "(n)-ccc" directly, because we don't know what
9322 is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
9323 to "(n+1)", which is the least possible value that's greater than any value
9324 in index #n.
9325
9326 So, need to: it->SeekForPrev(n+1)
9327
9328 A backward-ordered keyspace:
9329
9330 (n+1)-something
9331 ( n ) - ccc
9332 ( n ) - bbb
9333 ( n ) - aaa <---------------- (*) Need to seek here.
9334 ( n ) <--- Doesn't exist, but would be here.
9335 (n-1)-smth, or no value at all
9336
9337 So, need to: it->SeekForPrev(n)
9338 */
9339
int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer matching the kind of the active index.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  // The seek target is the successor of the index prefix; see the comment
  // above this function for why SeekForPrev on it lands on the last record.
  int key_end_matching_bytes = kd.get_last_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_end_matching_bytes);
    m_scan_it->SeekForPrev(index_key);
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      // PK: the iterator is already on the record to return.
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      // Secondary index: apply ICP filtering, then read the row.
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc) rc = secondary_index_read(active_index, buf);
    }

    if (!should_recreate_snapshot(rc, is_new_snapshot)) {
      break; /* exit the loop */
    }

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
9401
9402 void ha_rocksdb::unlock_row() {
9403 DBUG_ENTER_FUNC();
9404
9405 if (m_lock_rows != RDB_LOCK_NONE) {
9406 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9407 tx->release_lock(m_pk_descr->get_cf(),
9408 std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
9409 }
9410
9411 DBUG_VOID_RETURN;
9412 }
9413
9414 /*
9415 Returning true if SingleDelete can be used.
9416 - Secondary Indexes can always use SingleDelete.
9417 - If the index is PRIMARY KEY, and if all of the columns of the table
9418 are covered by the PRIMARY KEY, SingleDelete can be used.
9419 */
9420 bool ha_rocksdb::can_use_single_delete(const uint index) const {
9421 return (index != pk_index(table, m_tbl_def) ||
9422 (!has_hidden_pk(table) &&
9423 table->key_info[index].ext_key_parts == table->s->fields));
9424 }
9425
bool ha_rocksdb::skip_unique_check() const {
  /*
    We want to skip unique checks if:
    1) bulk_load is on
    2) this table is in the whitelist of tables to skip and the replication
       lag has reached a large enough value (see unique_check_lag_threshold
       and unique_check_lag_reset_threshold)
    3) the user set unique_checks option to 0, and the table does not have
       any secondary indexes. If the table has secondary keys, then those
       might become inconsistent/corrupted
    4) We're using read-free replication
  */
  return THDVAR(table->in_use, bulk_load) ||
         (m_force_skip_unique_check && m_skip_unique_check) ||
         (my_core::thd_test_options(table->in_use,
                                    OPTION_RELAXED_UNIQUE_CHECKS) &&
          m_tbl_def->m_key_count == 1) ||
#ifdef MARIAROCKS_NOT_YET
         use_read_free_rpl();
#else
         // Read-free replication is not available in MariaDB yet.
         FALSE;
#endif
}
9449
#ifdef MARIAROCKS_NOT_YET // MDEV-10975
// Setter for m_force_skip_unique_check (see skip_unique_check()).
// Compiled out in MariaDB for now.
void ha_rocksdb::set_force_skip_unique_check(bool skip) {
  DBUG_ENTER_FUNC();

  m_force_skip_unique_check = skip;

  DBUG_VOID_RETURN;
}
#endif
9459
9460 bool ha_rocksdb::commit_in_the_middle() {
9461 return THDVAR(table->in_use, bulk_load) ||
9462 THDVAR(table->in_use, commit_in_the_middle);
9463 }
9464
9465 /*
9466 Executing bulk commit if it should.
9467 @retval true if bulk commit failed
9468 @retval false if bulk commit was skipped or succeeded
9469 */
9470 bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
9471 return commit_in_the_middle() &&
9472 tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
9473 tx->flush_batch();
9474 }
9475
9476 /*
9477 If table was created without primary key, SQL layer represents the primary
9478 key number as MAX_INDEXES. Hence, this function returns true if the table
9479 does not contain a primary key. (In which case we generate a hidden
9480 'auto-incremented' pk.)
9481 */
// True when the table has no user-defined primary key (so MyRocks uses a
// hidden auto-incremented PK). Thin wrapper over Rdb_key_def.
bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
  return Rdb_key_def::table_has_hidden_pk(table);
}
9485
9486 /*
9487 Returns true if given index number is a hidden_pk.
9488 - This is used when a table is created with no primary key.
9489 */
9490 bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
9491 const Rdb_tbl_def *const tbl_def_arg) {
9492 DBUG_ASSERT(table_arg->s != nullptr);
9493
9494 return (table_arg->s->primary_key == MAX_INDEXES &&
9495 index == tbl_def_arg->m_key_count - 1);
9496 }
9497
9498 /* Returns index of primary key */
9499 uint ha_rocksdb::pk_index(const TABLE *const table_arg,
9500 const Rdb_tbl_def *const tbl_def_arg) {
9501 DBUG_ASSERT(table_arg->s != nullptr);
9502
9503 return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
9504 : table_arg->s->primary_key;
9505 }
9506
9507 /* Returns true if given index number is a primary key */
9508 bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
9509 const Rdb_tbl_def *const tbl_def_arg) {
9510 DBUG_ASSERT(table_arg->s != nullptr);
9511
9512 return index == table_arg->s->primary_key ||
9513 is_hidden_pk(index, table_arg, tbl_def_arg);
9514 }
9515
9516 uint ha_rocksdb::max_supported_key_part_length() const {
9517 DBUG_ENTER_FUNC();
9518 DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE
9519 : MAX_INDEX_COL_LEN_SMALL);
9520 }
9521
9522 const char *ha_rocksdb::get_key_name(const uint index,
9523 const TABLE *const table_arg,
9524 const Rdb_tbl_def *const tbl_def_arg) {
9525 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9526 return HIDDEN_PK_NAME;
9527 }
9528
9529 DBUG_ASSERT(table_arg->key_info != nullptr);
9530 DBUG_ASSERT(table_arg->key_info[index].name.str != nullptr);
9531
9532 return table_arg->key_info[index].name.str;
9533 }
9534
9535 const char *ha_rocksdb::get_key_comment(const uint index,
9536 const TABLE *const table_arg,
9537 const Rdb_tbl_def *const tbl_def_arg) {
9538 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9539 return nullptr;
9540 }
9541
9542 DBUG_ASSERT(table_arg->key_info != nullptr);
9543
9544 return table_arg->key_info[index].comment.str;
9545 }
9546
/*
  Derive the column family name for the given index from its COMMENT
  string. An empty return value means "use the default column family".

  @param index                 index number
  @param table_arg             table the index belongs to
  @param tbl_def_arg           MyRocks table definition
  @param per_part_match_found  out: true if a per-partition CF qualifier
                               matched for this partition
*/
const std::string ha_rocksdb::generate_cf_name(
    const uint index, const TABLE *const table_arg,
    const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found) {
  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);
  DBUG_ASSERT(per_part_match_found != nullptr);

  // When creating CF-s the caller needs to know if there was a custom CF name
  // specified for a given partition.
  *per_part_match_found = false;

  // Index comment is used to define the column family name specification(s).
  // If there was no comment, we get an empty string, and it means "use the
  // default column family".
  const char *const comment = get_key_comment(index, table_arg, tbl_def_arg);

  // `get_key_comment` can return `nullptr` (hidden PK), that's why this.
  std::string key_comment = comment ? comment : "";

  std::string cf_name = Rdb_key_def::parse_comment_for_qualifier(
      key_comment, table_arg, tbl_def_arg, per_part_match_found,
      RDB_CF_NAME_QUALIFIER);

  if (IF_PARTITIONING(table_arg->part_info,nullptr) != nullptr && !*per_part_match_found) {
    // At this point we tried to search for a custom CF name for a partition,
    // but none was specified. Therefore the default one will be used.
    return "";
  }

  // If we didn't find any partitioned/non-partitioned qualifiers, return the
  // comment itself. NOTE: this currently handles returning the cf name
  // specified in the index comment in the case of no partitions, which doesn't
  // use any qualifiers at the moment. (aka it's a special case)
  if (cf_name.empty() && !key_comment.empty()) {
    return key_comment;
  }

  return cf_name;
}
9586
/* Returns the table-level COMMENT string from the table share. */
const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) {
  DBUG_ASSERT(table_arg->s != nullptr);

  return table_arg->s->comment.str;
}
9592
9593 /**
9594 Write a new row
9595
9596 @param[in] buf new row data to write
9597 @return
9598 HA_EXIT_SUCCESS OK
9599 other HA_ERR error code (can be SE-specific)
9600 */
int ha_rocksdb::write_row(const uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(buf == table->record[0]);
  DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);

  /*
    Allocate an auto-increment value if the table has an auto-inc column.
    Note: "buf == table->record[0]" is copied from innodb. I am not aware of
    any use cases where this condition is not true.
  */
  if (table->next_number_field && buf == table->record[0]) {
    int err;
    if ((err = update_auto_increment())) {
      DBUG_RETURN(err);
    }
  }

  // clear cache at beginning of write for INSERT ON DUPLICATE
  // we may get multiple write->fail->read->update if there are multiple
  // values from INSERT
  m_dup_pk_found = false;

  const int rv = update_write_row(nullptr, buf, skip_unique_check());

  if (rv == 0) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_inserted++;
#endif
    update_row_stats(ROWS_INSERTED);
  }

  DBUG_RETURN(rv);
}
9635
9636 /**
9637 Constructing m_last_rowkey (MyRocks key expression) from
9638 before_update|delete image (MySQL row expression).
9639 m_last_rowkey is normally set during lookup phase, such as
9640 rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
9641 these read functions are skipped and update_rows(), delete_rows() are
9642 called without setting m_last_rowkey. This function sets m_last_rowkey
9643 for Read Free Replication.
9644 */
void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
#ifdef MARIAROCKS_NOT_YET
  // Only needed under Read Free Replication (see the comment above); the
  // whole body is compiled out in MariaDB for now.
  if (old_data && use_read_free_rpl()) {
    // Pack the PK from the before-image and remember it as m_last_rowkey.
    const int old_pk_size = m_pk_descr->pack_record(
        table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
                       &my_charset_bin);
  }
#endif
}
9655
9656 /**
9657 Collect update data for primary key
9658
9659 @param[in, out] row_info hold all data for update row, such as
9660 new row data/old row data
9661 @return
9662 HA_EXIT_SUCCESS OK
9663 other HA_ERR error code (can be SE-specific)
9664 */
int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
  int size;

  /*
    Get new row key for any insert, and any update where the pk is not hidden.
    Row key for updates with hidden pk is handled below.
  */
  if (!has_hidden_pk(table)) {
    row_info->hidden_pk_id = 0;

    row_info->new_pk_unpack_info = &m_pk_unpack_info;

    // Pack the PK from the new row image into m_pk_packed_tuple.
    size = m_pk_descr->pack_record(
        table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple,
        row_info->new_pk_unpack_info, false, 0, 0, nullptr);
  } else if (row_info->old_data == nullptr) {
    // INSERT with hidden PK: allocate the next hidden PK value.
    row_info->hidden_pk_id = update_hidden_pk_val();
    size =
        m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
  } else {
    /*
      If hidden primary key, rowkey for new record will always be the same as
      before
    */
    size = row_info->old_pk_slice.size();
    memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
    int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
    if (err) {
      return err;
    }
  }

  // Publish the packed new-PK as a slice over m_pk_packed_tuple.
  row_info->new_pk_slice =
      rocksdb::Slice((const char *)m_pk_packed_tuple, size);

  return HA_EXIT_SUCCESS;
}
9702
9703 /**
9704 Check the specified primary key value is unique and also lock the row
9705
9706 @param[in] key_id key index
9707 @param[in] row_info hold all data for update row, such as old row
9708 data and new row data
9709 @param[out] found whether the primary key exists before.
9710 @param[out] pk_changed whether primary key is changed
9711 @return
9712 HA_EXIT_SUCCESS OK
9713 other HA_ERR error code (can be SE-specific)
9714 */
int ha_rocksdb::check_and_lock_unique_pk(const uint key_id,
                                         const struct update_row_info &row_info,
                                         bool *const found) {
  DBUG_ASSERT(found != nullptr);

  // Callers only reach here when the PK value is new (insert, or update
  // that changed the PK).
  DBUG_ASSERT(row_info.old_pk_slice.size() == 0 ||
              row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0);

  /* Ignore PK violations if this is a optimized 'replace into' */
#ifdef MARIAROCKS_NOT_YET
  const bool ignore_pk_unique_check = ha_thd()->lex->blind_replace_into;
#else
  const bool ignore_pk_unique_check= false;
#endif

  /*
    Perform a read to determine if a duplicate entry exists. For primary
    keys, a point lookup will be sufficient.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  /*
    To prevent race conditions like below, it is necessary to
    take a lock for a target row. get_for_update() holds a gap lock if
    target key does not exist, so below conditions should never
    happen.

    1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
       -> T2 Put(overwrite) -> T2 commit
    2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
       -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
  */
  const rocksdb::Status s =
      get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
                     ignore_pk_unique_check ? nullptr : &m_retrieved_record);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(
        table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
  }

  bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound();

  /*
    If the pk key has ttl, we may need to pretend the row wasn't
    found if it is already expired.
  */
  if (key_found && m_pk_descr->has_ttl() &&
      should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                          (row_info.tx->m_snapshot_timestamp
                               ? row_info.tx->m_snapshot_timestamp
                               : static_cast<int64_t>(std::time(nullptr))))) {
    key_found = false;
  }

  if (key_found && row_info.old_data == nullptr && m_insert_with_update) {
    // In INSERT ON DUPLICATE KEY UPDATE ... case, if the insert failed
    // due to a duplicate key, remember the last key and skip the check
    // next time
    m_dup_pk_found = true;

#ifndef DBUG_OFF
    // save it for sanity checking later
    m_dup_pk_retrieved_record.copy(m_retrieved_record.data(),
                                   m_retrieved_record.size(), &my_charset_bin);
#endif
  }

  *found = key_found;

  return HA_EXIT_SUCCESS;
}
9788
9789 /**
9790 Check the specified secondary key value is unique and also lock the row
9791
9792 @param[in] key_id key index
9793 @param[in] row_info hold all data for update row, such as old row
9794 data and new row data
9795 @param[out] found whether specified key value exists before.
9796 @return
9797 HA_EXIT_SUCCESS OK
9798 other HA_ERR error code (can be SE-specific)
9799 */
int ha_rocksdb::check_and_lock_sk(const uint key_id,
                                  const struct update_row_info &row_info,
                                  bool *const found) {
  DBUG_ASSERT(found != nullptr);
  *found = false;

  /*
    Can skip checking this key if none of the key fields have changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
    return HA_EXIT_SUCCESS;
  }

  KEY *key_info = nullptr;
  uint n_null_fields = 0;
  uint user_defined_key_parts = 1;

  key_info = &table->key_info[key_id];
  user_defined_key_parts = key_info->user_defined_key_parts;
  /*
    If there are no uniqueness requirements, there's no need to obtain a
    lock for this key.
  */
  if (!(key_info->flags & HA_NOSAME)) {
    return HA_EXIT_SUCCESS;
  }

  const Rdb_key_def &kd = *m_key_descr_arr[key_id];

  /*
    Calculate the new key for obtaining the lock

    For unique secondary indexes, the key used for locking does not
    include the extended fields.
  */
  int size =
      kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
                     nullptr, false, 0, user_defined_key_parts, &n_null_fields);
  if (n_null_fields > 0) {
    /*
      If any fields are marked as NULL this will never match another row as
      to NULL never matches anything else including another NULL.
    */
    return HA_EXIT_SUCCESS;
  }

  const rocksdb::Slice new_slice =
      rocksdb::Slice((const char *)m_sk_packed_tuple, size);

  /*
    Acquire lock on the old key in case of UPDATE
  */
  if (row_info.old_data != nullptr) {
    size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
                          m_sk_packed_tuple_old, nullptr, false, 0,
                          user_defined_key_parts);
    const rocksdb::Slice old_slice =
        rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);

    // Lock only: value pointer is nullptr.
    const rocksdb::Status s =
        get_for_update(row_info.tx, kd.get_cf(), old_slice, nullptr);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                           m_table_handler);
    }

    /*
      If the old and new keys are the same we're done since we've already taken
      the lock on the old key
    */
    if (!new_slice.compare(old_slice)) {
      return HA_EXIT_SUCCESS;
    }
  }

  /*
    Perform a read to determine if a duplicate entry exists - since this is
    a secondary indexes a range scan is needed.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());

  /*
    This iterator seems expensive since we need to allocate and free
    memory for each unique index.

    If this needs to be optimized, for keys without NULL fields, the
    extended primary key fields can be migrated to the value portion of the
    key. This enables using Get() instead of Seek() as in the primary key
    case.

    The bloom filter may need to be disabled for this lookup.
  */
  uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  rocksdb::Slice lower_bound_slice;
  rocksdb::Slice upper_bound_slice;

  // Use total-order seek when the bloom filter cannot be used for this key.
  const bool total_order_seek = !check_bloom_and_set_bounds(
      ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE,
      lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
  const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);

  // Lock the new key (value pointer nullptr: lock only, no read).
  const rocksdb::Status s =
      get_for_update(row_info.tx, kd.get_cf(), new_slice, nullptr);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                         m_table_handler);
  }

  rocksdb::Iterator *const iter = row_info.tx->get_iterator(
      kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice,
      upper_bound_slice, true /* read current data */,
      false /* acquire snapshot */);
  /*
    Need to scan the transaction to see if there is a duplicate key.
    Also need to scan RocksDB and verify the key has not been deleted
    in the transaction.
  */
  iter->Seek(new_slice);
  *found = !read_key_exact(kd, iter, all_parts_used, new_slice,
                           row_info.tx->m_snapshot_timestamp);
  delete iter;

  return HA_EXIT_SUCCESS;
}
9929
9930 /**
  Enumerate all keys to check their uniqueness and also lock them
9932
9933 @param[in] row_info hold all data for update row, such as old row
9934 data and new row data
9935 @param[out] pk_changed whether primary key is changed
9936 @return
9937 HA_EXIT_SUCCESS OK
9938 other HA_ERR error code (can be SE-specific)
9939 */
int ha_rocksdb::check_uniqueness_and_lock(
    const struct update_row_info &row_info, bool pk_changed) {
  /*
    Go through each index and determine if the index has uniqueness
    requirements. If it does, then try to obtain a row lock on the new values.
    Once all locks have been obtained, then perform the changes needed to
    update/insert the row.
  */
  for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
    bool found;
    int rc;

    if (is_pk(key_id, table, m_tbl_def)) {
      if (row_info.old_pk_slice.size() > 0 && !pk_changed) {
        // UPDATE that keeps the same PK value: nothing new to check.
        found = false;
        rc = HA_EXIT_SUCCESS;
      } else {
        rc = check_and_lock_unique_pk(key_id, row_info, &found);
      }
    } else {
      rc = check_and_lock_sk(key_id, row_info, &found);
    }

    if (rc != HA_EXIT_SUCCESS) {
      return rc;
    }

    if (found) {
      /* There is a row with this key already, so error out. */
      errkey = key_id;
      m_dupp_errkey = errkey;

      return HA_ERR_FOUND_DUPP_KEY;
    }
  }

  return HA_EXIT_SUCCESS;
}
9978
9979 /**
9980 Check whether secondary key value is duplicate or not
9981
9982 @param[in] table_arg the table currently working on
9983 @param[in key_def the key_def is being checked
9984 @param[in] key secondary key storage data
9985 @param[out] sk_info hold secondary key memcmp datas(new/old)
9986 @return
9987 HA_EXIT_SUCCESS OK
9988 other HA_ERR error code (can be SE-specific)
9989 */
9990
9991 int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
9992 const Rdb_key_def &key_def,
9993 const rocksdb::Slice *key,
9994 struct unique_sk_buf_info *sk_info) {
9995 uint n_null_fields = 0;
9996 const rocksdb::Comparator *index_comp = key_def.get_cf()->GetComparator();
9997
9998 /* Get proper SK buffer. */
9999 uchar *sk_buf = sk_info->swap_and_get_sk_buf();
10000
10001 /* Get memcmp form of sk without extended pk tail */
10002 uint sk_memcmp_size =
10003 key_def.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
10004
10005 sk_info->sk_memcmp_key =
10006 rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
10007
10008 if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
10009 index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
10010 0) {
10011 return 1;
10012 }
10013
10014 sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
10015 return 0;
10016 }
10017
10018 int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
10019 const rocksdb::Slice &key,
10020 const rocksdb::Slice &value, bool sort) {
10021 DBUG_ENTER_FUNC();
10022 int res;
10023 THD *thd = ha_thd();
10024 if (thd && thd->killed) {
10025 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10026 }
10027
10028 rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
10029
10030 // In the case of unsorted inserts, m_sst_info allocated here is not
10031 // used to store the keys. It is still used to indicate when tables
10032 // are switched.
10033 if (m_sst_info == nullptr || m_sst_info->is_done()) {
10034 m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
10035 kd.get_name(), cf, *rocksdb_db_options,
10036 THDVAR(ha_thd(), trace_sst_api)));
10037 res = tx->start_bulk_load(this, m_sst_info);
10038 if (res != HA_EXIT_SUCCESS) {
10039 DBUG_RETURN(res);
10040 }
10041 }
10042 DBUG_ASSERT(m_sst_info);
10043
10044 if (sort) {
10045 Rdb_index_merge *key_merge;
10046 DBUG_ASSERT(cf != nullptr);
10047
10048 res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
10049 if (res == HA_EXIT_SUCCESS) {
10050 res = key_merge->add(key, value);
10051 }
10052 } else {
10053 res = m_sst_info->put(key, value);
10054 }
10055
10056 DBUG_RETURN(res);
10057 }
10058
10059 int ha_rocksdb::finalize_bulk_load(bool print_client_error) {
10060 DBUG_ENTER_FUNC();
10061
10062 int res = HA_EXIT_SUCCESS;
10063
10064 /* Skip if there are no possible ongoing bulk loads */
10065 if (m_sst_info) {
10066 if (m_sst_info->is_done()) {
10067 m_sst_info.reset();
10068 DBUG_RETURN(res);
10069 }
10070
10071 Rdb_sst_info::Rdb_sst_commit_info commit_info;
10072
10073 // Wrap up the current work in m_sst_info and get ready to commit
10074 // This transfer the responsibility of commit over to commit_info
10075 res = m_sst_info->finish(&commit_info, print_client_error);
10076 if (res == 0) {
10077 // Make sure we have work to do - under race condition we could lose
10078 // to another thread and end up with no work
10079 if (commit_info.has_work()) {
10080 rocksdb::IngestExternalFileOptions opts;
10081 opts.move_files = true;
10082 opts.snapshot_consistency = false;
10083 opts.allow_global_seqno = false;
10084 opts.allow_blocking_flush = false;
10085
10086 const rocksdb::Status s = rdb->IngestExternalFile(
10087 commit_info.get_cf(), commit_info.get_committed_files(), opts);
10088 if (!s.ok()) {
10089 if (print_client_error) {
10090 Rdb_sst_info::report_error_msg(s, nullptr);
10091 }
10092 res = HA_ERR_ROCKSDB_BULK_LOAD;
10093 } else {
10094 // Mark the list of SST files as committed, otherwise they'll get
10095 // cleaned up when commit_info destructs
10096 commit_info.commit();
10097 }
10098 }
10099 }
10100 m_sst_info.reset();
10101 }
10102 DBUG_RETURN(res);
10103 }
10104
10105 /**
10106 Update an existing primary key record or write a new primary key record
10107
10108 @param[in] kd the primary key is being update/write
10109 @param[in] update_row_info hold all row data, such as old row data and
10110 new row data
10111 @param[in] pk_changed whether primary key is changed
10112 @return
10113 HA_EXIT_SUCCESS OK
10114 Other HA_ERR error code (can be SE-specific)
10115 */
10116 int ha_rocksdb::update_write_pk(const Rdb_key_def &kd,
10117 const struct update_row_info &row_info,
10118 bool pk_changed) {
10119 uint key_id = kd.get_keyno();
10120 bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
10121 ulonglong bytes_written = 0;
10122
10123 /*
10124 If the PK has changed, or if this PK uses single deletes and this is an
10125 update, the old key needs to be deleted. In the single delete case, it
10126 might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
10127 resulting in the first PUT(X) showing up.
10128 */
10129 if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
10130 can_use_single_delete(key_id)))) {
10131 const rocksdb::Status s = delete_or_singledelete(
10132 key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
10133 if (!s.ok()) {
10134 return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
10135 m_table_handler);
10136 } else {
10137 bytes_written = row_info.old_pk_slice.size();
10138 }
10139 }
10140
10141 if (table->found_next_number_field) {
10142 update_auto_incr_val_from_field();
10143 }
10144
10145 int rc = HA_EXIT_SUCCESS;
10146 rocksdb::Slice value_slice;
10147 /* Prepare the new record to be written into RocksDB */
10148 if ((rc = m_converter->encode_value_slice(
10149 m_pk_descr, row_info.new_pk_slice, row_info.new_pk_unpack_info,
10150 !row_info.old_pk_slice.empty(), should_store_row_debug_checksums(),
10151 m_ttl_bytes, &m_ttl_bytes_updated, &value_slice))) {
10152 return rc;
10153 }
10154
10155 const auto cf = m_pk_descr->get_cf();
10156 if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
10157 !hidden_pk) {
10158 /*
10159 Write the primary key directly to an SST file using an SstFileWriter
10160 */
10161 rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
10162 THDVAR(table->in_use, bulk_load_allow_unsorted));
10163 } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
10164 /*
10165 It is responsibility of the user to make sure that the data being
10166 inserted doesn't violate any unique keys.
10167 */
10168 row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
10169 value_slice);
10170 } else {
10171 const bool assume_tracked = can_assume_tracked(ha_thd());
10172 const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice,
10173 assume_tracked);
10174 if (!s.ok()) {
10175 if (s.IsBusy()) {
10176 errkey = table->s->primary_key;
10177 m_dupp_errkey = errkey;
10178 rc = HA_ERR_FOUND_DUPP_KEY;
10179 } else {
10180 rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
10181 m_tbl_def, m_table_handler);
10182 }
10183 }
10184 }
10185
10186 if (rc == HA_EXIT_SUCCESS) {
10187 row_info.tx->update_bytes_written(
10188 bytes_written + row_info.new_pk_slice.size() + value_slice.size());
10189 }
10190 return rc;
10191 }
10192
10193 /**
10194 update an existing secondary key record or write a new secondary key record
10195
10196 @param[in] table_arg Table we're working on
10197 @param[in] kd The secondary key being update/write
10198 @param[in] row_info data structure contains old row data and new row data
10199 @param[in] bulk_load_sk whether support bulk load. Currently it is only
10200 support for write
10201 @return
10202 HA_EXIT_SUCCESS OK
10203 Other HA_ERR error code (can be SE-specific)
10204 */
10205 int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
10206 const Rdb_key_def &kd,
10207 const struct update_row_info &row_info,
10208 const bool bulk_load_sk) {
10209 int new_packed_size;
10210 int old_packed_size;
10211 int rc = HA_EXIT_SUCCESS;
10212
10213 rocksdb::Slice new_key_slice;
10214 rocksdb::Slice new_value_slice;
10215 rocksdb::Slice old_key_slice;
10216
10217 const uint key_id = kd.get_keyno();
10218
10219 ulonglong bytes_written = 0;
10220
10221 /*
10222 Can skip updating this key if none of the key fields have changed and, if
10223 this table has TTL, the TTL timestamp has not changed.
10224 */
10225 if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
10226 (!kd.has_ttl() || !m_ttl_bytes_updated)) {
10227 return HA_EXIT_SUCCESS;
10228 }
10229
10230 bool store_row_debug_checksums = should_store_row_debug_checksums();
10231 new_packed_size =
10232 kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
10233 m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
10234 row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);
10235
10236 if (row_info.old_data != nullptr) {
10237 // The old value
10238 old_packed_size = kd.pack_record(
10239 table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
10240 &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
10241 nullptr, m_ttl_bytes);
10242
10243 /*
10244 Check if we are going to write the same value. This can happen when
10245 one does
10246 UPDATE tbl SET col='foo'
10247 and we are looking at the row that already has col='foo'.
10248
10249 We also need to compare the unpack info. Suppose, the collation is
10250 case-insensitive, and unpack info contains information about whether
10251 the letters were uppercase and lowercase. Then, both 'foo' and 'FOO'
10252 will have the same key value, but different data in unpack_info.
10253
10254 (note: anyone changing bytewise_compare should take this code into
10255 account)
10256 */
10257 if (old_packed_size == new_packed_size &&
10258 m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
10259 !(kd.has_ttl() && m_ttl_bytes_updated) &&
10260 memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
10261 0 &&
10262 memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
10263 m_sk_tails.get_current_pos()) == 0) {
10264 return HA_EXIT_SUCCESS;
10265 }
10266
10267 /*
10268 Deleting entries from secondary index should skip locking, but
10269 be visible to the transaction.
10270 (also note that DDL statements do not delete rows, so this is not a DDL
10271 statement)
10272 */
10273 old_key_slice = rocksdb::Slice(
10274 reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
10275
10276 row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10277 old_key_slice);
10278
10279 bytes_written = old_key_slice.size();
10280 }
10281
10282 new_key_slice = rocksdb::Slice(
10283 reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
10284 new_value_slice =
10285 rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
10286 m_sk_tails.get_current_pos());
10287
10288 if (bulk_load_sk && row_info.old_data == nullptr) {
10289 rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
10290 } else {
10291 row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
10292 new_value_slice);
10293 }
10294
10295 row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() +
10296 new_value_slice.size());
10297
10298 return rc;
10299 }
10300
10301 /**
10302 Update existing indexes(PK/SKs) or write new indexes(PK/SKs)
10303
10304 @param[in] row_info hold all row data, such as old key/new key
10305 @param[in] pk_changed whether primary key is changed
10306 @return
10307 HA_EXIT_SUCCESS OK
10308 Other HA_ERR error code (can be SE-specific)
10309 */
10310 int ha_rocksdb::update_write_indexes(const struct update_row_info &row_info,
10311 const bool pk_changed) {
10312 int rc;
10313 bool bulk_load_sk;
10314
10315 // The PK must be updated first to pull out the TTL value.
10316 rc = update_write_pk(*m_pk_descr, row_info, pk_changed);
10317 if (rc != HA_EXIT_SUCCESS) {
10318 return rc;
10319 }
10320
10321 // Update the remaining indexes. Allow bulk loading only if
10322 // allow_sk is enabled
10323 bulk_load_sk = rocksdb_enable_bulk_load_api &&
10324 THDVAR(table->in_use, bulk_load) &&
10325 THDVAR(table->in_use, bulk_load_allow_sk);
10326 for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
10327 if (is_pk(key_id, table, m_tbl_def)) {
10328 continue;
10329 }
10330
10331 rc = update_write_sk(table, *m_key_descr_arr[key_id], row_info,
10332 bulk_load_sk);
10333 if (rc != HA_EXIT_SUCCESS) {
10334 return rc;
10335 }
10336 }
10337
10338 return HA_EXIT_SUCCESS;
10339 }
10340
10341 /**
10342 Update an existing row or write a new row
10343
10344 @param[in] old_data nullptr for write, non-null for update
10345 @param[in] new_data non-null for write/update
10346 @param[in] skip_unique_check whether to check uniqueness
10347 @return
10348 HA_EXIT_SUCCESS OK
10349 Other HA_ERR error code (can be SE-specific)
10350 */
10351 int ha_rocksdb::update_write_row(const uchar *const old_data,
10352 const uchar *const new_data,
10353 const bool skip_unique_check) {
10354 DBUG_ENTER_FUNC();
10355
10356 THD *thd = ha_thd();
10357 if (thd && thd->killed) {
10358 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10359 }
10360
10361 bool pk_changed = false;
10362 struct update_row_info row_info;
10363
10364 row_info.old_data = old_data;
10365 row_info.new_data = new_data;
10366 row_info.skip_unique_check = skip_unique_check;
10367 row_info.new_pk_unpack_info = nullptr;
10368 set_last_rowkey(old_data);
10369
10370 row_info.tx = get_or_create_tx(table->in_use);
10371
10372 if (old_data != nullptr) {
10373 row_info.old_pk_slice =
10374 rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10375
10376 /* Determine which indexes need updating. */
10377 calc_updated_indexes();
10378 }
10379
10380 /*
10381 Get the new row key into row_info.new_pk_slice
10382 */
10383 int rc = get_pk_for_update(&row_info);
10384 if (rc != HA_EXIT_SUCCESS) {
10385 DBUG_RETURN(rc);
10386 }
10387
10388 /*
10389 For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
10390 always require locking.
10391 */
10392 if (row_info.old_pk_slice.size() > 0) {
10393 pk_changed = row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0;
10394 }
10395
10396 if (!skip_unique_check) {
10397 /*
10398 Check to see if we are going to have failures because of unique
10399 keys. Also lock the appropriate key values.
10400 */
10401 rc = check_uniqueness_and_lock(row_info, pk_changed);
10402 if (rc != HA_EXIT_SUCCESS) {
10403 DBUG_RETURN(rc);
10404 }
10405 }
10406
10407 DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");
10408
10409 /*
10410 At this point, all locks have been obtained, and all checks for duplicate
10411 keys have been performed. No further errors can be allowed to occur from
10412 here because updates to the transaction will be made and those updates
10413 cannot be easily removed without rolling back the entire transaction.
10414 */
10415 rc = update_write_indexes(row_info, pk_changed);
10416 if (rc != HA_EXIT_SUCCESS) {
10417 DBUG_RETURN(rc);
10418 }
10419
10420 if (old_data != nullptr) {
10421 row_info.tx->incr_update_count();
10422 } else {
10423 row_info.tx->incr_insert_count();
10424 }
10425
10426 row_info.tx->log_table_write_op(m_tbl_def);
10427
10428 if (do_bulk_commit(row_info.tx)) {
10429 DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10430 }
10431
10432 DBUG_RETURN(HA_EXIT_SUCCESS);
10433 }
10434
10435 /*
10436 Setting iterator upper/lower bounds for Seek/SeekForPrev.
10437 This makes RocksDB to avoid scanning tombstones outside of
10438 the given key ranges, when prefix_same_as_start=true was not passed
10439 (when prefix bloom filter can not be used).
10440 Inversing upper/lower bound is necessary on reverse order CF.
10441 This covers HA_READ_PREFIX_LAST* case as well. For example,
10442 if given query eq condition was 12 bytes and condition was
10443 0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST,
10444 eq_cond_len was 11 (see calc_eq_cond_len() for details).
10445 If the index was reverse order, upper bound would be
10446 0x0000b3eb003f65c5e78857, and lower bound would be
10447 0x0000b3eb003f65c5e78859. These cover given eq condition range.
10448
10449 @param lower_bound_buf IN Buffer for lower bound
10450 @param upper_bound_buf IN Buffer for upper bound
10451
10452 @param outer_u
10453 */
10454 void ha_rocksdb::setup_iterator_bounds(
10455 const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len,
10456 uchar *const lower_bound, uchar *const upper_bound,
10457 rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
10458 // If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be
10459 // able to get better bounds just by using index id directly.
10460 if (eq_cond.size() <= Rdb_key_def::INDEX_NUMBER_SIZE) {
10461 DBUG_ASSERT(bound_len == Rdb_key_def::INDEX_NUMBER_SIZE);
10462 uint size;
10463 kd.get_infimum_key(lower_bound, &size);
10464 DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10465 kd.get_supremum_key(upper_bound, &size);
10466 DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
10467 } else {
10468 DBUG_ASSERT(bound_len <= eq_cond.size());
10469 memcpy(upper_bound, eq_cond.data(), bound_len);
10470 kd.successor(upper_bound, bound_len);
10471 memcpy(lower_bound, eq_cond.data(), bound_len);
10472 kd.predecessor(lower_bound, bound_len);
10473 }
10474
10475 if (kd.m_is_reverse_cf) {
10476 *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10477 *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10478 } else {
10479 *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len);
10480 *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len);
10481 }
10482 }
10483
10484 /*
10485 Open a cursor
10486 */
10487
10488 void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
10489 rocksdb::Slice *const slice,
10490 const bool use_all_keys,
10491 const uint eq_cond_len) {
10492 DBUG_ASSERT(slice->size() >= eq_cond_len);
10493
10494 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10495
10496 bool skip_bloom = true;
10497
10498 const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
10499 // The size of m_scan_it_lower_bound (and upper) is technically
10500 // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers. Rather
10501 // than recalculating that number, we pass in the max of eq_cond_len and
10502 // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
10503 // max_packed_sk_len, hence ensuring no buffer overrun.
10504 //
10505 // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
10506 // used.
10507 if (check_bloom_and_set_bounds(
10508 ha_thd(), kd, eq_cond, use_all_keys,
10509 std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_NUMBER_SIZE),
10510 m_scan_it_lower_bound, m_scan_it_upper_bound,
10511 &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
10512 skip_bloom = false;
10513 }
10514
10515 /*
10516 In some cases, setup_scan_iterator() is called multiple times from
10517 the same query but bloom filter can not always be used.
10518 Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
10519 (id1, id2).
10520 select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
10521 '100');
10522 In this case, setup_scan_iterator() is called twice, the first time is for
10523 (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
10524 '100').
10525 If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
10526 for the
10527 first condition but not for the second condition.
10528 If bloom filter condition is changed, currently it is necessary to destroy
10529 and
10530 re-create Iterator.
10531 */
10532 if (m_scan_it_skips_bloom != skip_bloom) {
10533 release_scan_iterator();
10534 }
10535
10536 /*
10537 SQL layer can call rnd_init() multiple times in a row.
10538 In that case, re-use the iterator, but re-position it at the table start.
10539 */
10540 if (!m_scan_it) {
10541 const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
10542 if (commit_in_the_middle()) {
10543 DBUG_ASSERT(m_scan_it_snapshot == nullptr);
10544 m_scan_it_snapshot = rdb->GetSnapshot();
10545
10546 auto read_opts = rocksdb::ReadOptions();
10547 // TODO(mung): set based on WHERE conditions
10548 read_opts.total_order_seek = true;
10549 read_opts.snapshot = m_scan_it_snapshot;
10550 m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
10551 } else {
10552 m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
10553 m_scan_it_lower_bound_slice,
10554 m_scan_it_upper_bound_slice);
10555 }
10556 m_scan_it_skips_bloom = skip_bloom;
10557 }
10558 }
10559
10560 void ha_rocksdb::release_scan_iterator() {
10561 delete m_scan_it;
10562 m_scan_it = nullptr;
10563
10564 if (m_scan_it_snapshot) {
10565 rdb->ReleaseSnapshot(m_scan_it_snapshot);
10566 m_scan_it_snapshot = nullptr;
10567 }
10568 }
10569
10570 void ha_rocksdb::setup_iterator_for_rnd_scan() {
10571 uint key_size;
10572
10573 int key_start_matching_bytes =
10574 m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
10575
10576 rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
10577
10578 setup_scan_iterator(*m_pk_descr, &table_key, false, key_start_matching_bytes);
10579 m_scan_it->Seek(table_key);
10580 m_skip_scan_it_next_call = true;
10581 }
10582
10583 /**
10584 @return
10585 HA_EXIT_SUCCESS OK
10586 other HA_ERR error code (can be SE-specific)
10587 */
10588 int ha_rocksdb::rnd_init(bool scan) {
10589 DBUG_ENTER_FUNC();
10590
10591 THD *thd = ha_thd();
10592 if (thd && thd->killed) {
10593 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10594 }
10595
10596 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10597
10598 // when this table is being updated, decode all fields
10599 m_converter->setup_field_decoders(table->read_set,
10600 m_lock_rows == RDB_LOCK_WRITE);
10601
10602 if (scan) {
10603 m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
10604 setup_iterator_for_rnd_scan();
10605 } else {
10606 /* We don't need any preparations for rnd_pos() calls. */
10607 }
10608
10609 // If m_lock_rows is on then we will be doing a get_for_update when accessing
10610 // the index, so don't acquire the snapshot right away. Otherwise acquire
10611 // the snapshot immediately.
10612 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10613
10614 DBUG_RETURN(HA_EXIT_SUCCESS);
10615 }
10616
10617 /**
10618 @return
10619 HA_EXIT_SUCCESS OK
10620 other HA_ERR error code (can be SE-specific)
10621 */
10622 int ha_rocksdb::rnd_next(uchar *const buf) {
10623 DBUG_ENTER_FUNC();
10624
10625 int rc;
10626 for (;;) {
10627 rc = rnd_next_with_direction(buf, true);
10628 if (!should_recreate_snapshot(rc, m_rnd_scan_is_new_snapshot)) {
10629 break; /* exit the loop */
10630 }
10631 // release the snapshot and iterator and then regenerate them
10632 Rdb_transaction *tx = get_or_create_tx(table->in_use);
10633 tx->release_snapshot();
10634 release_scan_iterator();
10635 setup_iterator_for_rnd_scan();
10636 }
10637
10638 m_rnd_scan_is_new_snapshot = false;
10639
10640 if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
10641
10642 DBUG_RETURN(rc);
10643 }
10644
10645 /*
10646 See also secondary_index_read().
10647 */
10648 int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
10649 DBUG_ENTER_FUNC();
10650
10651 int rc;
10652 THD *thd = ha_thd();
10653
10654 table->status = STATUS_NOT_FOUND;
10655 #ifdef MARIAROCKS_NOT_YET
10656 stats.rows_requested++;
10657 #endif
10658 if (!m_scan_it || !is_valid(m_scan_it)) {
10659 /*
10660 We can get here when SQL layer has called
10661
10662 h->index_init(PRIMARY);
10663 h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
10664
10665 In this case, we should return EOF.
10666 */
10667 DBUG_RETURN(HA_ERR_END_OF_FILE);
10668 }
10669
10670 for (;;) {
10671 DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd");
10672 if (thd && thd->killed) {
10673 rc = HA_ERR_QUERY_INTERRUPTED;
10674 break;
10675 }
10676
10677 if (m_skip_scan_it_next_call) {
10678 m_skip_scan_it_next_call = false;
10679 } else {
10680 if (move_forward) {
10681 m_scan_it->Next(); /* this call cannot fail */
10682 } else {
10683 m_scan_it->Prev(); /* this call cannot fail */
10684 }
10685 }
10686
10687 if (!is_valid(m_scan_it)) {
10688 rc = HA_ERR_END_OF_FILE;
10689 break;
10690 }
10691
10692 /* check if we're out of this table */
10693 const rocksdb::Slice key = m_scan_it->key();
10694 if (!m_pk_descr->covers_key(key)) {
10695 rc = HA_ERR_END_OF_FILE;
10696 break;
10697 }
10698
10699 if (m_lock_rows != RDB_LOCK_NONE) {
10700 /*
10701 Lock the row we've just read.
10702
10703 Now we call get_for_update which will 1) Take a lock and 2) Will fail
10704 if the row was deleted since the snapshot was taken.
10705 */
10706 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10707 DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
10708
10709 if (m_pk_descr->has_ttl() &&
10710 should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
10711 tx->m_snapshot_timestamp)) {
10712 continue;
10713 }
10714
10715 const rocksdb::Status s =
10716 get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
10717 if (s.IsNotFound() &&
10718 should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) {
10719 continue;
10720 }
10721
10722 if (!s.ok()) {
10723 DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
10724 m_tbl_def, m_table_handler));
10725 }
10726
10727 // If we called get_for_update() use the value from that call not from
10728 // the iterator as it may be stale since we don't have a snapshot
10729 // when m_lock_rows is not RDB_LOCK_NONE.
10730 m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
10731 rc = convert_record_from_storage_format(&key, buf);
10732 } else {
10733 // Use the value from the iterator
10734 rocksdb::Slice value = m_scan_it->value();
10735
10736 if (m_pk_descr->has_ttl() &&
10737 should_hide_ttl_rec(
10738 *m_pk_descr, value,
10739 get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
10740 continue;
10741 }
10742
10743 m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
10744 rc = convert_record_from_storage_format(&key, &value, buf);
10745 }
10746
10747 table->status = 0;
10748 break;
10749 }
10750
10751 if (!rc) {
10752 #ifdef MARIAROCKS_NOT_YET
10753 stats.rows_read++;
10754 stats.rows_index_next++;
10755 #endif
10756 update_row_stats(ROWS_READ);
10757 }
10758
10759 DBUG_RETURN(rc);
10760 }
10761
10762 int ha_rocksdb::rnd_end() {
10763 DBUG_ENTER_FUNC();
10764
10765 release_scan_iterator();
10766
10767 DBUG_RETURN(HA_EXIT_SUCCESS);
10768 }
10769
10770 /**
10771 @return
10772 HA_EXIT_SUCCESS OK
10773 other HA_ERR error code (can be SE-specific)
10774 */
10775 int ha_rocksdb::index_init(uint idx, bool sorted) {
10776 DBUG_ENTER_FUNC();
10777
10778 THD *thd = ha_thd();
10779 if (thd && thd->killed) {
10780 DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
10781 }
10782
10783 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10784 DBUG_ASSERT(tx != nullptr);
10785
10786 m_start_range= NULL;
10787 // when this table is being updated, decode all fields
10788 m_converter->setup_field_decoders(table->read_set,
10789 m_lock_rows == RDB_LOCK_WRITE);
10790
10791 if (!m_keyread_only) {
10792 m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap);
10793 }
10794
10795 // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
10796 // when accessing the index, so don't acquire the snapshot right away.
10797 // Otherwise acquire the snapshot immediately.
10798 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10799
10800 active_index = idx;
10801
10802 DBUG_RETURN(HA_EXIT_SUCCESS);
10803 }
10804
10805 /**
10806 @return
10807 HA_EXIT_SUCCESS OK
10808 */
10809 int ha_rocksdb::index_end() {
10810 DBUG_ENTER_FUNC();
10811
10812 release_scan_iterator();
10813
10814 bitmap_free(&m_lookup_bitmap);
10815
10816 active_index = MAX_KEY;
10817 in_range_check_pushed_down = FALSE;
10818 m_start_range= NULL;
10819
10820 DBUG_RETURN(HA_EXIT_SUCCESS);
10821 }
10822
10823 /**
10824 @return
10825 HA_EXIT_SUCCESS OK
10826 other HA_ERR error code (can be SE-specific)
10827 */
10828 int ha_rocksdb::truncate() {
10829 DBUG_ENTER_FUNC();
10830
10831 DBUG_ASSERT(m_tbl_def != nullptr);
10832
10833 // Save table name to use later
10834 std::string table_name = m_tbl_def->full_tablename();
10835
10836 // Delete current table
10837 int err = delete_table(m_tbl_def);
10838 if (err != HA_EXIT_SUCCESS) {
10839 DBUG_RETURN(err);
10840 }
10841
10842 // Reset auto_increment_value to 1 if auto-increment feature is enabled
10843 // By default, the starting valid value for auto_increment_value is 1
10844 DBUG_RETURN(create_table(
10845 table_name, table,
10846 table->found_next_number_field ? 1 : 0 /* auto_increment_value */));
10847 }
10848
10849 /*
10850 Delete the row we've last read. The row is also passed as parameter.
10851
10852 @detail
10853 The caller guarantees table buf points to the row that was just read.
10854 The row is either table->record[0] or table->record[1].
10855 (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)"
10856
10857 @return
10858 HA_EXIT_SUCCESS OK
10859 other HA_ERR error code (can be SE-specific)
10860 */
10861 int ha_rocksdb::delete_row(const uchar *const buf) {
10862 DBUG_ENTER_FUNC();
10863
10864 DBUG_ASSERT(buf != nullptr);
10865
10866 set_last_rowkey(buf);
10867
10868 rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
10869 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10870 ulonglong bytes_written = 0;
10871
10872 const uint index = pk_index(table, m_tbl_def);
10873 rocksdb::Status s =
10874 delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
10875 if (!s.ok()) {
10876 DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
10877 m_table_handler));
10878 } else {
10879 bytes_written = key_slice.size();
10880 }
10881
10882 longlong hidden_pk_id = 0;
10883 if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) {
10884 int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
10885 if (err) {
10886 DBUG_RETURN(err);
10887 }
10888 }
10889
10890 // Delete the record for every secondary index
10891 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10892 if (!is_pk(i, table, m_tbl_def)) {
10893 int packed_size;
10894 const Rdb_key_def &kd = *m_key_descr_arr[i];
10895 packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
10896 nullptr, false, hidden_pk_id);
10897 rocksdb::Slice secondary_key_slice(
10898 reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
10899 /* Deleting on secondary key doesn't need any locks: */
10900 tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
10901 secondary_key_slice);
10902 bytes_written += secondary_key_slice.size();
10903 }
10904 }
10905
10906 tx->incr_delete_count();
10907 tx->log_table_write_op(m_tbl_def);
10908
10909 if (do_bulk_commit(tx)) {
10910 DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
10911 }
10912 #ifdef MARIAROCKS_NOT_YET
10913 stats.rows_deleted++;
10914 #endif
10915 update_row_stats(ROWS_DELETED);
10916 tx->update_bytes_written(bytes_written);
10917
10918 DBUG_RETURN(HA_EXIT_SUCCESS);
10919 }
10920
10921 rocksdb::Status ha_rocksdb::delete_or_singledelete(
10922 uint index, Rdb_transaction *const tx,
10923 rocksdb::ColumnFamilyHandle *const column_family,
10924 const rocksdb::Slice &key) {
10925 const bool assume_tracked = can_assume_tracked(ha_thd());
10926 if (can_use_single_delete(index)) {
10927 return tx->single_delete(column_family, key, assume_tracked);
10928 }
10929 return tx->delete_key(column_family, key, assume_tracked);
10930 }
10931
10932 void ha_rocksdb::update_stats(void) {
10933 DBUG_ENTER_FUNC();
10934
10935 stats.records = 0;
10936 stats.index_file_length = 0ul;
10937 stats.data_file_length = 0ul;
10938 stats.mean_rec_length = 0;
10939
10940 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10941 if (is_pk(i, table, m_tbl_def)) {
10942 stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
10943 stats.records = m_pk_descr->m_stats.m_rows;
10944 } else {
10945 stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
10946 }
10947 }
10948
10949 DBUG_VOID_RETURN;
10950 }
10951
10952 /**
10953 @return
10954 HA_EXIT_SUCCESS OK
10955 HA_EXIT_FAILURE Error
10956 */
int ha_rocksdb::info(uint flag) {
  DBUG_ENTER_FUNC();

  if (!table) {
    DBUG_RETURN(HA_EXIT_FAILURE);
  }

  if (flag & HA_STATUS_VARIABLE) {
    /*
      Test only to simulate corrupted stats
    */
    DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
                    m_pk_descr->m_stats.m_actual_disk_size =
                        -m_pk_descr->m_stats.m_actual_disk_size;);

    update_stats();

    /*
      If any stats are negative due to bad cached stats, re-run analyze table
      and re-retrieve the stats.
    */
    if (static_cast<longlong>(stats.data_file_length) < 0 ||
        static_cast<longlong>(stats.index_file_length) < 0 ||
        static_cast<longlong>(stats.records) < 0) {
      if (calculate_stats_for_table()) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      update_stats();
    }

    // if number of records is hardcoded, we do not want to force computation
    // of memtable cardinalities
    if (stats.records == 0 || (rocksdb_force_compute_memtable_stats &&
                               rocksdb_debug_optimizer_n_rows == 0)) {
      // First, compute SST files stats
      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = get_range(pk_index(table, m_tbl_def), buf);
      uint64_t sz = 0;
      uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
      // recompute SST files stats only if records count is 0
      if (stats.records == 0) {
        // Estimate row count from the approximate on-disk size of the PK
        // range, using an assumed per-row disk footprint.
        rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz,
                                 include_flags);
        stats.records += sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
        stats.data_file_length += sz;
      }
      // Second, compute memtable stats. This call is expensive, so cache
      // values computed for some time.
      uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime;
      // NOTE(review): my_interval_timer() appears to be a ns-resolution
      // timer, so 'time' and 'cachetime' would share a us scale -- confirm.
      uint64_t time = (cachetime == 0) ? 0 : my_interval_timer() / 1000;
      if (cachetime == 0 ||
          time > m_table_handler->m_mtcache_last_update + cachetime) {
        uint64_t memtableCount;
        uint64_t memtableSize;

        // the stats below are calculated from skiplist wich is a probablistic
        // data structure, so the results vary between test runs
        // it also can return 0 for quite a large tables which means that
        // cardinality for memtable only indxes will be reported as 0
        rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r,
                                         &memtableCount, &memtableSize);

        // Atomically update all of these fields at the same time
        // (only the first concurrent updater wins; others skip the write).
        if (cachetime > 0) {
          if (m_table_handler->m_mtcache_lock.fetch_add(
                  1, std::memory_order_acquire) == 0) {
            m_table_handler->m_mtcache_count = memtableCount;
            m_table_handler->m_mtcache_size = memtableSize;
            m_table_handler->m_mtcache_last_update = time;
          }
          m_table_handler->m_mtcache_lock.fetch_sub(1,
                                                    std::memory_order_release);
        }

        stats.records += memtableCount;
        stats.data_file_length += memtableSize;
      } else {
        // Cached data is still valid, so use it instead
        stats.records += m_table_handler->m_mtcache_count;
        stats.data_file_length += m_table_handler->m_mtcache_size;
      }

      // Do like InnoDB does. stats.records=0 confuses the optimizer
      if (stats.records == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) {
        stats.records++;
      }
    }

    if (rocksdb_debug_optimizer_n_rows > 0)
      stats.records = rocksdb_debug_optimizer_n_rows;

    if (stats.records != 0) {
      stats.mean_rec_length = stats.data_file_length / stats.records;
    }
  }

  if (flag & HA_STATUS_CONST) {
    ref_length = m_pk_descr->max_storage_fmt_length();

    // Fill in cardinality estimates (rec_per_key) for every key part of
    // every visible index.
    for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
      if (is_hidden_pk(i, table, m_tbl_def)) {
        continue;
      }
      KEY *const k = &table->key_info[i];
      for (uint j = 0; j < k->ext_key_parts; j++) {
        const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
        uint x;

        if (k_stats.m_distinct_keys_per_prefix.size() > j &&
            k_stats.m_distinct_keys_per_prefix[j] > 0) {
          x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j];
          /*
            If the number of rows is less than the number of prefixes (due to
            sampling), the average number of rows with the same prefix is 1.
          */
          if (x == 0) {
            x = 1;
          }
        } else {
          x = 0;
        }
        // Rows-per-prefix cannot exceed the total row count.
        if (x > stats.records) x = stats.records;
        if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
            rocksdb_debug_optimizer_n_rows > 0) {
          // Fake cardinality implementation. For example, (idx1, idx2, idx3)
          // index
          /*
            Make MariaRocks behave the same way as MyRocks does:
            1. SQL layer thinks that unique secondary indexes are not extended
               with PK columns (both in MySQL and MariaDB)
            2. MariaDB also thinks that indexes with partially-covered columns
               are not extended with PK columns. Use the same number of
               keyparts that MyRocks would use.
          */
          uint ext_key_parts2;
          if (k->flags & HA_NOSAME)
            ext_key_parts2= k->ext_key_parts;  // This is #1
          else
            ext_key_parts2= m_key_descr_arr[i]->get_key_parts();  // This is #2.

          // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1.
          // rec_per_key for the whole index is 1, and multiplied by 2^n if
          // n suffix columns of the index are not used.
          x = 1 << (ext_key_parts2 - j - 1);
        }
        k->rec_per_key[j] = x;
      }
    }

    stats.create_time = m_tbl_def->get_create_time();
  }

  if (flag & HA_STATUS_TIME) {
    stats.update_time = m_tbl_def->m_update_time;
  }

  if (flag & HA_STATUS_ERRKEY) {
    /*
      Currently we support only primary keys so we know which key had a
      uniqueness violation.
    */
    errkey = m_dupp_errkey;
    dup_ref = m_pk_tuple;  // TODO(?): this should store packed PK.
  }

  if (flag & HA_STATUS_AUTO) {
    stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11129
11130 void ha_rocksdb::position(const uchar *const record) {
11131 DBUG_ENTER_FUNC();
11132
11133 longlong hidden_pk_id = 0;
11134 if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) {
11135 DBUG_ASSERT(false); // should never reach here
11136 }
11137
11138 /*
11139 Get packed primary key value from the record.
11140
11141 (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
11142 handle most cases, but there is an exception to this: when slave applies
11143 RBR events, it fills the record and then calls position(); rnd_pos())
11144
11145 Also note that we *can* rely on PK column values being available. This is
11146 because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
11147 When that is true, table->prepare_for_position() adds PK columns into the
11148 read set (this may potentially disable index-only access when PK column
11149 cannot be restored from its mem-comparable form in the secondary indexes).
11150 */
11151 const uint packed_size = m_pk_descr->pack_record(
11152 table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);
11153
11154 /*
11155 It could be that mem-comparable form of PK occupies less than ref_length
11156 bytes. Fill the remainder with zeros.
11157 */
11158 if (ref_length > packed_size) {
11159 memset(ref + packed_size, 0, ref_length - packed_size);
11160 }
11161
11162 DBUG_VOID_RETURN;
11163 }
11164
11165 /**
11166 @return
11167 HA_EXIT_SUCCESS OK
11168 other HA_ERR error code (can be SE-specific)
11169 */
11170 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
11171 DBUG_ENTER_FUNC();
11172
11173 int rc;
11174 size_t len;
11175
11176 #ifdef MARIAROCKS_NOT_YET
11177 stats.rows_requested++;
11178 #endif
11179 len = m_pk_descr->key_length(table,
11180 rocksdb::Slice((const char *)pos, ref_length));
11181 if (len == size_t(-1)) {
11182 DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */
11183 }
11184
11185 rc = get_row_by_rowid(buf, pos, len);
11186
11187 if (!rc) {
11188 #ifdef MARIAROCKS_NOT_YET
11189 stats.rows_read++;
11190 #endif
11191 update_row_stats(ROWS_READ);
11192 }
11193
11194 DBUG_RETURN(rc);
11195 }
11196
11197 /*
11198 @brief
11199 Calculate (if needed) the bitmap of indexes that are modified by the
11200 current query
11201
11202 @detail
11203 The calculation done by checking index definitions against the
11204 table->write_set
11205 column bitmap.
11206 */
11207
11208 void ha_rocksdb::calc_updated_indexes() {
11209 if (!m_update_scope_is_valid) {
11210 m_update_scope_is_valid = true;
11211 m_update_scope.clear_all();
11212
11213 for (uint keynr = 0; keynr < table->s->keys; keynr++) {
11214 const Rdb_key_def &kd = *m_key_descr_arr[keynr];
11215 /* Walk over all key parts, including the "extended key" suffix */
11216 const uint key_parts = kd.get_key_parts();
11217 for (uint kp = 0; kp < key_parts; kp++) {
11218 if (has_hidden_pk(table) && kp + 1 == key_parts) break;
11219
11220 Field *const field = kd.get_table_field_for_part_no(table, kp);
11221 if (bitmap_is_set(table->write_set, field->field_index)) {
11222 m_update_scope.set_bit(keynr);
11223 break;
11224 }
11225 }
11226 }
11227 }
11228 }
11229
11230 /**
11231 Update an existing row
11232 @param[in] old_data nullptr for write, non-null for update
11233 @param[in] new_data non-null for write/update
11234 @return
11235 HA_EXIT_SUCCESS OK
11236 other HA_ERR error code (can be SE-specific)
11237 */
11238 int ha_rocksdb::update_row(const uchar *const old_data,
11239 const uchar *const new_data) {
11240 DBUG_ENTER_FUNC();
11241
11242 DBUG_ASSERT(old_data != nullptr);
11243 DBUG_ASSERT(new_data != nullptr);
11244 DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
11245 /*
11246 old_data points to record we're updating. It is the same as the record
11247 we've just read (for multi-table UPDATE, too, because SQL layer will make
11248 an rnd_pos() call to re-read the record before calling update_row())
11249 */
11250 DBUG_ASSERT(new_data == table->record[0]);
11251
11252 const int rv = update_write_row(old_data, new_data, skip_unique_check());
11253
11254 if (rv == 0) {
11255 #ifdef MARIAROCKS_NOT_YET
11256 stats.rows_updated++;
11257 #endif
11258 update_row_stats(ROWS_UPDATED);
11259 }
11260
11261 DBUG_RETURN(rv);
11262 }
11263
11264 /*
11265 MariaDB's temporary: MyRocks has this function in sql/handler.cc:
11266 */
11267
11268 bool can_hold_read_locks_on_select(THD *thd, thr_lock_type lock_type)
11269 {
11270 return (lock_type == TL_READ_WITH_SHARED_LOCKS
11271 || lock_type == TL_READ_NO_INSERT
11272 || (lock_type != TL_IGNORE
11273 && thd->lex->sql_command != SQLCOM_SELECT));
11274 }
11275
11276
11277 /* The following function was copied from ha_blackhole::store_lock: */
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    // Write statements: rows will be read with exclusive (write) locks.
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    m_lock_rows = RDB_LOCK_READ;
  } else if (lock_type != TL_IGNORE) {
    // Plain reads take no row locks unless lock_scanned_rows asks for them.
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          can_hold_read_locks_on_select(thd, lock_type)) {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
11354
11355 void ha_rocksdb::read_thd_vars(THD *const thd) {
11356 m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
11357 m_converter->set_verify_row_debug_checksums(
11358 THDVAR(thd, verify_row_debug_checksums));
11359 m_checksums_pct = THDVAR(thd, checksums_pct);
11360 }
11361
11362 ulonglong ha_rocksdb::table_flags() const
11363 {
11364 DBUG_ENTER_FUNC();
11365
11366 /*
11367 HA_BINLOG_STMT_CAPABLE
11368 Upstream: MyRocks advertises itself as it supports SBR, but has additional
11369 checks in ha_rocksdb::external_lock()/ start_stmt() which will return an
11370 error if one tries to run the statement.
11371 Exceptions: @@rocksdb_unsafe_for_binlog or we are an SQL slave thread.
11372
11373 MariaDB: Inform the upper layer we don't support SBR, so it switches to RBR
11374 if possible. The exceptions are the same as with the upstream.
11375
11376 HA_REC_NOT_IN_SEQ
11377 If we don't set it, filesort crashes, because it assumes rowids are
11378 1..8 byte numbers
11379 HA_PRIMARY_KEY_IN_READ_INDEX
11380 This flag is always set, even for tables that:
11381 - have no PK
11382 - have some (or all) of PK that can't be decoded from the secondary
11383 index.
11384 */
11385 THD *thd= ha_thd();
11386 DBUG_RETURN(HA_BINLOG_ROW_CAPABLE |
11387 ((thd && (THDVAR(thd, unsafe_for_binlog) ||thd->rgi_slave))?
11388 HA_BINLOG_STMT_CAPABLE : 0) |
11389 HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
11390 HA_PRIMARY_KEY_IN_READ_INDEX |
11391 HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
11392 HA_PARTIAL_COLUMN_READ |
11393 HA_TABLE_SCAN_ON_INDEX);
11394 }
11395
11396
11397 /**
11398 @return
11399 HA_EXIT_SUCCESS OK
11400 other HA_ERR error code (cannot be SE-specific)
11401 */
int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);

  int res = HA_EXIT_SUCCESS;
#if 0
  // MariaDB uses a different way to implement this, see ha_rocksdb::table_flags

  int binlog_format = my_core::thd_binlog_format(thd);
  bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);

  /*
    A note about (*) below: In fb/mysql, LOCK TABLE myrocks_table WRITE
    - produces an error when @@binlog_format='ROW'.
    - does not produce an error with @@binlog_format='STATEMENT'

    We cannot easily provide this behavior in MariaDB, because LOCK ROWS
    changes @@binlog_format for the duration of the statement.

    Instead we will make the check in ha_rocksdb::start_stmt and will produce
    an error in the first DML statement done under LOCK TABLE

    (**) - The same happens for other "DDL-type" commands, so allow them here
    as well.
  */
  if (lock_type == F_WRLCK && !thd->rgi_slave && !unsafe_for_binlog &&
      binlog_format != BINLOG_FORMAT_ROW &&
      binlog_format != BINLOG_FORMAT_UNSPEC &&
      thd->lex->sql_command != SQLCOM_LOCK_TABLES &&  // (*)
      thd->lex->sql_command != SQLCOM_ANALYZE &&      // (**)
      thd->lex->sql_command != SQLCOM_OPTIMIZE &&     // (**)
      thd->lex->sql_command != SQLCOM_FLUSH &&        // (**)
      my_core::thd_binlog_filter_ok(thd)) {
    my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
    DBUG_RETURN(HA_ERR_UNSUPPORTED);
  }
#endif

  if (lock_type == F_UNLCK) {
    // Statement (or LOCK TABLES scope) is ending for this table.
    Rdb_transaction *const tx = get_tx_from_thd(thd);

    if (tx) {
      tx->io_perf_end_and_record(&m_io_perf);
      tx->m_n_mysql_tables_in_use--;
      if (tx->m_n_mysql_tables_in_use == 0 &&
          !my_core::thd_test_options(thd,
                                     OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
        /*
          Do like InnoDB: when we get here, it's time to commit a
          single-statement transaction.

          If the statement involved multiple tables, this code will be executed
          for each of them, but that's ok because non-first tx->commit() calls
          will be no-ops.
        */
        if (tx->commit_or_rollback()) {
          res = HA_ERR_INTERNAL_ERROR;
        }
      }
    }
  } else {
    // Only READ COMMITTED and REPEATABLE READ isolation levels are
    // supported; reject anything outside that range.
    if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
        my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
      my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0),
               tx_isolation_names[my_core::thd_tx_isolation(thd)]);
      DBUG_RETURN(HA_ERR_UNSUPPORTED);
    }
    /*
      It's nice to do the following on start of every statement. The problem
      is, handler->start_stmt() is not called for INSERTs.
      So, we put this code here.
    */
    Rdb_transaction *const tx = get_or_create_tx(thd);
    read_thd_vars(thd);

    // Force re-computation of the updated-indexes bitmap for this statement.
    m_update_scope_is_valid = false;

    if (skip_unique_check()) {
      // Unique checks are disabled: ON DUPLICATE KEY UPDATE / REPLACE
      // semantics cannot be honored, so reject such statements.
      if ((thd->lex->sql_command == SQLCOM_INSERT ||
           thd->lex->sql_command == SQLCOM_LOAD ||
           thd->lex->sql_command == SQLCOM_REPLACE) &&
          (thd->lex->duplicates == DUP_REPLACE ||
           thd->lex->duplicates == DUP_UPDATE)) {
        my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query());
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
    }

    if (lock_type == F_WRLCK) {
      if (tx->is_tx_read_only()) {
        my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }

#ifdef MARIADB_NOT_YET
      if (thd->get_explicit_snapshot()) {
        my_error(ER_UPDATES_WITH_EXPLICIT_SNAPSHOT, MYF(0));
        DBUG_RETURN(HA_ERR_UNSUPPORTED);
      }
#endif

      /*
        SQL layer signals us to take a write lock. It does so when starting DML
        statement. We should put locks on the rows we're reading.

        Note: sometimes, external_lock() can be called without a prior
        ::store_lock call. That's why we need to set lock_* members here, too.
      */
      m_lock_rows = RDB_LOCK_WRITE;

      if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
          thd->lex->sql_command == SQLCOM_DROP_INDEX ||
          thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
        tx->m_ddl_transaction = true;
      }
    }
    tx->m_n_mysql_tables_in_use++;
    rocksdb_register_tx(rocksdb_hton, thd, tx);
    tx->io_perf_start(&m_io_perf);
  }

  DBUG_RETURN(res);
}
11526
11527 /**
11528 @note
11529 A quote from ha_innobase::start_stmt():
11530 <quote>
11531 MySQL calls this function at the start of each SQL statement inside LOCK
11532 TABLES. Inside LOCK TABLES the ::external_lock method does not work to
11533 mark SQL statement borders.
11534 </quote>
11535
11536 @return
11537 HA_EXIT_SUCCESS OK
11538 */
11539
11540 int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
11541 DBUG_ENTER_FUNC();
11542
11543 DBUG_ASSERT(thd != nullptr);
11544
11545 Rdb_transaction *const tx = get_or_create_tx(thd);
11546 read_thd_vars(thd);
11547 rocksdb_register_tx(ht, thd, tx);
11548 tx->io_perf_start(&m_io_perf);
11549
11550 DBUG_RETURN(HA_EXIT_SUCCESS);
11551 }
11552
11553 rocksdb::Range get_range(uint32_t i,
11554 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11555 int offset1, int offset2) {
11556 uchar *buf_begin = buf;
11557 uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
11558 rdb_netbuf_store_index(buf_begin, i + offset1);
11559 rdb_netbuf_store_index(buf_end, i + offset2);
11560
11561 return rocksdb::Range(
11562 rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
11563 rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
11564 }
11565
11566 static rocksdb::Range get_range(const Rdb_key_def &kd,
11567 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
11568 int offset1, int offset2) {
11569 return get_range(kd.get_index_number(), buf, offset1, offset2);
11570 }
11571
11572 rocksdb::Range get_range(const Rdb_key_def &kd,
11573 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
11574 if (kd.m_is_reverse_cf) {
11575 return myrocks::get_range(kd, buf, 1, 0);
11576 } else {
11577 return myrocks::get_range(kd, buf, 0, 1);
11578 }
11579 }
11580
11581 rocksdb::Range ha_rocksdb::get_range(
11582 const int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
11583 return myrocks::get_range(*m_key_descr_arr[i], buf);
11584 }
11585
11586 /*
11587 This function is called with total_order_seek=true, but
11588 upper/lower bound setting is not necessary.
11589 Boundary set is useful when there is no matching key,
11590 but in drop_index_thread's case, it means index is marked as removed,
11591 so no further seek will happen for the index id.
11592 */
11593 static bool is_myrocks_index_empty(rocksdb::ColumnFamilyHandle *cfh,
11594 const bool is_reverse_cf,
11595 const rocksdb::ReadOptions &read_opts,
11596 const uint index_id) {
11597 bool index_removed = false;
11598 uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
11599 rdb_netbuf_store_uint32(key_buf, index_id);
11600 const rocksdb::Slice key =
11601 rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf));
11602 std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
11603 rocksdb_smart_seek(is_reverse_cf, it.get(), key);
11604 if (!it->Valid()) {
11605 index_removed = true;
11606 } else {
11607 if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE)) {
11608 // Key does not have same prefix
11609 index_removed = true;
11610 }
11611 }
11612 return index_removed;
11613 }
11614
11615 /*
11616 Drop index thread's main logic
11617 */
11618
void Rdb_drop_index_thread::run() {
  // Invariant: m_signal_mutex is held at the top of every loop iteration and
  // released only around the (potentially long) index-scanning work below.
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);

  for (;;) {
    // The stop flag might be set by shutdown command
    // after drop_index_thread releases signal_mutex
    // (i.e. while executing expensive Seek()). To prevent drop_index_thread
    // from entering long cond_timedwait, checking if stop flag
    // is true or not is needed, with drop_index_interrupt_mutex held.
    if (m_stop) {
      break;
    }

    timespec ts;
    // Sleep for a long time when there is nothing to drop; poll every
    // minute while drops are pending. A signal() wakes us up earlier.
    int sec= dict_manager.is_drop_index_empty()
        ? 24 * 60 * 60  // no filtering
        : 60;  // filtering
    set_timespec(ts,sec);

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Drop the mutex while doing the expensive scanning/compaction work.
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    std::unordered_set<GL_INDEX_ID> indices;
    dict_manager.get_ongoing_drop_indexes(&indices);
    if (!indices.empty()) {
      std::unordered_set<GL_INDEX_ID> finished;
      rocksdb::ReadOptions read_opts;
      read_opts.total_order_seek = true;  // disable bloom filter

      for (const auto d : indices) {
        uint32 cf_flags = 0;
        if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
          // NO_LINT_DEBUG
          sql_print_error(
              "RocksDB: Failed to get column family flags "
              "from cf id %u. MyRocks data dictionary may "
              "get corrupted.",
              d.cf_id);
          if (rocksdb_ignore_datadic_errors)
          {
            sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, "
                            "trying to continue");
            continue;
          }
          abort();
        }
        rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id);
        DBUG_ASSERT(cfh);
        const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;

        // First drop whole SST files covered by the index range (cheap),
        // then compact the range to purge the remaining entries.
        uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
        rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
                                         is_reverse_cf ? 0 : 1);
        rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
                                                    &range.start, &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start,
                                   &range.limit);
        if (!status.ok()) {
          if (status.IsShutdownInProgress()) {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        // Only mark the drop as finished once no key with this index id
        // remains; otherwise it will be retried on the next wakeup.
        if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) {
          finished.insert(d);
        }
      }

      if (!finished.empty()) {
        dict_manager.finish_drop_indexes(finished);
      }
    }
    // Re-acquire the mutex before re-checking m_stop / waiting again.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
}
11708
11709 Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
11710 std::string str;
11711 if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) {
11712 // We were not passed table name?
11713 DBUG_ASSERT(0);
11714 return nullptr;
11715 }
11716
11717 return ddl_manager.find(str);
11718 }
11719
11720 /*
11721 Overload func for delete table ---it deletes table meta data in data
11722 dictionary immediately and delete real data in background thread(async)
11723
11724 @param tbl IN MyRocks table definition
11725
11726 @return
11727 HA_EXIT_SUCCESS OK
11728 other HA_ERR error code (can be SE-specific)
11729 */
11730 int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) {
11731 DBUG_ENTER_FUNC();
11732
11733 DBUG_ASSERT(tbl != nullptr);
11734 DBUG_ASSERT(m_tbl_def == nullptr || m_tbl_def == tbl);
11735
11736 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11737 rocksdb::WriteBatch *const batch = wb.get();
11738
11739 dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);
11740
11741 /*
11742 Remove the table entry in data dictionary (this will also remove it from
11743 the persistent data dictionary).
11744 */
11745 ddl_manager.remove(tbl, batch, true);
11746
11747 int err = dict_manager.commit(batch);
11748 if (err) {
11749 DBUG_RETURN(err);
11750 }
11751
11752 rdb_drop_idx_thread.signal();
11753 // avoid dangling pointer
11754 m_tbl_def = nullptr;
11755 DBUG_RETURN(HA_EXIT_SUCCESS);
11756 }
11757
11758 /*
11759 Note: the following function is called when the table is not open. That is,
11760 this->table==nullptr, pk_key_descr==nullptr, etc.
11761
11762 tablename points to line in form "./dbname/tablename".
11763
11764 @return
11765 HA_EXIT_SUCCESS OK
11766 other HA_ERR error code (can be SE-specific)
11767 */
11768
11769 int ha_rocksdb::delete_table(const char *const tablename) {
11770 DBUG_ENTER_FUNC();
11771
11772 DBUG_ASSERT(tablename != nullptr);
11773
11774 /* Find the table in the hash */
11775 Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
11776 if (!tbl) {
11777 DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
11778 }
11779
11780 DBUG_RETURN(delete_table(tbl));
11781 }
11782
int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
  const rocksdb::WriteOptions wo =
      rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  opts.total_order_seek = true;
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  ulonglong bytes_written = 0;

  // Iterator bound buffers/slices must outlive the iterator created below;
  // they are reused (overwritten) on each loop iteration.
  uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE];
  rocksdb::Slice lower_bound_slice;
  rocksdb::Slice upper_bound_slice;

  /*
    Remove all records in each index.
    (This is is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on rocksdb side)
  */
  for (uint i = 0; i < tbl->m_key_count; i++) {
    const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
    // The infimum key of the index is the scan's starting point.
    kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
    rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
    const rocksdb::Slice table_key(key_buf, key_len);
    setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE,
                          lower_bound_buf, upper_bound_buf, &lower_bound_slice,
                          &upper_bound_slice);
    DBUG_ASSERT(key_len == Rdb_key_def::INDEX_NUMBER_SIZE);
    opts.iterate_lower_bound = &lower_bound_slice;
    opts.iterate_upper_bound = &upper_bound_slice;
    std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));

    it->Seek(table_key);
    while (it->Valid()) {
      const rocksdb::Slice key = it->key();
      // Stop once the key no longer belongs to this index.
      if (!kd.covers_key(key)) {
        break;
      }

      // Deletes go straight to the DB (not through the transaction):
      // SingleDelete where the key is written at most once, Delete otherwise.
      rocksdb::Status s;
      if (can_use_single_delete(i)) {
        s = rdb->SingleDelete(wo, cf, key);
      } else {
        s = rdb->Delete(wo, cf, key);
      }

      if (!s.ok()) {
        return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                    m_table_handler);
      }
      bytes_written += key.size();
      it->Next();
    }
  }

  tx->update_bytes_written(bytes_written);

  return HA_EXIT_SUCCESS;
}
11845
11846 /**
11847 @return
11848 HA_EXIT_SUCCESS OK
11849 other HA_ERR error code (cannot be SE-specific)
11850 */
11851 int ha_rocksdb::rename_table(const char *const from, const char *const to) {
11852 DBUG_ENTER_FUNC();
11853
11854 std::string from_str;
11855 std::string to_str;
11856 std::string from_db;
11857 std::string to_db;
11858
11859 int rc = rdb_normalize_tablename(from, &from_str);
11860 if (rc != HA_EXIT_SUCCESS) {
11861 DBUG_RETURN(rc);
11862 }
11863
11864 rc = rdb_split_normalized_tablename(from_str, &from_db);
11865 if (rc != HA_EXIT_SUCCESS) {
11866 DBUG_RETURN(rc);
11867 }
11868
11869 rc = rdb_normalize_tablename(to, &to_str);
11870 if (rc != HA_EXIT_SUCCESS) {
11871 DBUG_RETURN(rc);
11872 }
11873
11874 rc = rdb_split_normalized_tablename(to_str, &to_db);
11875 if (rc != HA_EXIT_SUCCESS) {
11876 DBUG_RETURN(rc);
11877 }
11878
11879 // If the user changed the database part of the name then validate that the
11880 // 'to' database exists.
11881 if (from_db != to_db && !rdb_database_exists(to_db)) {
11882 // If we return a RocksDB specific error code here we get
11883 // "error: 206 - Unknown error 206". InnoDB gets
11884 // "error -1 - Unknown error -1" so let's match them.
11885 DBUG_RETURN(-1);
11886 }
11887
11888 DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";);
11889
11890 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11891 rocksdb::WriteBatch *const batch = wb.get();
11892 dict_manager.lock();
11893
11894 if (ddl_manager.rename(from_str, to_str, batch)) {
11895 rc = HA_ERR_NO_SUCH_TABLE;
11896 } else {
11897 rc = dict_manager.commit(batch);
11898 }
11899 dict_manager.unlock();
11900
11901 DBUG_RETURN(rc);
11902 }
11903
11904 /**
11905 check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
11906 if new and old definition are compatible
11907
11908 @details If there are no other explicit signs like changed number of
11909 fields this function will be called by compare_tables()
11910 (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm
11911 file.
11912
11913 */
11914
11915 bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
11916 uint table_changes) {
11917 DBUG_ENTER_FUNC();
11918
11919 DBUG_ASSERT(info != nullptr);
11920
11921 // this function is needed only for online alter-table
11922 DBUG_RETURN(COMPATIBLE_DATA_NO);
11923 }
11924
11925 /**
11926 @return
11927 HA_EXIT_SUCCESS OK
11928 */
11929 int ha_rocksdb::extra(enum ha_extra_function operation) {
11930 DBUG_ENTER_FUNC();
11931
11932 switch (operation) {
11933 case HA_EXTRA_KEYREAD:
11934 m_keyread_only = true;
11935 break;
11936 case HA_EXTRA_NO_KEYREAD:
11937 m_keyread_only = false;
11938 break;
11939 case HA_EXTRA_FLUSH:
11940 /*
11941 If the table has blobs, then they are part of m_retrieved_record.
11942 This call invalidates them.
11943 */
11944 m_retrieved_record.Reset();
11945 break;
11946 case HA_EXTRA_INSERT_WITH_UPDATE:
11947 // INSERT ON DUPLICATE KEY UPDATE
11948 if (rocksdb_enable_insert_with_update_caching) {
11949 m_insert_with_update = true;
11950 }
11951 break;
11952 case HA_EXTRA_NO_IGNORE_DUP_KEY:
11953 // PAIRED with HA_EXTRA_INSERT_WITH_UPDATE or HA_EXTRA_WRITE_CAN_REPLACE
11954 // that indicates the end of REPLACE / INSERT ON DUPLICATE KEY
11955 m_insert_with_update = false;
11956 break;
11957
11958 default:
11959 break;
11960 }
11961
11962 DBUG_RETURN(HA_EXIT_SUCCESS);
11963 }
11964
11965 /*
11966 Given a starting key and an ending key, estimate the number of rows that
11967 will exist between the two keys.
11968 */
ha_rows ha_rocksdb::records_in_range(uint inx, const key_range *const min_key,
                                     const key_range *const max_key,
                                     page_range *pages) {
  DBUG_ENTER_FUNC();

  // Session variable can force a fixed estimate (debug / tuning aid).
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  // A separate override applies only under FORCE INDEX.
  if (table->force_index) {
    const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  // Pack the lower endpoint into m_sk_packed_tuple (or use the index
  // infimum when no lower bound was given).
  uint size1 = 0;
  if (min_key) {
    size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                m_record_buffer,
                                min_key->key, min_key->keypart_map);
    // For "after key" style flags, move past all rows with this prefix.
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd.get_infimum_key(m_sk_packed_tuple, &size1);
  }

  // Pack the upper endpoint into m_sk_packed_tuple_old (or use the index
  // supremum when no upper bound was given).
  uint size2 = 0;
  if (max_key) {
    size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                m_record_buffer,
                                max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple_old, size2);
    }
    // pad the upper key with FFFFs to make sure it is more than the lower
    if (size1 > size2) {
      memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
      size2 = size1;
    }
  } else {
    kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
  const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);

  // slice1 >= slice2 means no row will match; report 0 rows.
  if (slice1.compare(slice2) >= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  // In a reverse column family the stored key order is inverted, so swap
  // the endpoints to keep the range [start, limit) valid for RocksDB.
  rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
                   kd.m_is_reverse_cf ? slice1 : slice2);

  uint64_t sz = 0;
  // Use cached per-index stats to convert a byte-size estimate into a row
  // count; fall back to an assumed per-row size when stats are empty.
  auto disk_size = kd.m_stats.m_actual_disk_size;
  if (disk_size == 0) disk_size = kd.m_stats.m_data_size;
  auto rows = kd.m_stats.m_rows;
  if (rows == 0 || disk_size == 0) {
    rows = 1;
    disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }

  // Getting statistics, including from Memtables
  uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
  rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
  ret = rows * sz / disk_size;
  uint64_t memTableCount;
  rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
  ret += memTableCount;

  /*
    GetApproximateSizes() gives estimates so ret might exceed stats.records.
    MySQL then decides to use full index scan rather than range scan, which
    is not efficient for most cases.
    To prevent this, changing estimated records slightly smaller than
    stats.records.
  */
  if (ret >= stats.records) {
    ret = stats.records * 0.99;
  }

  if (rocksdb_debug_optimizer_n_rows > 0) {
    ret = rocksdb_debug_optimizer_n_rows;
  } else if (ret == 0) {
    // Never report 0 here once we know the range is non-empty-able; the
    // optimizer treats 0 as "no rows".
    ret = 1;
  }

  DBUG_RETURN(ret);
}
12067
12068 void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
12069 DBUG_ENTER_FUNC();
12070
12071 DBUG_ASSERT(create_info != nullptr);
12072
12073 if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
12074 create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
12075 }
12076
12077 DBUG_VOID_RETURN;
12078 }
12079
12080 /**
12081 @brief
12082 Doing manual compaction on OPTIMIZE TABLE in RocksDB.
12083 Compaction itself is executed by background thread in RocksDB, but
12084 CompactRange() waits until compaction completes so this function
12085 may take a long time.
12086 Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE
12087 triggers manual compaction for all indexes of the table.
12088 @details
12089 Compaction range is from the beginning of the index id to
12090 the first row of the next index id. When using reverse order
12091 column family, the first row of the next index id should be
12092 the last row of the previous index id.
12093
12094 @return
12095 HA_ADMIN_OK OK
12096 other HA_ADMIN error code
12097 */
12098 int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
12099 DBUG_ENTER_FUNC();
12100
12101 DBUG_ASSERT(thd != nullptr);
12102 DBUG_ASSERT(check_opt != nullptr);
12103
12104 for (uint i = 0; i < table->s->keys; i++) {
12105 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
12106 auto range = get_range(i, buf);
12107 const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(),
12108 m_key_descr_arr[i]->get_cf(),
12109 &range.start, &range.limit);
12110 if (!s.ok()) {
12111 DBUG_RETURN(rdb_error_to_mysql(s));
12112 }
12113 }
12114
12115 DBUG_RETURN(HA_EXIT_SUCCESS);
12116 }
12117
/*
  Recompute index statistics (row counts, sizes, per-prefix cardinality)
  for the given set of indexes by reading RocksDB SST table properties,
  optionally augmented by sampling the memtables. Results are stored and
  persisted through ddl_manager.

  @param to_recalc          map of index id -> key definition to recompute
  @param include_memtables  also sample memtable data for cardinality
  @return HA_EXIT_SUCCESS or an error code from property access
*/
static int calculate_stats(
    const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
        &to_recalc,
    bool include_memtables) {
  DBUG_ENTER_FUNC();

  // find per column family key ranges which need to be queried
  std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
      ranges;
  std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
  // One contiguous buffer holds every range's start/limit keys; each index
  // consumes 2 * INDEX_NUMBER_SIZE bytes of it. The Slices inside 'ranges'
  // point into this buffer, so it must outlive them.
  std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);

  uchar *bufp = buf.data();
  for (const auto &it : to_recalc) {
    const GL_INDEX_ID index_id = it.first;
    auto &kd = it.second;
    ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp));
    bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE;

    // Start from fresh (zeroed) stats for every index being recalculated.
    stats[index_id] = Rdb_index_stats(index_id);
    DBUG_ASSERT(kd->get_key_parts() > 0);
    stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts());
  }

  // get RocksDB table properties for these ranges
  rocksdb::TablePropertiesCollection props;
  for (const auto &it : ranges) {
    const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
    const auto status = rdb->GetPropertiesOfTablesInRange(
        it.first, &it.second[0], it.second.size(), &props);
    DBUG_ASSERT(props.size() >= old_size);
    if (!status.ok()) {
      DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql(
          status, "Could not access RocksDB properties"));
    }
  }

  int num_sst = 0;
  for (const auto &it : props) {
    std::vector<Rdb_index_stats> sst_stats;
    Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
    /*
      sst_stats is a list of index statistics for indexes that have entries
      in the current SST file.
    */
    for (const auto &it1 : sst_stats) {
      /*
        Only update statistics for indexes that belong to this SQL table.

        The reason is: We are walking through all SST files that have
        entries from this table (and so can compute good statistics). For
        other SQL tables, it can be that we're only seeing a small fraction
        of table's entries (and so we can't update statistics based on that).
      */
      if (stats.find(it1.m_gl_index_id) == stats.end()) {
        continue;
      }

      auto it_index = to_recalc.find(it1.m_gl_index_id);
      DBUG_ASSERT(it_index != to_recalc.end());
      if (it_index == to_recalc.end()) {
        continue;
      }
      stats[it1.m_gl_index_id].merge(
          it1, true, it_index->second->max_storage_fmt_length());
    }
    num_sst++;
  }

  if (include_memtables) {
    // calculate memtable cardinality
    Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
    auto read_opts = rocksdb::ReadOptions();
    // Restrict reads to memtables only; SST data was already accounted for
    // via the table properties above.
    read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
    for (const auto &it_kd : to_recalc) {
      const std::shared_ptr<const Rdb_key_def> &kd = it_kd.second;
      Rdb_index_stats &stat = stats[kd->get_gl_index_id()];

      uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = myrocks::get_range(*kd, r_buf);
      uint64_t memtableCount;
      uint64_t memtableSize;
      rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount,
                                       &memtableSize);
      if (memtableCount < (uint64_t)stat.m_rows / 10) {
        // skip tables that already have enough stats from SST files to reduce
        // overhead and avoid degradation of big tables stats by sampling from
        // relatively tiny (less than 10% of full data set) memtable dataset
        continue;
      }

      std::unique_ptr<rocksdb::Iterator> it =
          std::unique_ptr<rocksdb::Iterator>(
              rdb->NewIterator(read_opts, kd->get_cf()));

      rocksdb::Slice first_index_key((const char *)r_buf,
                                     Rdb_key_def::INDEX_NUMBER_SIZE);

      cardinality_collector.Reset();
      // Walk all memtable entries belonging to this index and feed them to
      // the cardinality sampler.
      for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
        const rocksdb::Slice key = it->key();
        if (!kd->covers_key(key)) {
          break;  // end of this index
        }
        stat.m_rows++;

        cardinality_collector.ProcessKey(key, kd.get(), &stat);
      }
      cardinality_collector.AdjustStats(&stat);
    }
  }

  // set and persist new stats
  ddl_manager.set_stats(stats);
  ddl_manager.persist_stats(true);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12236
12237 int ha_rocksdb::calculate_stats_for_table() {
12238 DBUG_ENTER_FUNC();
12239
12240 std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
12241 ids_to_check;
12242 for (uint i = 0; i < table->s->keys; i++) {
12243 ids_to_check.insert(std::make_pair(m_key_descr_arr[i]->get_gl_index_id(),
12244 m_key_descr_arr[i]));
12245 }
12246
12247 DBUG_RETURN(calculate_stats(ids_to_check, true));
12248 }
12249
12250 /*
12251 @return
12252 HA_ADMIN_OK OK
12253 other HA_ADMIN error code
12254 */
12255 int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
12256 DBUG_ENTER_FUNC();
12257
12258 if (table) {
12259 if (calculate_stats_for_table() != HA_EXIT_SUCCESS) {
12260 DBUG_RETURN(HA_ADMIN_FAILED);
12261 }
12262 }
12263
12264 // A call to ::info is needed to repopulate some SQL level structs. This is
12265 // necessary for online analyze because we cannot rely on another ::open
12266 // call to call info for us.
12267 if (info(HA_STATUS_CONST | HA_STATUS_VARIABLE) != HA_EXIT_SUCCESS) {
12268 DBUG_RETURN(HA_ADMIN_FAILED);
12269 }
12270
12271 DBUG_RETURN(HA_ADMIN_OK);
12272 }
12273
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars2");

  // An offset larger than the increment makes no sense; fall back to 1.
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  // Largest value representable by the auto-increment column's type.
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    DBUG_ASSERT(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use CAS operation in a loop to make sure automically get the next auto
    // increment value while ensuring that we don't wrap around to a negative
    // number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      // compare_exchange_weak updates new_val with the current value on
      // failure, so the loop naturally retries from the latest counter.
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    if (last_val > max_val) {
      // Counter already beyond the column's range; report the sentinel max.
      new_val = std::numeric_limits<ulonglong>::max();
    } else {
      // Loop until we can correctly update the atomic value
      do {
        DBUG_ASSERT(last_val > 0);
        // Calculate the next value in the auto increment series: offset
        // + N * increment where N is 0, 1, 2, ...
        //
        // For further information please visit:
        // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
        //
        // The following is confusing so here is an explanation:
        // To get the next number in the sequence above you subtract out the
        // offset, calculate the next sequence (N * increment) and then add the
        // offset back in.
        //
        // The additions are rearranged to avoid overflow. The following is
        // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
        // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
        //
        // (a+b)/c
        // = (a - a%c + a%c + b - b%c + b%c) / c
        // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
        // = a/c + b/c + (a%c + b%c) / c
        //
        // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
        // following statement.
        ulonglong n =
            (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

        // Check if n * inc + off will overflow. This can only happen if we have
        // an UNSIGNED BIGINT field.
        if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
          DBUG_ASSERT(max_val == std::numeric_limits<ulonglong>::max());
          // The 'last_val' value is already equal to or larger than the largest
          // value in the sequence. Continuing would wrap around (technically
          // the behavior would be undefined). What should we do?
          // We could:
          // 1) set the new value to the last possible number in our sequence
          //    as described above. The problem with this is that this
          //    number could be smaller than a value in an existing row.
          // 2) set the new value to the largest possible number. This number
          //    may not be in our sequence, but it is guaranteed to be equal
          //    to or larger than any other value already inserted.
          //
          // For now I'm going to take option 2.
          //
          // Returning ULLONG_MAX from get_auto_increment will cause the SQL
          // layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
          // the SE API for get_auto_increment, inserts will fail with
          // ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
          // inserts will fail with ER_DUP_ENTRY for other types (or no failure
          // if the column is in a non-unique SK).
          new_val = std::numeric_limits<ulonglong>::max();
          auto_incr = new_val;  // Store the largest value into auto_incr
          break;
        }

        new_val = n * inc + off;

        // Attempt to store the new value (plus 1 since m_auto_incr_val contains
        // the next available value) into the atomic value. If the current
        // value no longer matches what we have in 'last_val' this will fail and
        // we will repeat the loop (`last_val` will automatically get updated
        // with the current value).
        //
        // See above explanation for inc == 1 for why we use std::min.
      } while (!auto_incr.compare_exchange_weak(
          last_val, std::min(new_val + 1, max_val)));
    }
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
12411
12412 #ifndef DBUG_OFF
12413
/* Debugger help function */
static char dbug_item_print_buf[512];

/*
  Render an Item's SQL text into a static buffer; intended to be invoked
  manually from a debugger (hence the fixed global buffer and the plain
  const char* return). Not thread-safe: all calls share one buffer.
*/
const char *dbug_print_item(Item *const item) {
  char *const buf = dbug_item_print_buf;
  String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
  str.length(0);
  if (!item) return "(Item*)nullptr";
  item->print(&str, QT_ORDINARY);
  // If String still points at our static buffer, the text fit; otherwise
  // String reallocated (output too long) and its memory is about to be
  // freed, so we must not return it.
  if (str.c_ptr() == buf) {
    return buf;
  } else {
    return "Couldn't fit into buffer";
  }
}
12429
12430 #endif /*DBUG_OFF*/
12431
12432 /**
12433 SQL layer calls this function to push an index condition.
12434
12435 @details
12436 The condition is for index keyno (only one condition can be pushed at a
12437 time).
12438 The caller guarantees that condition refers only to index fields; besides
12439 that, fields must have
12440
12441 $field->part_of_key.set_bit(keyno)
12442
12443 which means that
12444
12445 (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
12446
12447 which means that field value can be restored from the index tuple.
12448
12449 @return
12450 Part of condition we couldn't check (always nullptr).
12451 */
12452
12453 class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
12454 DBUG_ENTER_FUNC();
12455
12456 DBUG_ASSERT(keyno != MAX_KEY);
12457 DBUG_ASSERT(idx_cond != nullptr);
12458
12459 pushed_idx_cond = idx_cond;
12460 pushed_idx_cond_keyno = keyno;
12461 in_range_check_pushed_down = TRUE;
12462
12463 /* We will check the whole condition */
12464 DBUG_RETURN(nullptr);
12465 }
12466
12467 /*
12468 Checks if inplace alter is supported for a given operation.
12469 */
12470
12471 my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
12472 TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
12473 DBUG_ENTER_FUNC();
12474
12475 DBUG_ASSERT(ha_alter_info != nullptr);
12476
12477 if (ha_alter_info->handler_flags &
12478 ~(ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
12479 ALTER_DROP_UNIQUE_INDEX |
12480 ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
12481 ALTER_PARTITIONED |
12482 ALTER_ADD_UNIQUE_INDEX |
12483 ALTER_INDEX_ORDER |
12484 ALTER_CHANGE_CREATE_OPTION)) {
12485 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12486 }
12487
12488 /* We don't support unique keys on table w/ no primary keys */
12489 if ((ha_alter_info->handler_flags &
12490 ALTER_ADD_UNIQUE_INDEX) &&
12491 has_hidden_pk(altered_table)) {
12492 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12493 }
12494
12495 /* We only support changing auto_increment for table options. */
12496 if ((ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) &&
12497 !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) {
12498 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
12499 }
12500
12501 /* FIXME: MDEV-16099 Use alter algorithm=nocopy or algorithm=instant
12502 for non-InnoDB engine */
12503 DBUG_RETURN(my_core::HA_ALTER_INPLACE_COPY_LOCK);
12504 }
12505
12506 /**
12507 Allows the storage engine to update internal structures with concurrent
12508 writes blocked. If check_if_supported_inplace_alter() returns
12509 HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_COPY_LOCK.
12510 this function is called with exclusive lock otherwise the same level of
12511 locking as for inplace_alter_table() will be used.
12512
12513 @note Storage engines are responsible for reporting any errors by
12514 calling my_error()/print_error()
12515
12516 @note If this function reports error, commit_inplace_alter_table()
12517 will be called with commit= false.
12518
12519 @note For partitioning, failing to prepare one partition, means that
12520 commit_inplace_alter_table() will be called to roll back changes for
12521 all partitions. This means that commit_inplace_alter_table() might be
12522 called without prepare_inplace_alter_table() having been called first
12523 for a given partition.
12524
12525 @param altered_table TABLE object for new version of table.
12526 @param ha_alter_info Structure describing changes to be done
12527 by ALTER TABLE and holding data used
12528 during in-place alter.
12529
12530 @retval true Error
12531 @retval false Success
12532 */
bool ha_rocksdb::prepare_inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  Rdb_tbl_def *new_tdef = nullptr;
  std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr;
  std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
  uint old_n_keys = m_tbl_def->m_key_count;
  uint new_n_keys = altered_table->s->keys;
  std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
  std::unordered_set<GL_INDEX_ID> dropped_index_ids;
  uint n_dropped_keys = 0;
  uint n_added_keys = 0;
  ulonglong max_auto_incr = 0;

  // Build a new table definition only when the alter adds or drops indexes.
  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    // Account for the hidden primary key, which is not in s->keys.
    if (has_hidden_pk(altered_table)) {
      new_n_keys += 1;
    }

    const TABLE *const old_table = table;
    old_key_descr = m_tbl_def->m_key_descr_arr;
    new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];

    // The new Rdb_tbl_def takes ownership of new_key_descr and inherits the
    // current auto-increment and hidden-pk counters.
    new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
    new_tdef->m_key_descr_arr = new_key_descr;
    new_tdef->m_key_count = new_n_keys;
    new_tdef->m_auto_incr_val =
        m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
    new_tdef->m_hidden_pk_val =
        m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);

    if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
      /* Delete the new key descriptors */
      delete[] new_key_descr;

      /*
        Explicitly mark as nullptr so we don't accidentally remove entries
        from data dictionary on cleanup (or cause double delete[]).
      */
      new_tdef->m_key_descr_arr = nullptr;
      delete new_tdef;

      my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    uint i;
    uint j;

    /* Determine which(if any) key definition(s) need to be dropped */
    for (i = 0; i < ha_alter_info->index_drop_count; i++) {
      const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i];
      for (j = 0; j < old_n_keys; j++) {
        const KEY *const old_key =
            &old_table->key_info[old_key_descr[j]->get_keyno()];

        // compare_keys() returns false when the keys match.
        if (!compare_keys(old_key, dropped_key)) {
          dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
          break;
        }
      }
    }

    /* Determine which(if any) key definitions(s) need to be added */
    int identical_indexes_found = 0;
    for (i = 0; i < ha_alter_info->index_add_count; i++) {
      const KEY *const added_key =
          &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
      for (j = 0; j < new_n_keys; j++) {
        const KEY *const new_key =
            &altered_table->key_info[new_key_descr[j]->get_keyno()];
        if (!compare_keys(new_key, added_key)) {
          /*
            Check for cases where an 'identical' index is being dropped and
            re-added in a single ALTER statement. Turn this into a no-op as the
            index has not changed.

            E.G. Unique index -> non-unique index requires no change

            Note that cases where the index name remains the same but the
            key-parts are changed is already handled in create_inplace_key_defs.
            In these cases the index needs to be rebuilt.
          */
          if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) {
            dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id());
            identical_indexes_found++;
          } else {
            added_indexes.insert(new_key_descr[j]);
          }

          break;
        }
      }
    }

    // Drop/add counts exclude the indexes that turned out to be identical.
    n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found;
    n_added_keys = ha_alter_info->index_add_count - identical_indexes_found;
    DBUG_ASSERT(dropped_index_ids.size() == n_dropped_keys);
    DBUG_ASSERT(added_indexes.size() == n_added_keys);
    DBUG_ASSERT(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys));
  }
  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    // When no index changes are made, reuse the current table definition.
    if (!new_tdef) {
      new_tdef = m_tbl_def;
    }
    if (table->found_next_number_field) {
      max_auto_incr = load_auto_incr_value_from_index();
    }
  }

  // The alter context carries everything commit/rollback will need; it is
  // freed by the SQL layer after commit_inplace_alter_table().
  ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
      new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
      added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys,
      max_auto_incr);
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12658
12659 /**
12660 Alter the table structure in-place with operations specified using
12661 HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
12662 during this operation depends on the return value from
12663 check_if_supported_inplace_alter().
12664
12665 @note Storage engines are responsible for reporting any errors by
12666 calling my_error()/print_error()
12667
12668 @note If this function reports error, commit_inplace_alter_table()
12669 will be called with commit= false.
12670
12671 @param altered_table TABLE object for new version of table.
12672 @param ha_alter_info Structure describing changes to be done
12673 by ALTER TABLE and holding data used
12674 during in-place alter.
12675
12676 @retval true Error
12677 @retval false Success
12678 */
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);
  DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);

  // Context created by prepare_inplace_alter_table().
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  if (ha_alter_info->handler_flags &
      (ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    DBUG_ASSERT(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 ALTER_ADD_UNIQUE_INDEX))) {
      my_error(ER_OUT_OF_RESOURCES, MYF(0));
      // Non-zero return signals failure to the SQL layer;
      // commit_inplace_alter_table() will then be called with commit=false.
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  // Debug hook: force a failure to exercise the alter rollback path.
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12728
12729 /**
12730 Scan the Primary Key index entries and populate the new secondary keys.
12731 */
12732 int ha_rocksdb::inplace_populate_sk(
12733 TABLE *const new_table_arg,
12734 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
12735 DBUG_ENTER_FUNC();
12736 int res = HA_EXIT_SUCCESS;
12737 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
12738 rocksdb::WriteBatch *const batch = wb.get();
12739
12740 /* Update the data dictionary */
12741 std::unordered_set<GL_INDEX_ID> create_index_ids;
12742 for (const auto &index : indexes) {
12743 create_index_ids.insert(index->get_gl_index_id());
12744 }
12745 dict_manager.add_create_index(create_index_ids, batch);
12746 res = dict_manager.commit(batch);
12747 if (res != HA_EXIT_SUCCESS) {
12748 return res;
12749 }
12750
12751 /*
12752 Add uncommitted key definitons to ddl_manager. We need to do this
12753 so that the property collector can find this keydef when it needs to
12754 update stats. The property collector looks for the keydef in the
12755 data dictionary, but it won't be there yet since this key definition
12756 is still in the creation process.
12757 */
12758 ddl_manager.add_uncommitted_keydefs(indexes);
12759
12760 const bool hidden_pk_exists = has_hidden_pk(table);
12761
12762 Rdb_transaction *tx = get_or_create_tx(table->in_use);
12763
12764 /*
12765 There is one specific scenario where m_sst_info may not be nullptr. This
12766 happens if the handler we're using happens to be the handler where the PK
12767 bulk load was done on. The sequence of events that lead to this is as
12768 follows (T1 is PK bulk load, T2 is SK alter table):
12769
12770 T1: Execute last INSERT statement
12771 T1: Return TABLE and handler object back to Table_cache_manager
12772 T1: Close connection
12773 T2: Execute ALTER statement
12774 T2: Take same TABLE/handler from Table_cache_manager
12775 T2: Call closefrm which will call finalize_bulk_load on every other open
12776 table/handler *except* the one it's on.
12777 T2: Acquire stale snapshot of PK
12778 T1: Call finalize_bulk_load
12779
12780 This is rare because usually, closefrm will call the destructor (and thus
12781 finalize_bulk_load) on the handler where PK bulk load is done. However, if
12782 the thread ids of the bulk load thread and the alter thread differ by a
12783 multiple of table_cache_instances (8 by default), then they hash to the
12784 same bucket in Table_cache_manager and the alter thread will not not call
12785 the destructor on the handler it is holding. Thus, its m_sst_info will not
12786 be nullptr.
12787
12788 At this point, it is safe to refresh the snapshot because we know all other
12789 open handlers have been closed at this point, and the one we're on is the
12790 only one left.
12791 */
12792 if (m_sst_info) {
12793 if ((res = finalize_bulk_load())) {
12794 DBUG_RETURN(res);
12795 }
12796 tx->commit();
12797 }
12798
12799 const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
12800 const ulonglong rdb_merge_combine_read_size =
12801 THDVAR(ha_thd(), merge_combine_read_size);
12802 const ulonglong rdb_merge_tmp_file_removal_delay =
12803 THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
12804
12805 for (const auto &index : indexes) {
12806 bool is_unique_index =
12807 new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
12808
12809 Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size,
12810 rdb_merge_combine_read_size,
12811 rdb_merge_tmp_file_removal_delay,
12812 index->get_cf());
12813
12814 if ((res = rdb_merge.init())) {
12815 DBUG_RETURN(res);
12816 }
12817
12818 /*
12819 Note: We pass in the currently existing table + tbl_def object here,
12820 as the pk index position may have changed in the case of hidden primary
12821 keys.
12822 */
12823 const uint pk = pk_index(table, m_tbl_def);
12824 ha_index_init(pk, true);
12825
12826 /* Scan each record in the primary key in order */
12827 for (res = index_first(table->record[0]); res == 0;
12828 res = index_next(table->record[0])) {
12829 longlong hidden_pk_id = 0;
12830 if (hidden_pk_exists &&
12831 (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) {
12832 // NO_LINT_DEBUG
12833 sql_print_error("Error retrieving hidden pk id.");
12834 ha_index_end();
12835 DBUG_RETURN(res);
12836 }
12837
12838 /* Create new secondary index entry */
12839 const int new_packed_size = index->pack_record(
12840 new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
12841 &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
12842 nullptr, m_ttl_bytes);
12843
12844 const rocksdb::Slice key = rocksdb::Slice(
12845 reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
12846 const rocksdb::Slice val =
12847 rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
12848 m_sk_tails.get_current_pos());
12849
12850 /*
12851 Add record to offset tree in preparation for writing out to
12852 disk in sorted chunks.
12853 */
12854 if ((res = rdb_merge.add(key, val))) {
12855 ha_index_end();
12856 DBUG_RETURN(res);
12857 }
12858 }
12859
12860 if (res != HA_ERR_END_OF_FILE) {
12861 // NO_LINT_DEBUG
12862 sql_print_error("Error retrieving index entry from primary key.");
12863 ha_index_end();
12864 DBUG_RETURN(res);
12865 }
12866
12867 ha_index_end();
12868
12869 /*
12870 Perform an n-way merge of n sorted buffers on disk, then writes all
12871 results to RocksDB via SSTFileWriter API.
12872 */
12873 rocksdb::Slice merge_key;
12874 rocksdb::Slice merge_val;
12875
12876 struct unique_sk_buf_info sk_info;
12877 sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
12878 sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
12879
12880 while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
12881 /* Perform uniqueness check if needed */
12882 if (is_unique_index) {
12883 if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
12884 /*
12885 Duplicate entry found when trying to create unique secondary key.
12886 We need to unpack the record into new_table_arg->record[0] as it
12887 is used inside print_keydup_error so that the error message shows
12888 the duplicate record.
12889 */
12890 if (index->unpack_record(
12891 new_table_arg, new_table_arg->record[0], &merge_key,
12892 &merge_val, m_converter->get_verify_row_debug_checksums())) {
12893 /* Should never reach here */
12894 DBUG_ASSERT(0);
12895 }
12896
12897 print_keydup_error(new_table_arg,
12898 &new_table_arg->key_info[index->get_keyno()],
12899 MYF(0));
12900 DBUG_RETURN(ER_DUP_ENTRY);
12901 }
12902 }
12903
12904 /*
12905 Insert key and slice to SST via SSTFileWriter API.
12906 */
12907 if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) {
12908 break;
12909 }
12910 }
12911
12912 /*
12913 Here, res == -1 means that we are finished, while > 0 means an error
12914 occurred.
12915 */
12916 if (res > 0) {
12917 // NO_LINT_DEBUG
12918 sql_print_error("Error while bulk loading keys in external merge sort.");
12919 DBUG_RETURN(res);
12920 }
12921
12922 bool is_critical_error;
12923 res = tx->finish_bulk_load(&is_critical_error);
12924 if (res && is_critical_error) {
12925 // NO_LINT_DEBUG
12926 sql_print_error("Error finishing bulk load.");
12927 DBUG_RETURN(res);
12928 }
12929 }
12930
12931 /*
12932 Explicitly tell jemalloc to clean up any unused dirty pages at this point.
12933 See https://reviews.facebook.net/D63723 for more details.
12934 */
12935 purge_all_jemalloc_arenas();
12936
12937 DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
12938 DBUG_RETURN(res);
12939 }
12940
12941 /**
12942 Commit or rollback the changes made during prepare_inplace_alter_table()
12943 and inplace_alter_table() inside the storage engine.
12944 Note that in case of rollback the allowed level of concurrency during
12945 this operation will be the same as for inplace_alter_table() and thus
12946 might be higher than during prepare_inplace_alter_table(). (For example,
12947 concurrent writes were blocked during prepare, but might not be during
12948 rollback).
12949
12950 @note Storage engines are responsible for reporting any errors by
12951 calling my_error()/print_error()
12952
12953 @note If this function with commit= true reports error, it will be called
12954 again with commit= false.
12955
12956 @note In case of partitioning, this function might be called for rollback
12957 without prepare_inplace_alter_table() having been called first.
12958 Also partitioned tables sets ha_alter_info->group_commit_ctx to a NULL
12959 terminated array of the partitions handlers and if all of them are
12960 committed as one, then group_commit_ctx should be set to NULL to indicate
12961 to the partitioning handler that all partitions handlers are committed.
12962 @see prepare_inplace_alter_table().
12963
12964 @param altered_table TABLE object for new version of table.
12965 @param ha_alter_info Structure describing changes to be done
12966 by ALTER TABLE and holding data used
12967 during in-place alter.
12968 @param commit True => Commit, False => Rollback.
12969
12970 @retval true Error
12971 @retval false Success
12972 */
12973 bool ha_rocksdb::commit_inplace_alter_table(
12974 my_core::TABLE *const altered_table,
12975 my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
12976 DBUG_ENTER_FUNC();
12977
12978 DBUG_ASSERT(altered_table != nullptr);
12979 DBUG_ASSERT(ha_alter_info != nullptr);
12980
12981 Rdb_inplace_alter_ctx *const ctx0 =
12982 static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);
12983
12984 DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table");
12985
12986 /*
12987 IMPORTANT: When rollback is requested, mysql will abort with
12988 an assertion failure. That means every failed commit during inplace alter
12989 table will result in a fatal error on the server. Indexes ongoing creation
12990 will be detected when the server restarts, and dropped.
12991
12992 For partitioned tables, a rollback call to this function (commit == false)
12993 is done for each partition. A successful commit call only executes once
12994 for all partitions.
12995 */
12996 if (!commit) {
12997 /* If ctx has not been created yet, nothing to do here */
12998 if (!ctx0) {
12999 DBUG_RETURN(HA_EXIT_SUCCESS);
13000 }
13001
13002 /*
13003 Cannot call destructor for Rdb_tbl_def directly because we don't want to
13004 erase the mappings inside the ddl_manager, as the old_key_descr is still
13005 using them.
13006 */
13007 if (ctx0->m_new_key_descr) {
13008 /* Delete the new key descriptors */
13009 for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
13010 ctx0->m_new_key_descr[i] = nullptr;
13011 }
13012
13013 delete[] ctx0->m_new_key_descr;
13014 ctx0->m_new_key_descr = nullptr;
13015 ctx0->m_new_tdef->m_key_descr_arr = nullptr;
13016
13017 delete ctx0->m_new_tdef;
13018 }
13019
13020 /* Remove uncommitted key definitons from ddl_manager */
13021 ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);
13022
13023 /* Rollback any partially created indexes */
13024 dict_manager.rollback_ongoing_index_creation();
13025
13026 DBUG_RETURN(HA_EXIT_SUCCESS);
13027 }
13028
13029 DBUG_ASSERT(ctx0);
13030
13031 /*
13032 For partitioned tables, we need to commit all changes to all tables at
13033 once, unlike in the other inplace alter API methods.
13034 */
13035 inplace_alter_handler_ctx **ctx_array;
13036 inplace_alter_handler_ctx *ctx_single[2];
13037
13038 if (ha_alter_info->group_commit_ctx) {
13039 DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
13040 ctx_array = ha_alter_info->group_commit_ctx;
13041 } else {
13042 ctx_single[0] = ctx0;
13043 ctx_single[1] = nullptr;
13044 ctx_array = ctx_single;
13045 }
13046
13047 DBUG_ASSERT(ctx0 == ctx_array[0]);
13048 ha_alter_info->group_commit_ctx = nullptr;
13049
13050 if (ha_alter_info->handler_flags &
13051 (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
13052 ALTER_DROP_UNIQUE_INDEX |
13053 ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
13054 ALTER_ADD_UNIQUE_INDEX)) {
13055 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13056 rocksdb::WriteBatch *const batch = wb.get();
13057 std::unordered_set<GL_INDEX_ID> create_index_ids;
13058
13059 m_tbl_def = ctx0->m_new_tdef;
13060 m_key_descr_arr = m_tbl_def->m_key_descr_arr;
13061 m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];
13062
13063 dict_manager.lock();
13064 for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
13065 Rdb_inplace_alter_ctx *const ctx =
13066 static_cast<Rdb_inplace_alter_ctx *>(*pctx);
13067
13068 /* Mark indexes to be dropped */
13069 dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);
13070
13071 for (const auto &index : ctx->m_added_indexes) {
13072 create_index_ids.insert(index->get_gl_index_id());
13073 }
13074
13075 if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
13076 /*
13077 Failed to write new entry into data dictionary, this should never
13078 happen.
13079 */
13080 DBUG_ASSERT(0);
13081 }
13082
13083 /*
13084 Remove uncommitted key definitons from ddl_manager, as they are now
13085 committed into the data dictionary.
13086 */
13087 ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
13088 }
13089
13090 if (dict_manager.commit(batch)) {
13091 /*
13092 Should never reach here. We assume MyRocks will abort if commit fails.
13093 */
13094 DBUG_ASSERT(0);
13095 }
13096
13097 dict_manager.unlock();
13098
13099 /* Mark ongoing create indexes as finished/remove from data dictionary */
13100 dict_manager.finish_indexes_operation(
13101 create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
13102
13103 rdb_drop_idx_thread.signal();
13104 }
13105
13106 if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
13107 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
13108 rocksdb::WriteBatch *const batch = wb.get();
13109 std::unordered_set<GL_INDEX_ID> create_index_ids;
13110
13111 ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;
13112
13113 for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
13114 Rdb_inplace_alter_ctx *const ctx =
13115 static_cast<Rdb_inplace_alter_ctx *>(*pctx);
13116 auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
13117 dict_manager.put_auto_incr_val(
13118 batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
13119 true /* overwrite */);
13120 ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
13121 }
13122
13123 if (dict_manager.commit(batch)) {
13124 DBUG_ASSERT(0);
13125 }
13126 }
13127
13128 DBUG_RETURN(HA_EXIT_SUCCESS);
13129 }
13130
/* Name of the generated SHOW_FUNC handler for a given RocksDB ticker. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW_FUNC handler that copies the current value of the RocksDB
  ticker `key` into rocksdb_status_counters.`name` and exposes it as a
  SHOW_LONGLONG status variable.
*/
#define DEF_SHOW_FUNC(name, key)                                          \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                        \
        rocksdb_stats->getTickerCount(rocksdb::key);                      \
    var->type = SHOW_LONGLONG;                                            \
    var->value = reinterpret_cast<char *>(&rocksdb_status_counters.name); \
    return HA_EXIT_SUCCESS;                                               \
  }

/* Status-variable entry backed by a SHOW_FNAME-generated function. */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* Status-variable entry backed directly by a pointer to a counter. */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* Status-variable entry with an explicit (unprefixed) name. */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
13150
/*
  Cached copies of RocksDB ticker counters.  Each field is refreshed from
  rocksdb_stats by the corresponding DEF_SHOW_FUNC handler when the status
  variable is read; the field names must match the DEF_SHOW_FUNC/
  DEF_STATUS_VAR invocations below.
*/
struct rocksdb_status_counters_t {
  /* Block cache hit/miss/insert counters */
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  /* Bloom filter and memtable effectiveness */
  uint64_t bloom_filter_useful;
  uint64_t bloom_filter_full_positive;
  uint64_t bloom_filter_full_true_positive;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  /* Compaction key-drop reasons */
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  /* Read/write volume */
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  /* Iterator activity */
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  /* File and stall statistics */
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  /* WAL / write path */
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  /* Flush / compaction volume */
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};

/* Single global instance updated by the generated SHOW_FUNC handlers. */
static rocksdb_status_counters_t rocksdb_status_counters;
13227
/*
  Instantiate one SHOW_FUNC handler per exported RocksDB ticker.  The first
  argument is the field in rocksdb_status_counters_t; the second is the
  rocksdb::Tickers enum value it is read from.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE)
DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
13300
13301 static void myrocks_update_status() {
13302 export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
13303 export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
13304 export_stats.rows_read = global_stats.rows[ROWS_READ];
13305 export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
13306 export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
13307 export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
13308 export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];
13309
13310 export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
13311 export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
13312 export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
13313 export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
13314
13315 export_stats.queries_point = global_stats.queries[QUERIES_POINT];
13316 export_stats.queries_range = global_stats.queries[QUERIES_RANGE];
13317
13318 export_stats.covered_secondary_key_lookups =
13319 global_stats.covered_secondary_key_lookups;
13320 }
13321
13322 static void myrocks_update_memory_status() {
13323 std::vector<rocksdb::DB *> dbs;
13324 std::unordered_set<const rocksdb::Cache *> cache_set;
13325 dbs.push_back(rdb);
13326 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
13327 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
13328 &temp_usage_by_type);
13329 memory_stats.memtable_total =
13330 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal];
13331 memory_stats.memtable_unflushed =
13332 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed];
13333 }
13334
/*
  Nested status-variable array exposed under the "rocksdb_" prefix via
  show_myrocks_vars().  Each entry points directly into export_stats or
  memory_stats, which are refreshed just before this array is read.
*/
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind", &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired", &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered", &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted",
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted",
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated",
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total", &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed", &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point", &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups",
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    /* End-of-array marker required by the SHOW_VAR framework. */
    {NullS, NullS, SHOW_LONG}};
13370
13371 static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
13372 myrocks_update_status();
13373 myrocks_update_memory_status();
13374 var->type = SHOW_ARRAY;
13375 var->value = reinterpret_cast<char *>(&myrocks_status_variables);
13376 }
13377
13378 static ulonglong io_stall_prop_value(
13379 const std::map<std::string, std::string> &props, const std::string &key) {
13380 std::map<std::string, std::string>::const_iterator iter =
13381 props.find("io_stalls." + key);
13382 if (iter != props.end()) {
13383 return std::stoull(iter->second);
13384 } else {
13385 DBUG_PRINT("warning",
13386 ("RocksDB GetMapPropery hasn't returned key=%s", key.c_str()));
13387 DBUG_ASSERT(0);
13388 return 0;
13389 }
13390 }
13391
13392 static void update_rocksdb_stall_status() {
13393 st_io_stall_stats local_io_stall_stats;
13394 for (const auto &cf_name : cf_manager.get_cf_names()) {
13395 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
13396 if (cfh == nullptr) {
13397 continue;
13398 }
13399
13400 std::map<std::string, std::string> props;
13401 if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats", &props)) {
13402 continue;
13403 }
13404
13405 local_io_stall_stats.level0_slowdown +=
13406 io_stall_prop_value(props, "level0_slowdown");
13407 local_io_stall_stats.level0_slowdown_with_compaction +=
13408 io_stall_prop_value(props, "level0_slowdown_with_compaction");
13409 local_io_stall_stats.level0_numfiles +=
13410 io_stall_prop_value(props, "level0_numfiles");
13411 local_io_stall_stats.level0_numfiles_with_compaction +=
13412 io_stall_prop_value(props, "level0_numfiles_with_compaction");
13413 local_io_stall_stats.stop_for_pending_compaction_bytes +=
13414 io_stall_prop_value(props, "stop_for_pending_compaction_bytes");
13415 local_io_stall_stats.slowdown_for_pending_compaction_bytes +=
13416 io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes");
13417 local_io_stall_stats.memtable_compaction +=
13418 io_stall_prop_value(props, "memtable_compaction");
13419 local_io_stall_stats.memtable_slowdown +=
13420 io_stall_prop_value(props, "memtable_slowdown");
13421 local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop");
13422 local_io_stall_stats.total_slowdown +=
13423 io_stall_prop_value(props, "total_slowdown");
13424 }
13425 io_stall_stats = local_io_stall_stats;
13426 }
13427
/*
  Nested status-variable array for IO stall counters, exposed via
  show_rocksdb_stall_vars().  Entries point into io_stall_stats, which is
  recomputed from RocksDB cfstats just before this array is read.
*/
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops",
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns",
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops",
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns",
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
13455
13456 static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) {
13457 update_rocksdb_stall_status();
13458 var->type = SHOW_ARRAY;
13459 var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables);
13460 }
13461
13462 static SHOW_VAR rocksdb_status_vars[] = {
13463 DEF_STATUS_VAR(block_cache_miss),
13464 DEF_STATUS_VAR(block_cache_hit),
13465 DEF_STATUS_VAR(block_cache_add),
13466 DEF_STATUS_VAR(block_cache_add_failures),
13467 DEF_STATUS_VAR(block_cache_index_miss),
13468 DEF_STATUS_VAR(block_cache_index_hit),
13469 DEF_STATUS_VAR(block_cache_index_add),
13470 DEF_STATUS_VAR(block_cache_index_bytes_insert),
13471 DEF_STATUS_VAR(block_cache_index_bytes_evict),
13472 DEF_STATUS_VAR(block_cache_filter_miss),
13473 DEF_STATUS_VAR(block_cache_filter_hit),
13474 DEF_STATUS_VAR(block_cache_filter_add),
13475 DEF_STATUS_VAR(block_cache_filter_bytes_insert),
13476 DEF_STATUS_VAR(block_cache_filter_bytes_evict),
13477 DEF_STATUS_VAR(block_cache_bytes_read),
13478 DEF_STATUS_VAR(block_cache_bytes_write),
13479 DEF_STATUS_VAR(block_cache_data_bytes_insert),
13480 DEF_STATUS_VAR(block_cache_data_miss),
13481 DEF_STATUS_VAR(block_cache_data_hit),
13482 DEF_STATUS_VAR(block_cache_data_add),
13483 DEF_STATUS_VAR(bloom_filter_useful),
13484 DEF_STATUS_VAR(bloom_filter_full_positive),
13485 DEF_STATUS_VAR(bloom_filter_full_true_positive),
13486 DEF_STATUS_VAR(memtable_hit),
13487 DEF_STATUS_VAR(memtable_miss),
13488 DEF_STATUS_VAR(get_hit_l0),
13489 DEF_STATUS_VAR(get_hit_l1),
13490 DEF_STATUS_VAR(get_hit_l2_and_up),
13491 DEF_STATUS_VAR(compaction_key_drop_new),
13492 DEF_STATUS_VAR(compaction_key_drop_obsolete),
13493 DEF_STATUS_VAR(compaction_key_drop_user),
13494 DEF_STATUS_VAR(number_keys_written),
13495 DEF_STATUS_VAR(number_keys_read),
13496 DEF_STATUS_VAR(number_keys_updated),
13497 DEF_STATUS_VAR(bytes_written),
13498 DEF_STATUS_VAR(bytes_read),
13499 DEF_STATUS_VAR(number_db_seek),
13500 DEF_STATUS_VAR(number_db_seek_found),
13501 DEF_STATUS_VAR(number_db_next),
13502 DEF_STATUS_VAR(number_db_next_found),
13503 DEF_STATUS_VAR(number_db_prev),
13504 DEF_STATUS_VAR(number_db_prev_found),
13505 DEF_STATUS_VAR(iter_bytes_read),
13506 DEF_STATUS_VAR(no_file_closes),
13507 DEF_STATUS_VAR(no_file_opens),
13508 DEF_STATUS_VAR(no_file_errors),
13509 DEF_STATUS_VAR(stall_micros),
13510 DEF_STATUS_VAR(num_iterators),
13511 DEF_STATUS_VAR(number_multiget_get),
13512 DEF_STATUS_VAR(number_multiget_keys_read),
13513 DEF_STATUS_VAR(number_multiget_bytes_read),
13514 DEF_STATUS_VAR(number_deletes_filtered),
13515 DEF_STATUS_VAR(number_merge_failures),
13516 DEF_STATUS_VAR(bloom_filter_prefix_checked),
13517 DEF_STATUS_VAR(bloom_filter_prefix_useful),
13518 DEF_STATUS_VAR(number_reseeks_iteration),
13519 DEF_STATUS_VAR(getupdatessince_calls),
13520 DEF_STATUS_VAR(block_cachecompressed_miss),
13521 DEF_STATUS_VAR(block_cachecompressed_hit),
13522 DEF_STATUS_VAR(wal_synced),
13523 DEF_STATUS_VAR(wal_bytes),
13524 DEF_STATUS_VAR(write_self),
13525 DEF_STATUS_VAR(write_other),
13526 DEF_STATUS_VAR(write_timedout),
13527 DEF_STATUS_VAR(write_wal),
13528 DEF_STATUS_VAR(flush_write_bytes),
13529 DEF_STATUS_VAR(compact_read_bytes),
13530 DEF_STATUS_VAR(compact_write_bytes),
13531 DEF_STATUS_VAR(number_superversion_acquires),
13532 DEF_STATUS_VAR(number_superversion_releases),
13533 DEF_STATUS_VAR(number_superversion_cleanups),
13534 DEF_STATUS_VAR(number_block_not_compressed),
13535 DEF_STATUS_VAR_PTR("row_lock_deadlocks", &rocksdb_row_lock_deadlocks,
13536 SHOW_LONGLONG),
13537 DEF_STATUS_VAR_PTR("row_lock_wait_timeouts",
13538 &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
13539 DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
13540 &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
13541 DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
13542 SHOW_LONGLONG),
13543 DEF_STATUS_VAR_PTR("manual_compactions_processed",
13544 &rocksdb_manual_compactions_processed, SHOW_LONGLONG),
13545 DEF_STATUS_VAR_PTR("manual_compactions_running",
13546 &rocksdb_manual_compactions_running, SHOW_LONGLONG),
13547 DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
13548 SHOW_LONGLONG),
13549 DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
13550 SHOW_LONGLONG),
13551 DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
13552 &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
13553 DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
13554 SHOW_LONGLONG),
13555 DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
13556 SHOW_LONGLONG),
13557 #ifndef DBUG_OFF
13558 DEF_STATUS_VAR_PTR("num_get_for_update_calls",
13559 &rocksdb_num_get_for_update_calls, SHOW_LONGLONG),
13560 #endif
13561 // the variables generated by SHOW_FUNC are sorted only by prefix (first
13562 // arg in the tuple below), so make sure it is unique to make sorting
13563 // deterministic as quick sort is not stable
13564 {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
13565 {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
13566 SHOW_FUNC},
13567 {NullS, NullS, SHOW_LONG}};
13568
13569 /*
13570 Background thread's main logic
13571 */
13572
/*
  Background thread main loop: wakes up roughly once per second to flush
  the WAL and to recalculate index statistics, until it is signaled to
  stop when the storage engine is being unloaded.
*/
void Rdb_background_thread::run() {
  // How many seconds to wait till flushing the WAL next time.
  const int WAKE_UP_INTERVAL = 1;

  timespec ts_next_sync;
  set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

  for (;;) {
    // Wait until the next timeout or until we receive a signal to stop the
    // thread. Request to stop the thread should only be triggered when the
    // storage engine is being unloaded.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);

    // Check that we receive only the expected error codes.
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Snapshot the flags while the mutex is held, then release it before
    // doing any of the slow work below.
    const bool local_stop = m_stop;
    const bool local_save_stats = m_save_stats;
    reset();
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    if (local_stop) {
      // If we're here then that's because condition variable was signaled by
      // another thread and we're shutting down. Break out the loop to make
      // sure that shutdown thread can proceed.
      break;
    }

    // This path should be taken only when the timer expired.
    // NOTE(review): this assumes the condvar is only signaled together with
    // m_stop; a wakeup that sets m_save_stats alone would trip this assert
    // in debug builds -- confirm against the signaling side.
    DBUG_ASSERT(ret == ETIMEDOUT);

    if (local_save_stats) {
      ddl_manager.persist_stats();
    }

    // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
    // pthread_cond_timedwait()) to wait on.
    set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

    // Flush the WAL. Sync it for both background and never modes to copy
    // InnoDB's behavior. For mode never, the wal file isn't even written,
    // whereas background writes to the wal file, but issues the syncs in a
    // background thread.
    if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
        !rocksdb_db_options->allow_mmap_writes) {
      const rocksdb::Status s = rdb->FlushWAL(true);
      if (!s.ok()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
    }
    // Recalculate statistics for indexes.
    if (rocksdb_stats_recalc_rate) {
      std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>>
          to_recalc;

      // Refill the global work queue from the DDL manager when it is empty.
      if (rdb_indexes_to_recalc.empty()) {
        struct Rdb_index_collector : public Rdb_tables_scanner {
          int add_table(Rdb_tbl_def *tdef) override {
            for (uint i = 0; i < tdef->m_key_count; i++) {
              rdb_indexes_to_recalc.push_back(
                  tdef->m_key_descr_arr[i]->get_gl_index_id());
            }
            return HA_EXIT_SUCCESS;
          }
        } collector;
        ddl_manager.scan_for_tables(&collector);
      }

      // Take at most rocksdb_stats_recalc_rate indexes per wake-up; indexes
      // whose key definition was dropped in the meantime are skipped
      // (safe_find returns an empty pointer for those).
      while (to_recalc.size() < rocksdb_stats_recalc_rate &&
             !rdb_indexes_to_recalc.empty()) {
        const auto index_id = rdb_indexes_to_recalc.back();
        rdb_indexes_to_recalc.pop_back();

        std::shared_ptr<const Rdb_key_def> keydef =
            ddl_manager.safe_find(index_id);

        if (keydef) {
          to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef));
        }
      }

      if (!to_recalc.empty()) {
        calculate_stats(to_recalc, false);
      }
    }

  }

  // save remaining stats which might've left unsaved
  ddl_manager.persist_stats();
}
13665
13666 /*
13667 A background thread to handle manual compactions,
13668 except for dropping indexes/tables. Every second, it checks
13669 pending manual compactions, and it calls CompactRange if there is.
13670 */
void Rdb_manual_compaction_thread::run() {
  mysql_mutex_init(0, &m_mc_mutex, MY_MUTEX_INIT_FAST);
  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  // Loop invariant: m_signal_mutex is held at the top of every iteration
  // and whenever we break out; it is released after the loop.
  for (;;) {
    if (m_stop) {
      break;
    }
    timespec ts;
    set_timespec(ts, 1);

    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure, no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
    // Grab the first item and proceed, if not empty.
    if (m_requests.empty()) {
      RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
      RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
      continue;
    }
    // Mark the request RUNNING under m_mc_mutex so that
    // clear_manual_compaction_request(init_only=true) will no longer
    // erase it from under us.
    Manual_compaction_request &mcr = m_requests.begin()->second;
    DBUG_ASSERT(mcr.cf != nullptr);
    DBUG_ASSERT(mcr.state == Manual_compaction_request::INITED);
    mcr.state = Manual_compaction_request::RUNNING;
    RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);

    DBUG_ASSERT(mcr.state == Manual_compaction_request::RUNNING);
    // NO_LINT_DEBUG
    sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id,
                          mcr.cf->GetName().c_str());
    rocksdb_manual_compactions_running++;
    if (rocksdb_debug_manual_compaction_delay > 0) {
      // Debug-only throttle used by tests.
      my_sleep(rocksdb_debug_manual_compaction_delay * 1000000);
    }
    // CompactRange may take a very long time. On clean shutdown,
    // it is cancelled by CancelAllBackgroundWork, then status is
    // set to shutdownInProgress.
    const rocksdb::Status s = rdb->CompactRange(
        getCompactRangeOptions(mcr.concurrency), mcr.cf, mcr.start, mcr.limit);
    rocksdb_manual_compactions_running--;
    if (s.ok()) {
      // NO_LINT_DEBUG
      sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id,
                            mcr.cf->GetName().c_str());
    } else {
      // NO_LINT_DEBUG
      sql_print_information("Manual Compaction id %d cf %s aborted. %s",
                            mcr.mc_id, mcr.cf->GetName().c_str(), s.getState());
      if (!s.IsShutdownInProgress()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      } else {
        DBUG_ASSERT(m_requests.size() == 1);
      }
    }
    rocksdb_manual_compactions_processed++;
    clear_manual_compaction_request(mcr.mc_id, false);
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
  }
  // Shutdown path: drop any requests that were never started.
  clear_all_manual_compaction_requests();
  DBUG_ASSERT(m_requests.empty());
  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
  mysql_mutex_destroy(&m_mc_mutex);
}
13740
/*
  Atomically drop every pending manual compaction request. Called from
  run() during shutdown, after the main loop has exited.
*/
void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() {
  RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
  m_requests.clear();
  RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
}
13746
13747 void Rdb_manual_compaction_thread::clear_manual_compaction_request(
13748 int mc_id, bool init_only) {
13749 bool erase = true;
13750 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13751 auto it = m_requests.find(mc_id);
13752 if (it != m_requests.end()) {
13753 if (init_only) {
13754 Manual_compaction_request mcr = it->second;
13755 if (mcr.state != Manual_compaction_request::INITED) {
13756 erase = false;
13757 }
13758 }
13759 if (erase) {
13760 m_requests.erase(it);
13761 }
13762 } else {
13763 // Current code path guarantees that erasing by the same mc_id happens
13764 // at most once. INITED state may be erased by a thread that requested
13765 // the compaction. RUNNING state is erased by mc thread only.
13766 DBUG_ASSERT(0);
13767 }
13768 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13769 }
13770
13771 int Rdb_manual_compaction_thread::request_manual_compaction(
13772 rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice *start,
13773 rocksdb::Slice *limit, int concurrency) {
13774 int mc_id = -1;
13775 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13776 if (m_requests.size() >= rocksdb_max_manual_compactions) {
13777 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13778 return mc_id;
13779 }
13780 Manual_compaction_request mcr;
13781 mc_id = mcr.mc_id = ++m_latest_mc_id;
13782 mcr.state = Manual_compaction_request::INITED;
13783 mcr.cf = cf;
13784 mcr.start = start;
13785 mcr.limit = limit;
13786 mcr.concurrency = concurrency;
13787 m_requests.insert(std::make_pair(mcr.mc_id, mcr));
13788 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13789 return mc_id;
13790 }
13791
13792 bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) {
13793 bool finished = false;
13794 RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
13795 if (m_requests.count(mc_id) == 0) {
13796 finished = true;
13797 }
13798 RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
13799 return finished;
13800 }
13801
13802 /**
13803 * Locking read + Not Found + Read Committed occurs if we accessed
13804 * a row by Seek, tried to lock it, failed, released and reacquired the
13805 * snapshot (because of READ COMMITTED mode) and the row was deleted by
13806 * someone else in the meantime.
13807 * If so, we either just skipping the row, or re-creating a snapshot
13808 * and seek again. In both cases, Read Committed constraint is not broken.
13809 */
13810 bool ha_rocksdb::should_skip_invalidated_record(const int rc) {
13811 if ((m_lock_rows != RDB_LOCK_NONE && rc == HA_ERR_KEY_NOT_FOUND &&
13812 my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED)) {
13813 return true;
13814 }
13815 return false;
13816 }
13817 /**
13818 * Indicating snapshot needs to be re-created and retrying seek again,
13819 * instead of returning errors or empty set. This is normally applicable
13820 * when hitting kBusy when locking the first row of the transaction,
13821 * with Repeatable Read isolation level.
13822 */
13823 bool ha_rocksdb::should_recreate_snapshot(const int rc,
13824 const bool is_new_snapshot) {
13825 if (should_skip_invalidated_record(rc) ||
13826 (rc == HA_ERR_ROCKSDB_STATUS_BUSY && is_new_snapshot)) {
13827 return true;
13828 }
13829 return false;
13830 }
13831
13832 /**
13833 * If calling put/delete/singledelete without locking the row,
13834 * it is necessary to pass assume_tracked=false to RocksDB TX API.
13835 * Read Free Replication and Blind Deletes are the cases when
13836 * using TX API and skipping row locking.
13837 */
13838 bool ha_rocksdb::can_assume_tracked(THD *thd) {
13839 if (/* MARIAROCKS_NOT_YET use_read_free_rpl() ||*/ (THDVAR(thd, blind_delete_primary_key))) {
13840 return false;
13841 }
13842 return true;
13843 }
13844
13845 bool ha_rocksdb::check_bloom_and_set_bounds(
13846 THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
13847 const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
13848 uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
13849 rocksdb::Slice *upper_bound_slice) {
13850 bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
13851 if (!can_use_bloom) {
13852 setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound,
13853 lower_bound_slice, upper_bound_slice);
13854 }
13855 return can_use_bloom;
13856 }
13857
13858 /**
13859 Deciding if it is possible to use bloom filter or not.
13860
13861 @detail
13862 Even if bloom filter exists, it is not always possible
13863 to use bloom filter. If using bloom filter when you shouldn't,
13864 false negative may happen -- fewer rows than expected may be returned.
13865 It is users' responsibility to use bloom filter correctly.
13866
13867 If bloom filter does not exist, return value does not matter because
13868 RocksDB does not use bloom filter internally.
13869
13870 @param kd
13871 @param eq_cond Equal condition part of the key. This always includes
13872 system index id (4 bytes).
13873 @param use_all_keys True if all key parts are set with equal conditions.
13874 This is aware of extended keys.
13875 */
13876 bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
13877 const rocksdb::Slice &eq_cond,
13878 const bool use_all_keys) {
13879 bool can_use = false;
13880
13881 if (THDVAR(thd, skip_bloom_filter_on_read)) {
13882 return can_use;
13883 }
13884
13885 const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
13886 if (prefix_extractor) {
13887 /*
13888 This is an optimized use case for CappedPrefixTransform.
13889 If eq_cond length >= prefix extractor length and if
13890 all keys are used for equal lookup, it is
13891 always possible to use bloom filter.
13892
13893 Prefix bloom filter can't be used on descending scan with
13894 prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
13895 RocksDB's limitation. On ascending (or not sorting) scan,
13896 keys longer than the capped prefix length will be truncated down
13897 to the capped length and the resulting key is added to the bloom filter.
13898
13899 Keys shorter than the capped prefix length will be added to
13900 the bloom filter. When keys are looked up, key conditionals
13901 longer than the capped length can be used; key conditionals
13902 shorter require all parts of the key to be available
13903 for the short key match.
13904 */
13905 if ((use_all_keys && prefix_extractor->InRange(eq_cond)) ||
13906 prefix_extractor->SameResultWhenAppended(eq_cond)) {
13907 can_use = true;
13908 } else {
13909 can_use = false;
13910 }
13911 } else {
13912 /*
13913 if prefix extractor is not defined, all key parts have to be
13914 used by eq_cond.
13915 */
13916 if (use_all_keys) {
13917 can_use = true;
13918 } else {
13919 can_use = false;
13920 }
13921 }
13922
13923 return can_use;
13924 }
13925
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return *rocksdb_tbl_options;
}

// TTL feature toggles, mirroring the corresponding global sys_vars.
bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
#ifndef DBUG_OFF
// Debug-build-only accessors: let TTL tests override timestamps/behavior.
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
13947
13948 void rdb_update_global_stats(const operation_type &type, uint count,
13949 bool is_system_table) {
13950 DBUG_ASSERT(type < ROWS_MAX);
13951
13952 if (count == 0) {
13953 return;
13954 }
13955
13956 if (is_system_table) {
13957 global_stats.system_rows[type].add(count);
13958 } else {
13959 global_stats.rows[type].add(count);
13960 }
13961 }
13962
13963 int rdb_get_table_perf_counters(const char *const tablename,
13964 Rdb_perf_counters *const counters) {
13965 DBUG_ASSERT(tablename != nullptr);
13966
13967 Rdb_table_handler *table_handler;
13968 table_handler = rdb_open_tables.get_table_handler(tablename);
13969 if (table_handler == nullptr) {
13970 return HA_ERR_ROCKSDB_INVALID_TABLE;
13971 }
13972
13973 counters->load(table_handler->m_table_perf_context);
13974
13975 rdb_open_tables.release_table_handler(table_handler);
13976 return HA_EXIT_SUCCESS;
13977 }
13978
13979 const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) {
13980 // If this assertion fails then this means that a member has been either added
13981 // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be
13982 // changed to return the appropriate value.
13983 static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types.");
13984
13985 switch (err_type) {
13986 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT:
13987 return "RDB_IO_ERROR_TX_COMMIT";
13988 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT:
13989 return "RDB_IO_ERROR_DICT_COMMIT";
13990 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD:
13991 return "RDB_IO_ERROR_BG_THREAD";
13992 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL:
13993 return "RDB_IO_ERROR_GENERAL";
13994 default:
13995 DBUG_ASSERT(false);
13996 return "(unknown)";
13997 }
13998 }
13999
14000 // In case of core dump generation we want this function NOT to be optimized
14001 // so that we can capture as much data as possible to debug the root cause
14002 // more efficiently.
14003 #ifdef __GNUC__
14004 #endif
/*
  Central handler for RocksDB error statuses. I/O errors and corruption are
  fatal: the status is logged and the server is aborted. Other non-OK
  statuses are logged, and only dictionary-commit failures abort.
  @param status   status returned by a RocksDB call (non-OK expected)
  @param err_type which subsystem produced the error; selects the message
*/
void rdb_handle_io_error(const rocksdb::Status status,
                         const RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    /* skip dumping core if write failed and we are allowed to do so */
#ifdef MARIAROCKS_NOT_YET
    if (skip_core_dump_on_error) {
      opt_core_file = false;
    }
#endif
    switch (err_type) {
      case RDB_IO_ERROR_TX_COMMIT:
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "failed to write to WAL");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_BG_THREAD: {
        rdb_log_status_error(status, "BG thread failed to write to RocksDB");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on BG write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_GENERAL: {
        rdb_log_status_error(status, "failed on I/O");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on I/O error.");
        abort();
        break;
      }
      default:
        DBUG_ASSERT(0);
        break;
    }
  } else if (status.IsCorruption()) {
    rdb_log_status_error(status, "data corruption detected!");
    // Leave a marker file so the next startup knows the DB is corrupt.
    rdb_persist_corruption_marker();
    /* NO_LINT_DEBUG */
    sql_print_error("MyRocks: aborting because of data corruption.");
    abort();
  } else if (!status.ok()) {
    switch (err_type) {
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "Failed to write to WAL (dictionary)");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      default:
        // Non-fatal: log and let the caller translate the status.
        rdb_log_status_error(status, "Failed to read/write in RocksDB");
        break;
    }
  }
}
14062 #ifdef __GNUC__
14063 #endif
/* Accessors for the global manager singletons, for use by other modules. */
Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }

Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }

Rdb_binlog_manager *rdb_get_binlog_manager(void) { return &binlog_manager; }
14069
14070 void rocksdb_set_compaction_options(
14071 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14072 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14073 void *const var_ptr, const void *const save) {
14074 if (var_ptr && save) {
14075 *(uint64_t *)var_ptr = *(const uint64_t *)save;
14076 }
14077 const Rdb_compact_params params = {
14078 (uint64_t)rocksdb_compaction_sequential_deletes,
14079 (uint64_t)rocksdb_compaction_sequential_deletes_window,
14080 (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
14081 if (properties_collector_factory) {
14082 properties_collector_factory->SetCompactionParams(params);
14083 }
14084 }
14085
14086 void rocksdb_set_table_stats_sampling_pct(
14087 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
14088 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14089 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14090 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14091
14092 const uint32_t new_val = *static_cast<const uint32_t *>(save);
14093
14094 if (new_val != rocksdb_table_stats_sampling_pct) {
14095 rocksdb_table_stats_sampling_pct = new_val;
14096
14097 if (properties_collector_factory) {
14098 properties_collector_factory->SetTableStatsSamplingPct(
14099 rocksdb_table_stats_sampling_pct);
14100 }
14101 }
14102
14103 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14104 }
14105
14106 /*
14107 This function allows setting the rate limiter's bytes per second value
14108 but only if the rate limiter is turned on which has to be done at startup.
14109 If the rate is already 0 (turned off) or we are changing it to 0 (trying
14110 to turn it off) this function will push a warning to the client and do
14111 nothing.
14112 This is similar to the code in innodb_doublewrite_update (found in
14113 storage/innobase/handler/ha_innodb.cc).
14114 */
void rocksdb_set_rate_limiter_bytes_per_sec(
    my_core::THD *const thd,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  const uint64_t new_val = *static_cast<const uint64_t *>(save);
  if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
    /*
      If a rate_limiter was not enabled at startup we can't change it nor
      can we disable it if one was created at startup
    */
    push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
                        "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
                        "be dynamically changed to or from 0. Do a clean "
                        "shutdown if you want to change it from or to 0.");
  } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
    /* Apply the new value to the rate limiter and store it locally */
    DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
    rocksdb_rate_limiter_bytes_per_sec = new_val;
    rocksdb_rate_limiter->SetBytesPerSecond(new_val);
  }
  // Note: when the warning branch is taken, the stored value is left
  // unchanged on purpose -- the update is silently ignored.
}
14136
14137 void rocksdb_set_sst_mgr_rate_bytes_per_sec(
14138 my_core::THD *const thd,
14139 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14140 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14141 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14142
14143 const uint64_t new_val = *static_cast<const uint64_t *>(save);
14144
14145 if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
14146 rocksdb_sst_mgr_rate_bytes_per_sec = new_val;
14147
14148 rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
14149 rocksdb_sst_mgr_rate_bytes_per_sec);
14150 }
14151
14152 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14153 }
14154
/*
  Plugin update hook for @@rocksdb_delayed_write_rate: push the new value
  to the running RocksDB instance via SetDBOptions().
*/
void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
                                    void *var_ptr, const void *save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const uint64_t new_val = *static_cast<const uint64_t *>(save);
  if (rocksdb_delayed_write_rate != new_val) {
    rocksdb_delayed_write_rate = new_val;
    // delayed_write_rate is a mutable DBOption; applied without restart.
    rocksdb::Status s =
        rdb->SetDBOptions({{"delayed_write_rate", std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning(
          "MyRocks: failed to update delayed_write_rate. "
          "status code = %d, status = %s",
          s.code(), s.ToString().c_str());
    }
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14174
14175 void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
14176 void *var_ptr, const void *save) {
14177 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14178 const uint32_t new_val = *static_cast<const uint32_t *>(save);
14179 if (rocksdb_max_latest_deadlocks != new_val) {
14180 rocksdb_max_latest_deadlocks = new_val;
14181 rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
14182 }
14183 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14184 }
14185
14186 void rdb_set_collation_exception_list(const char *const exception_list) {
14187 DBUG_ASSERT(rdb_collation_exceptions != nullptr);
14188
14189 if (!rdb_collation_exceptions->set_patterns(exception_list)) {
14190 my_core::warn_about_bad_patterns(rdb_collation_exceptions,
14191 "strict_collation_exceptions");
14192 }
14193 }
14194
void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  rdb_set_collation_exception_list(val == nullptr ? "" : val);

  // Keep a private heap copy of the string in the sys_var storage:
  // presumably the variable is declared with PLUGIN_VAR_MEMALLOC, so the
  // server expects *var_ptr to own its buffer and frees/replaces it on the
  // next update -- TODO confirm against the MYSQL_SYSVAR_STR declaration.
  const char *val_copy= val? my_strdup(PSI_INSTRUMENT_ME, val, MYF(0)): nullptr;
  my_free(*static_cast<char**>(var_ptr));
  *static_cast<const char**>(var_ptr) = val_copy;
}
14208
/*
  Convert a sys_var input value to a boolean.
  Accepts the strings "true"/"on"/"false"/"off" (case-insensitive) or an
  integer 0/1.
  @param[out] return_value receives the parsed boolean on success
  @return 0 on success, 1 when the value cannot be interpreted as a bool
*/
int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) {
  int new_value_type = value->value_type(value);
  if (new_value_type == MYSQL_VALUE_TYPE_STRING) {
    char buf[16];
    int len = sizeof(buf);
    const char *str = value->val_str(value, buf, &len);
    if (str && (my_strcasecmp(system_charset_info, "true", str) == 0 ||
                my_strcasecmp(system_charset_info, "on", str) == 0)) {
      *return_value = TRUE;
    } else if (str && (my_strcasecmp(system_charset_info, "false", str) == 0 ||
                       my_strcasecmp(system_charset_info, "off", str) == 0)) {
      *return_value = FALSE;
    } else {
      return 1;
    }
  } else if (new_value_type == MYSQL_VALUE_TYPE_INT) {
    long long intbuf;
    value->val_int(value, &intbuf);
    // NOTE(review): values > 1 are rejected, but negative values fall
    // through and are treated as FALSE -- confirm whether e.g. -1 should
    // instead be an error.
    if (intbuf > 1) return 1;
    *return_value = intbuf > 0 ? TRUE : FALSE;
  } else {
    return 1;
  }

  return 0;
}
14235
/*
  Plugin check hook for @@rocksdb_bulk_load: before the variable may be
  toggled, finalize any bulk load in progress on this connection.
  @return 0 to accept the new value (stored into *save), 1 to reject it
*/
int rocksdb_check_bulk_load(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  Rdb_transaction *tx = get_tx_from_thd(thd);
  if (tx != nullptr) {
    bool is_critical_error;
    // Flush and close any SST files written by the previous bulk load.
    const int rc = tx->finish_bulk_load(&is_critical_error);
    if (rc != 0 && is_critical_error) {
      // NO_LINT_DEBUG
      sql_print_error(
          "RocksDB: Error %d finalizing last SST file while "
          "setting bulk loading variable",
          rc);
      // Force bulk load off for this session before rejecting the change.
      THDVAR(thd, bulk_load) = 0;
      return 1;
    }
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
14262
/*
  Plugin check hook for @@rocksdb_bulk_load_allow_unsorted: the flag may
  only be changed while bulk load is disabled on this connection.
  @return 0 to accept the new value (stored into *save), 1 to reject it
*/
int rocksdb_check_bulk_load_allow_unsorted(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  if (THDVAR(thd, bulk_load)) {
    my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET",
             "Cannot change this setting while bulk load is enabled");

    return 1;
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
14281
/*
  Plugin update hook for @@rocksdb_max_background_jobs: record the new
  value in the cached DBOptions and apply it to the running instance via
  SetDBOptions().
*/
static void rocksdb_set_max_background_jobs(THD *thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rocksdb_db_options != nullptr);
  DBUG_ASSERT(rocksdb_db_options->env != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const int new_val = *static_cast<const int *>(save);

  if (rocksdb_db_options->max_background_jobs != new_val) {
    rocksdb_db_options->max_background_jobs = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"max_background_jobs", std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning(
          "MyRocks: failed to update max_background_jobs. "
          "Status code = %d, status = %s.",
          s.code(), s.ToString().c_str());
    }
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14310
14311 static void rocksdb_set_bytes_per_sync(
14312 THD *thd MY_ATTRIBUTE((__unused__)),
14313 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14314 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14315 DBUG_ASSERT(save != nullptr);
14316 DBUG_ASSERT(rocksdb_db_options != nullptr);
14317 DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14318
14319 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14320
14321 const ulonglong new_val = *static_cast<const ulonglong *>(save);
14322
14323 if (rocksdb_db_options->bytes_per_sync != new_val) {
14324 rocksdb_db_options->bytes_per_sync = new_val;
14325 rocksdb::Status s =
14326 rdb->SetDBOptions({{"bytes_per_sync", std::to_string(new_val)}});
14327
14328 if (!s.ok()) {
14329 /* NO_LINT_DEBUG */
14330 sql_print_warning(
14331 "MyRocks: failed to update max_background_jobs. "
14332 "Status code = %d, status = %s.",
14333 s.code(), s.ToString().c_str());
14334 }
14335 }
14336
14337 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14338 }
14339
14340 static void rocksdb_set_wal_bytes_per_sync(
14341 THD *thd MY_ATTRIBUTE((__unused__)),
14342 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14343 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
14344 DBUG_ASSERT(save != nullptr);
14345 DBUG_ASSERT(rocksdb_db_options != nullptr);
14346 DBUG_ASSERT(rocksdb_db_options->env != nullptr);
14347
14348 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
14349
14350 const ulonglong new_val = *static_cast<const ulonglong *>(save);
14351
14352 if (rocksdb_db_options->wal_bytes_per_sync != new_val) {
14353 rocksdb_db_options->wal_bytes_per_sync = new_val;
14354 rocksdb::Status s =
14355 rdb->SetDBOptions({{"wal_bytes_per_sync", std::to_string(new_val)}});
14356
14357 if (!s.ok()) {
14358 /* NO_LINT_DEBUG */
14359 sql_print_warning(
14360 "MyRocks: failed to update max_background_jobs. "
14361 "Status code = %d, status = %s.",
14362 s.code(), s.ToString().c_str());
14363 }
14364 }
14365
14366 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
14367 }
14368
14369 /*
14370 Validating and updating block cache size via sys_var::check path.
14371 SetCapacity may take seconds when reducing block cache, and
14372 sys_var::update holds LOCK_global_system_variables mutex, so
14373 updating block cache size is done at check path instead.
14374 */
14375 static int rocksdb_validate_set_block_cache_size(
14376 THD *thd MY_ATTRIBUTE((__unused__)),
14377 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
14378 void *var_ptr, struct st_mysql_value *value) {
14379 DBUG_ASSERT(value != nullptr);
14380
14381 long long new_value;
14382
14383 /* value is NULL */
14384 if (value->val_int(value, &new_value)) {
14385 return HA_EXIT_FAILURE;
14386 }
14387
14388 if (new_value < RDB_MIN_BLOCK_CACHE_SIZE ||
14389 (uint64_t)new_value > (uint64_t)LLONG_MAX) {
14390 return HA_EXIT_FAILURE;
14391 }
14392
14393 RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex);
14394 const rocksdb::BlockBasedTableOptions &table_options =
14395 rdb_get_table_options();
14396
14397 if (rocksdb_block_cache_size != new_value && table_options.block_cache) {
14398 table_options.block_cache->SetCapacity(new_value);
14399 }
14400 *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value);
14401 RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex);
14402 return HA_EXIT_SUCCESS;
14403 }
14404
14405 static int rocksdb_validate_update_cf_options(
14406 THD * /* unused */, struct st_mysql_sys_var * /*unused*/, void *save,
14407 struct st_mysql_value *value) {
14408 char buff[STRING_BUFFER_USUAL_SIZE];
14409 const char *str;
14410 int length;
14411 length = sizeof(buff);
14412 str = value->val_str(value, buff, &length);
14413 // In some cases, str can point to buff in the stack.
14414 // This can cause invalid memory access after validation is finished.
14415 // To avoid this kind case, let's alway duplicate the str if str is not
14416 // nullptr
14417 *(const char **)save = (str == nullptr) ? nullptr : my_strdup(PSI_INSTRUMENT_ME, str, MYF(0));
14418
14419 if (str == nullptr) {
14420 return HA_EXIT_SUCCESS;
14421 }
14422
14423 Rdb_cf_options::Name_to_config_t option_map;
14424
14425 // Basic sanity checking and parsing the options into a map. If this fails
14426 // then there's no point to proceed.
14427 if (!Rdb_cf_options::parse_cf_options(str, &option_map)) {
14428 my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str);
14429 // Free what we've copied with my_strdup above.
14430 my_free((void*)(*(const char **)save));
14431 return HA_EXIT_FAILURE;
14432 }
14433 // Loop through option_map and create missing column families
14434 for (Rdb_cf_options::Name_to_config_t::iterator it = option_map.begin();
14435 it != option_map.end(); ++it) {
14436 cf_manager.get_or_create_cf(rdb, it->first);
14437 }
14438 return HA_EXIT_SUCCESS;
14439 }
14440
/*
  sys_var update callback for rocksdb_update_cf_options.

  Takes ownership of the string duplicated in
  rocksdb_validate_update_cf_options (via *save), stores it in var_ptr,
  re-parses it, and applies each per-CF option string to the corresponding
  live column family with DB::SetOptions. After a successful apply, the
  CF option map is refreshed so INFORMATION_SCHEMA.ROCKSDB_CF_OPTIONS
  reflects the new state.
*/
static void rocksdb_set_update_cf_options(
    THD *const /* unused */, struct st_mysql_sys_var *const /* unused */,
    void *const var_ptr, const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // Release the previous value of the variable before installing the new one.
  my_free(*reinterpret_cast<char **>(var_ptr));

  if (!val) {
    // SET ... = NULL: just clear the variable and bail out.
    *reinterpret_cast<char **>(var_ptr) = nullptr;
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  DBUG_ASSERT(val != nullptr);

  // Reset the pointer regardless of how much success we have with updating
  // the CF options. This results in consistent behavior and avoids
  // dealing with cases when only a subset of CF-s was successfully updated.
  *reinterpret_cast<const char **>(var_ptr) = val;

  // Do the real work of applying the changes.
  Rdb_cf_options::Name_to_config_t option_map;

  // This should never fail, because of rocksdb_validate_update_cf_options
  if (!Rdb_cf_options::parse_cf_options(val, &option_map)) {
    // NOTE(review): this frees the string just stored in var_ptr but leaves
    // var_ptr pointing at the freed memory; the branch is believed
    // unreachable (validation already parsed the same string) — confirm.
    my_free(*reinterpret_cast<char**>(var_ptr));
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  // For each CF we have, see if we need to update any settings.
  for (const auto &cf_name : cf_manager.get_cf_names()) {
    DBUG_ASSERT(!cf_name.empty());

    rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
    DBUG_ASSERT(cfh != nullptr);

    // CFs not mentioned in the option string are left untouched.
    const auto it = option_map.find(cf_name);
    std::string per_cf_options = (it != option_map.end()) ? it->second : "";

    if (!per_cf_options.empty()) {
      Rdb_cf_options::Name_to_config_t opt_map;
      rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map);

      if (s != rocksdb::Status::OK()) {
        // NO_LINT_DEBUG
        sql_print_warning(
            "MyRocks: failed to convert the options for column "
            "family '%s' to a map. %s",
            cf_name.c_str(), s.ToString().c_str());
      } else {
        DBUG_ASSERT(rdb != nullptr);

        // Finally we can apply the options.
        s = rdb->SetOptions(cfh, opt_map);

        if (s != rocksdb::Status::OK()) {
          // NO_LINT_DEBUG
          sql_print_warning(
              "MyRocks: failed to apply the options for column "
              "family '%s'. %s",
              cf_name.c_str(), s.ToString().c_str());
        } else {
          // NO_LINT_DEBUG
          sql_print_information(
              "MyRocks: options for column family '%s' "
              "have been successfully updated.",
              cf_name.c_str());

          // Make sure that data is internally consistent as well and update
          // the CF options. This is necessary also to make sure that the CF
          // options will be correctly reflected in the relevant table:
          // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA.
          rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh);
          std::string updated_options;

          // Round-trip through RocksDB so the stored string matches what
          // the engine actually ended up with.
          s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options,
                                                        cf_options);

          DBUG_ASSERT(s == rocksdb::Status::OK());
          DBUG_ASSERT(!updated_options.empty());

          cf_manager.update_options_map(cf_name, updated_options);
        }
      }
    }
  }

  // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to
  // free up resources used before.

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
14536
14537 void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
14538
14539 #ifdef MARIAROCKS_NOT_YET // MDEV-10976
14540
14541 void ha_rocksdb::rpl_before_delete_rows() {
14542 DBUG_ENTER_FUNC();
14543
14544 m_in_rpl_delete_rows = true;
14545
14546 DBUG_VOID_RETURN;
14547 }
14548
14549 void ha_rocksdb::rpl_after_delete_rows() {
14550 DBUG_ENTER_FUNC();
14551
14552 m_in_rpl_delete_rows = false;
14553
14554 DBUG_VOID_RETURN;
14555 }
14556
14557 void ha_rocksdb::rpl_before_update_rows() {
14558 DBUG_ENTER_FUNC();
14559
14560 m_in_rpl_update_rows = true;
14561
14562 DBUG_VOID_RETURN;
14563 }
14564
14565 void ha_rocksdb::rpl_after_update_rows() {
14566 DBUG_ENTER_FUNC();
14567
14568 m_in_rpl_update_rows = false;
14569
14570 DBUG_VOID_RETURN;
14571 }
14572
#if 0
// Disabled: whether this table is eligible for read-free replication.
// Kept for the pending MariaDB read-free-replication port (see the
// MARIAROCKS_NOT_YET / MDEV-10976 guard around this section).
bool ha_rocksdb::is_read_free_rpl_table() const {
  return table->s && m_tbl_def->m_is_read_free_rpl_table;
}
#endif
14578
14579 /**
14580 @brief
14581 Read Free Replication can be used or not. Returning true means
14582 Read Free Replication can be used.
14583 */
14584 bool ha_rocksdb::use_read_free_rpl() const {
14585 DBUG_ENTER_FUNC();
14586
14587 if (!ha_thd()->rli_slave || table->triggers || /* !is_read_free_rpl_table()*/ ) {
14588 DBUG_RETURN(false);
14589 }
14590
14591 #if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported
14592 switch (rocksdb_read_free_rpl) {
14593 case read_free_rpl_type::OFF:
14594 DBUG_RETURN(false);
14595 case read_free_rpl_type::PK_ONLY:
14596 DBUG_RETURN(!has_hidden_pk(table) && table->s->keys == 1);
14597 case read_free_rpl_type::PK_SK:
14598 DBUG_RETURN(!has_hidden_pk(table));
14599 }
14600 #else
14601 DBUG_RETURN(false);
14602 #endif
14603
14604 DBUG_ASSERT(false);
14605 DBUG_RETURN(false);
14606 }
14607 #endif // MARIAROCKS_NOT_YET
14608
14609 double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
14610 DBUG_ENTER_FUNC();
14611
14612 if (index != table->s->primary_key) {
14613 /* Non covering index range scan */
14614 DBUG_RETURN(handler::read_time(index, ranges, rows));
14615 }
14616
14617 DBUG_RETURN((rows / 20.0) + 1);
14618 }
14619
14620 void ha_rocksdb::print_error(int error, myf errflag) {
14621 if (error == HA_ERR_ROCKSDB_STATUS_BUSY) {
14622 error = HA_ERR_LOCK_DEADLOCK;
14623 }
14624 handler::print_error(error, errflag);
14625 }
14626
14627 std::string rdb_corruption_marker_file_name() {
14628 std::string ret(rocksdb_datadir);
14629 ret.append("/ROCKSDB_CORRUPTED");
14630 return ret;
14631 }
14632
14633 void sql_print_verbose_info(const char *format, ...)
14634 {
14635 va_list args;
14636
14637 if (global_system_variables.log_warnings > 2) {
14638 va_start(args, format);
14639 sql_print_information_v(format, args);
14640 va_end(args);
14641 }
14642 }
14643
14644 } // namespace myrocks
14645
14646
14647 /**
14648 Construct and emit duplicate key error message using information
14649 from table's record buffer.
14650
14651 @sa print_keydup_error(table, key, msg, errflag, thd, org_table_name).
14652 */
14653
14654 void print_keydup_error(TABLE *table, KEY *key, myf errflag,
14655 const THD *thd, const char *org_table_name)
14656 {
14657 print_keydup_error(table, key, ER(ER_DUP_ENTRY_WITH_KEY_NAME), errflag);
14658 }
14659
14660 /*
14661 Register the storage engine plugin outside of myrocks namespace
14662 so that mysql_declare_plugin does not get confused when it does
14663 its name generation.
14664 */
14665
14666
// Storage-engine descriptor handed to the plugin framework below.
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};

// Plugin registration: the storage engine itself, followed by its
// INFORMATION_SCHEMA table plugins. Entries are positional per the
// maria_declare_plugin macro; do not reorder.
maria_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
    &rocksdb_storage_engine,           /* Plugin Descriptor */
    "ROCKSDB",                         /* Plugin Name */
    "Monty Program Ab",                /* Plugin Author */
    "RocksDB storage engine",          /* Plugin Description */
    PLUGIN_LICENSE_GPL,                /* Plugin Licence */
    myrocks::rocksdb_init_func,        /* Plugin Entry Point */
    myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
    0x0001,                            /* version number (0.1) */
    myrocks::rocksdb_status_vars,      /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    "1.0",                             /* string version */
    myrocks::MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL
},
    /* INFORMATION_SCHEMA plugins exposing MyRocks internals */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map,
    myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info
maria_declare_plugin_end;
14693