/*
   Copyright (c) 2012,2013 Monty Program Ab

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
#pragma once

#ifdef USE_PRAGMA_INTERFACE
#pragma interface /* gcc class implementation */
#endif

/* Compile in the read-free replication (RFR) portion of this handler
   (see the #if defined(ROCKSDB_INCLUDE_RFR) section of ha_rocksdb). */
#define ROCKSDB_INCLUDE_RFR 1

/* C++ standard header files */
#include <cinttypes>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

/* MySQL header files */
#include "field.h"
#include "handler.h"   /* handler */
#include "my_global.h" /* ulonglong */
#include "my_icp.h"
#include "sql_bitmap.h"
#include "sql_string.h"

/* RocksDB header files */
#include "rocksdb/cache.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/sst_file_manager.h"
#include "rocksdb/statistics.h"
#include "rocksdb/utilities/options_util.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/utilities/write_batch_with_index.h"

/* MyRocks header files */
#include "./rdb_buff.h"
#include "./rdb_comparator.h"
#include "./rdb_global.h"
#include "./rdb_index_merge.h"
#include "./rdb_perf_context.h"
#include "./rdb_sst_info.h"
#include "./rdb_utils.h"

/**
  @note MyRocks Coding Conventions:
  MyRocks code follows the baseline MySQL coding conventions, available at
  http://dev.mysql.com/doc/internals/en/coding-guidelines.html, with several
  refinements (@see /storage/rocksdb/README file).
*/

namespace myrocks {

/* Forward declarations; full definitions live in other MyRocks files. */
class Rdb_converter;
class Rdb_key_def;
class Rdb_tbl_def;
class Rdb_transaction;
class Rdb_transaction_impl;
class Rdb_writebatch_impl;
class Rdb_field_encoder;

/* Globals shared with other MyRocks translation units (defined elsewhere). */
extern char *rocksdb_read_free_rpl_tables;
extern ulong rocksdb_max_row_locks;
#if defined(HAVE_PSI_INTERFACE)
extern PSI_rwlock_key key_rwlock_read_free_rpl_tables;
#endif
extern Regex rdb_read_free_regex_handler;

/**
  @brief
  Rdb_table_handler is a reference-counted structure storing information for
  each open table. All the objects are stored in a global hash map.

  //TODO: join this with Rdb_tbl_def ?
96 */ 97 struct Rdb_table_handler { 98 char *m_table_name; 99 uint m_table_name_length; 100 int m_ref_count; 101 102 my_core::THR_LOCK m_thr_lock; ///< MySQL latch needed by m_db_lock 103 104 /* Stores cumulative table statistics */ 105 my_io_perf_atomic_t m_io_perf_read; 106 Rdb_atomic_perf_counters m_table_perf_context; 107 108 /* Stores cached memtable estimate statistics */ 109 std::atomic_uint m_mtcache_lock; 110 uint64_t m_mtcache_count; 111 uint64_t m_mtcache_size; 112 uint64_t m_mtcache_last_update; 113 }; 114 115 } // namespace myrocks 116 117 /* Provide hash function for GL_INDEX_ID so we can include it in sets */ 118 namespace std { 119 template <> 120 struct hash<myrocks::GL_INDEX_ID> { 121 std::size_t operator()(const myrocks::GL_INDEX_ID &gl_index_id) const { 122 const uint64_t val = 123 ((uint64_t)gl_index_id.cf_id << 32 | (uint64_t)gl_index_id.index_id); 124 return std::hash<uint64_t>()(val); 125 } 126 }; 127 } // namespace std 128 129 namespace myrocks { 130 enum table_cardinality_scan_type { 131 SCAN_TYPE_NONE, 132 SCAN_TYPE_MEMTABLE_ONLY, 133 SCAN_TYPE_FULL_TABLE, 134 }; 135 136 /** 137 @brief 138 Class definition for ROCKSDB storage engine plugin handler 139 */ 140 141 class ha_rocksdb : public my_core::handler { 142 my_core::THR_LOCK_DATA m_db_lock; ///< MySQL database lock 143 144 Rdb_table_handler *m_table_handler; ///< Open table handler 145 146 /* Iterator used for range scans and for full table/index scans */ 147 rocksdb::Iterator *m_scan_it; 148 149 /* Whether m_scan_it was created with skip_bloom=true */ 150 bool m_scan_it_skips_bloom; 151 152 const rocksdb::Snapshot *m_scan_it_snapshot; 153 154 /* Buffers used for upper/lower bounds for m_scan_it. 
*/ 155 uchar *m_scan_it_lower_bound; 156 uchar *m_scan_it_upper_bound; 157 rocksdb::Slice m_scan_it_lower_bound_slice; 158 rocksdb::Slice m_scan_it_upper_bound_slice; 159 160 Rdb_tbl_def *m_tbl_def; 161 162 /* Primary Key encoder from KeyTupleFormat to StorageFormat */ 163 std::shared_ptr<Rdb_key_def> m_pk_descr; 164 165 /* Array of index descriptors */ 166 std::shared_ptr<Rdb_key_def> *m_key_descr_arr; 167 168 static bool check_keyread_allowed(bool &pk_can_be_decoded, 169 const TABLE_SHARE *table_share, uint inx, 170 uint part, bool all_parts); 171 172 /* 173 Number of key parts in PK. This is the same as 174 table->key_info[table->s->primary_key].keyparts 175 */ 176 uint m_pk_key_parts; 177 178 /* 179 true <=> Primary Key columns can be decoded from the index. It should be 180 enabled by default and may be disabled in init_with_fields() after initial 181 keys info is loaded and it turns out the feature isn't supported for 182 particular table. 183 */ 184 mutable bool m_pk_can_be_decoded; 185 186 uchar *m_pk_tuple; /* Buffer for storing PK in KeyTupleFormat */ 187 uchar *m_pk_packed_tuple; /* Buffer for storing PK in StorageFormat */ 188 // ^^ todo: change it to 'char*'? TODO: ^ can we join this with last_rowkey? 189 190 /* 191 Temporary buffers for storing the key part of the Key/Value pair 192 for secondary indexes. 193 */ 194 uchar *m_sk_packed_tuple; 195 196 /* 197 Temporary buffers for storing end key part of the Key/Value pair. 198 This is used for range scan only. 199 */ 200 uchar *m_end_key_packed_tuple; 201 202 Rdb_string_writer m_sk_tails; 203 Rdb_string_writer m_pk_unpack_info; 204 205 /* 206 ha_rockdb->index_read_map(.. HA_READ_KEY_EXACT or similar) will save here 207 mem-comparable form of the index lookup tuple. 208 */ 209 uchar *m_sk_match_prefix; 210 uint m_sk_match_length; 211 212 /* Buffer space for the above */ 213 uchar *m_sk_match_prefix_buf; 214 215 /* Second buffers, used by UPDATE. 
*/ 216 uchar *m_sk_packed_tuple_old; 217 Rdb_string_writer m_sk_tails_old; 218 219 /* Buffers used for duplicate checking during unique_index_creation */ 220 uchar *m_dup_sk_packed_tuple; 221 uchar *m_dup_sk_packed_tuple_old; 222 223 /* 224 Temporary space for packing VARCHARs (we provide it to 225 pack_record()/pack_index_tuple() calls). 226 */ 227 uchar *m_pack_buffer; 228 229 /* class to convert between Mysql format and RocksDB format*/ 230 std::unique_ptr<Rdb_converter> m_converter; 231 232 /* 233 Pointer to the original TTL timestamp value (8 bytes) during UPDATE. 234 */ 235 char *m_ttl_bytes; 236 /* 237 The TTL timestamp value can change if the explicit TTL column is 238 updated. If we detect this when updating the PK, we indicate it here so 239 we know we must always update any SK's. 240 */ 241 bool m_ttl_bytes_updated; 242 243 /* rowkey of the last record we've read, in StorageFormat. */ 244 String m_last_rowkey; 245 246 /* 247 Last retrieved record, in table->record[0] data format. 248 249 This is used only when we get the record with rocksdb's Get() call (The 250 other option is when we get a rocksdb::Slice from an iterator) 251 */ 252 rocksdb::PinnableSlice m_retrieved_record; 253 254 /* Type of locking to apply to rows */ 255 enum { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE } m_lock_rows; 256 257 /* true means we're doing an index-only read. false means otherwise. */ 258 bool m_keyread_only; 259 260 bool m_skip_scan_it_next_call; 261 262 /* true means we are accessing the first row after a snapshot was created */ 263 bool m_rnd_scan_is_new_snapshot; 264 265 /* 266 TRUE means INSERT ON DUPLICATE KEY UPDATE. In such case we can optimize by 267 remember the failed attempt (if there is one that violates uniqueness check) 268 in write_row and in the following index_read to skip the lock check and read 269 entirely 270 */ 271 bool m_insert_with_update; 272 273 /* 274 TRUE if last time the insertion failed due to duplicate key error. 
275 (m_dupp_errkey holds the key# that we've had error for) 276 */ 277 bool m_dup_key_found; 278 279 #ifndef NDEBUG 280 /* 281 Last retrieved record (for duplicate PK) or index tuple (for duplicate 282 unique SK). Used for sanity checking. 283 */ 284 String m_dup_key_retrieved_record; 285 #endif 286 287 /** 288 @brief 289 This is a bitmap of indexes (i.e. a set) whose keys (in future, values) may 290 be changed by this statement. Indexes that are not in the bitmap do not need 291 to be updated. 292 @note Valid inside UPDATE statements, IIF(old_pk_slice is set). 293 */ 294 my_core::key_map m_update_scope; 295 296 /* SST information used for bulk loading the primary key */ 297 std::shared_ptr<Rdb_sst_info> m_sst_info; 298 299 /* 300 MySQL index number for duplicate key error 301 */ 302 uint m_dupp_errkey; 303 304 int create_key_defs(const TABLE *const table_arg, 305 Rdb_tbl_def *const tbl_def_arg, 306 const TABLE *const old_table_arg = nullptr, 307 const Rdb_tbl_def *const old_tbl_def_arg = nullptr) const 308 MY_ATTRIBUTE((__warn_unused_result__)); 309 int secondary_index_read(const int keyno, uchar *const buf) 310 MY_ATTRIBUTE((__warn_unused_result__)); 311 void setup_iterator_for_rnd_scan(); 312 bool is_ascending(const Rdb_key_def &keydef, 313 enum ha_rkey_function find_flag) const 314 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 315 static void setup_iterator_bounds(const Rdb_key_def &kd, 316 const rocksdb::Slice &eq_cond, 317 size_t bound_len, uchar *const lower_bound, 318 uchar *const upper_bound, 319 rocksdb::Slice *lower_bound_slice, 320 rocksdb::Slice *upper_bound_slice); 321 static bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, 322 const rocksdb::Slice &eq_cond, 323 const bool use_all_keys); 324 void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice, 325 const bool use_all_keys, const uint eq_cond_len); 326 void release_scan_iterator(void); 327 328 rocksdb::Status get_for_update(Rdb_transaction *const tx, 329 const 
Rdb_key_def &kd, 330 const rocksdb::Slice &key, 331 rocksdb::PinnableSlice *value) const; 332 333 int get_row_by_rowid(uchar *const buf, const char *const rowid, 334 const uint rowid_size, const bool skip_ttl_check = true, 335 const bool skip_lookup = false) 336 MY_ATTRIBUTE((__warn_unused_result__)); 337 int get_row_by_rowid(uchar *const buf, const uchar *const rowid, 338 const uint rowid_size, const bool skip_ttl_check = true, 339 const bool skip_lookup = false) 340 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)) { 341 return get_row_by_rowid(buf, reinterpret_cast<const char *>(rowid), 342 rowid_size, skip_ttl_check, skip_lookup); 343 } 344 345 void load_auto_incr_value(); 346 ulonglong load_auto_incr_value_from_index(); 347 void update_auto_incr_val(ulonglong val); 348 void update_auto_incr_val_from_field(); 349 rocksdb::Status get_datadic_auto_incr(Rdb_transaction *const tx, 350 const GL_INDEX_ID &gl_index_id, 351 ulonglong *new_val) const; 352 longlong update_hidden_pk_val(); 353 int load_hidden_pk_value() MY_ATTRIBUTE((__warn_unused_result__)); 354 int read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) 355 MY_ATTRIBUTE((__warn_unused_result__)); 356 bool can_use_single_delete(const uint index) const 357 MY_ATTRIBUTE((__warn_unused_result__)); 358 bool is_blind_delete_enabled(); 359 bool skip_unique_check() const MY_ATTRIBUTE((__warn_unused_result__)); 360 bool commit_in_the_middle() MY_ATTRIBUTE((__warn_unused_result__)); 361 bool do_bulk_commit(Rdb_transaction *const tx) 362 MY_ATTRIBUTE((__warn_unused_result__)); 363 bool has_hidden_pk(const TABLE *const table) const 364 MY_ATTRIBUTE((__warn_unused_result__)); 365 366 void update_row_stats(const operation_type &type); 367 368 void set_last_rowkey(const uchar *const old_data); 369 370 int alloc_key_buffers(const TABLE *const table_arg, 371 const Rdb_tbl_def *const tbl_def_arg, 372 bool alloc_alter_buffers = false) 373 MY_ATTRIBUTE((__warn_unused_result__)); 374 void free_key_buffers(); 375 376 
// the buffer size should be at least 2*Rdb_key_def::INDEX_NUMBER_SIZE 377 rocksdb::Range get_range(const int i, uchar buf[]) const; 378 379 void records_in_range_internal(uint inx, key_range *const min_key, 380 key_range *const max_key, int64 disk_size, 381 int64 rows, ulonglong *total_size, 382 ulonglong *row_count); 383 384 /* 385 Perf timers for data reads 386 */ 387 Rdb_io_perf m_io_perf; 388 389 /* 390 Update stats 391 */ 392 void update_stats(void); 393 394 public: 395 /* 396 Controls whether writes include checksums. This is updated from the session 397 variable 398 at the start of each query. 399 */ 400 bool m_store_row_debug_checksums; 401 402 int m_checksums_pct; 403 404 ha_rocksdb(my_core::handlerton *const hton, 405 my_core::TABLE_SHARE *const table_arg); 406 virtual ~ha_rocksdb() override; 407 408 /** @brief 409 The name that will be used for display purposes. 410 */ 411 const char *table_type() const override { 412 DBUG_ENTER_FUNC(); 413 414 DBUG_RETURN(rocksdb_hton_name); 415 } 416 417 /* The following is only used by SHOW KEYS: */ 418 const char *index_type(uint inx) override { 419 DBUG_ENTER_FUNC(); 420 421 DBUG_RETURN("LSMTREE"); 422 } 423 424 /** @brief 425 The file extensions. 426 */ 427 const char **bas_ext() const override; 428 429 /* 430 Returns the name of the table's base name 431 */ 432 const std::string &get_table_basename() const; 433 434 /** @brief 435 This is a list of flags that indicate what functionality the storage engine 436 implements. The current table flags are documented in handler.h 437 */ 438 Table_flags table_flags() const override; 439 440 static Table_flags table_flags(const bool pk_can_be_decoded) { 441 DBUG_ENTER_FUNC(); 442 443 /* 444 HA_BINLOG_STMT_CAPABLE 445 We are saying that this engine is just statement capable to have 446 an engine that can only handle statement-based logging. This is 447 used in testing. 
448 HA_REC_NOT_IN_SEQ 449 If we don't set it, filesort crashes, because it assumes rowids are 450 1..8 byte numbers 451 */ 452 DBUG_RETURN(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE | 453 HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS | 454 (pk_can_be_decoded ? HA_PRIMARY_KEY_IN_READ_INDEX : 0) | 455 HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY | 456 HA_PARTIAL_COLUMN_READ | HA_ONLINE_ANALYZE); 457 } 458 459 bool init_with_fields() override; 460 461 static ulong index_flags(bool &pk_can_be_decoded, 462 const TABLE_SHARE *table_share, uint inx, uint part, 463 bool all_parts); 464 465 /** @brief 466 This is a bitmap of flags that indicates how the storage engine 467 implements indexes. The current index flags are documented in 468 handler.h. If you do not implement indexes, just return zero here. 469 470 @details 471 part is the key part to check. First key part is 0. 472 If all_parts is set, MySQL wants to know the flags for the combined 473 index, up to and including 'part'. 474 */ 475 ulong index_flags(uint inx, uint part, bool all_parts) const override; 476 477 bool rpl_can_handle_stm_event() const override; 478 479 const key_map *keys_to_use_for_scanning() override { 480 DBUG_ENTER_FUNC(); 481 482 DBUG_RETURN(&key_map_full); 483 } 484 485 bool primary_key_is_clustered() const override { 486 DBUG_ENTER_FUNC(); 487 488 DBUG_RETURN(true); 489 } 490 491 bool should_store_row_debug_checksums() const { 492 return m_store_row_debug_checksums && (rand() % 100 < m_checksums_pct); 493 } 494 495 MY_NODISCARD 496 int rename_partitioned_table(const char *const from, const char *const to, 497 const std::string &partition_string); 498 499 MY_NODISCARD 500 int rename_non_partitioned_table(const char *const from, 501 const char *const to); 502 503 MY_NODISCARD 504 int rename_table(const char *const from, const char *const to) override; 505 506 int convert_record_from_storage_format(const rocksdb::Slice *const key, 507 const rocksdb::Slice *const value, 508 uchar *const buf) 
509 MY_ATTRIBUTE((__warn_unused_result__)); 510 511 int convert_record_from_storage_format(const rocksdb::Slice *const key, 512 uchar *const buf) 513 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 514 515 static const std::vector<std::string> parse_into_tokens(const std::string &s, 516 const char delim); 517 518 static const std::string generate_cf_name( 519 const uint index, const TABLE *const table_arg, 520 const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found); 521 522 static const char *get_key_name(const uint index, 523 const TABLE *const table_arg, 524 const Rdb_tbl_def *const tbl_def_arg) 525 MY_ATTRIBUTE((__warn_unused_result__)); 526 527 static const char *get_key_comment(const uint index, 528 const TABLE *const table_arg, 529 const Rdb_tbl_def *const tbl_def_arg) 530 MY_ATTRIBUTE((__warn_unused_result__)); 531 532 static const std::string get_table_comment(const TABLE *const table_arg) 533 MY_ATTRIBUTE((__warn_unused_result__)); 534 535 static bool is_hidden_pk(const uint index, const TABLE *const table_arg, 536 const Rdb_tbl_def *const tbl_def_arg) 537 MY_ATTRIBUTE((__warn_unused_result__)); 538 539 static uint pk_index(const TABLE *const table_arg, 540 const Rdb_tbl_def *const tbl_def_arg) 541 MY_ATTRIBUTE((__warn_unused_result__)); 542 543 static bool is_pk(const uint index, const TABLE *table_arg, 544 const Rdb_tbl_def *tbl_def_arg) 545 MY_ATTRIBUTE((__warn_unused_result__)); 546 /** @brief 547 unireg.cc will call max_supported_record_length(), max_supported_keys(), 548 max_supported_key_parts(), uint max_supported_key_length() 549 to make sure that the storage engine can handle the data it is about to 550 send. Return *real* limits of your storage engine here; MySQL will do 551 min(your_limits, MySQL_limits) automatically. 
552 */ 553 uint max_supported_record_length() const override { 554 DBUG_ENTER_FUNC(); 555 556 DBUG_RETURN(HA_MAX_REC_LENGTH); 557 } 558 559 uint max_supported_keys() const override { 560 DBUG_ENTER_FUNC(); 561 562 DBUG_RETURN(MAX_INDEXES); 563 } 564 565 uint max_supported_key_parts() const override { 566 DBUG_ENTER_FUNC(); 567 568 DBUG_RETURN(MAX_REF_PARTS); 569 } 570 571 uint max_supported_key_part_length(HA_CREATE_INFO *) const override; 572 573 /** @brief 574 unireg.cc will call this to make sure that the storage engine can handle 575 the data it is about to send. Return *real* limits of your storage engine 576 here; MySQL will do min(your_limits, MySQL_limits) automatically. 577 578 @details 579 There is no need to implement ..._key_... methods if your engine doesn't 580 support indexes. 581 */ 582 uint max_supported_key_length() const override { 583 DBUG_ENTER_FUNC(); 584 585 DBUG_RETURN(16 * 1024); /* just to return something*/ 586 } 587 588 /** 589 TODO: return actual upper bound of number of records in the table. 590 (e.g. save number of records seen on full table scan and/or use file size 591 as upper bound) 592 */ 593 ha_rows estimate_rows_upper_bound() override { 594 DBUG_ENTER_FUNC(); 595 596 DBUG_RETURN(HA_POS_ERROR); 597 } 598 599 /* At the moment, we're ok with default handler::index_init() implementation. 
600 */ 601 int index_read_map(uchar *const buf, const uchar *const key, 602 key_part_map keypart_map, 603 enum ha_rkey_function find_flag) override 604 MY_ATTRIBUTE((__warn_unused_result__)); 605 606 int index_read_map_impl(uchar *const buf, const uchar *const key, 607 key_part_map keypart_map, 608 enum ha_rkey_function find_flag, 609 const key_range *end_key) 610 MY_ATTRIBUTE((__warn_unused_result__)); 611 612 int index_read_last_map(uchar *const buf, const uchar *const key, 613 key_part_map keypart_map) override 614 MY_ATTRIBUTE((__warn_unused_result__)); 615 616 int read_range_first(const key_range *const start_key, 617 const key_range *const end_key, bool eq_range, 618 bool sorted) override 619 MY_ATTRIBUTE((__warn_unused_result__)); 620 621 virtual double scan_time() override { 622 DBUG_ENTER_FUNC(); 623 624 DBUG_RETURN( 625 static_cast<double>((stats.records + stats.deleted) / 20.0 + 10)); 626 } 627 628 virtual double read_time(uint, uint, ha_rows rows) override; 629 virtual void print_error(int error, myf errflag) override; 630 631 int open(const char *const name, int mode, uint test_if_locked) override 632 MY_ATTRIBUTE((__warn_unused_result__)); 633 int close(void) override MY_ATTRIBUTE((__warn_unused_result__)); 634 635 int write_row(uchar *const buf) override 636 MY_ATTRIBUTE((__warn_unused_result__)); 637 int update_row(const uchar *const old_data, uchar *const new_data) override 638 MY_ATTRIBUTE((__warn_unused_result__)); 639 int delete_row(const uchar *const buf) override 640 MY_ATTRIBUTE((__warn_unused_result__)); 641 void update_table_stats_if_needed(); 642 rocksdb::Status delete_or_singledelete(uint index, Rdb_transaction *const tx, 643 rocksdb::ColumnFamilyHandle *const cf, 644 const rocksdb::Slice &key) 645 MY_ATTRIBUTE((__warn_unused_result__)); 646 647 int index_next(uchar *const buf) override 648 MY_ATTRIBUTE((__warn_unused_result__)); 649 int index_next_with_direction(uchar *const buf, bool move_forward) 650 
MY_ATTRIBUTE((__warn_unused_result__)); 651 int index_prev(uchar *const buf) override 652 MY_ATTRIBUTE((__warn_unused_result__)); 653 654 int index_first(uchar *const buf) override 655 MY_ATTRIBUTE((__warn_unused_result__)); 656 int index_last(uchar *const buf) override 657 MY_ATTRIBUTE((__warn_unused_result__)); 658 659 class Item *idx_cond_push(uint keyno, class Item *const idx_cond) override; 660 /* 661 Default implementation from cancel_pushed_idx_cond() suits us 662 */ 663 664 static bool check_bloom_and_set_bounds( 665 THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, 666 const bool use_all_keys, size_t bound_len, uchar *const lower_bound, 667 uchar *const upper_bound, rocksdb::Slice *lower_bound_slice, 668 rocksdb::Slice *upper_bound_slice); 669 670 private: 671 struct key_def_cf_info { 672 std::shared_ptr<rocksdb::ColumnFamilyHandle> cf_handle; 673 bool is_reverse_cf; 674 bool is_per_partition_cf; 675 }; 676 677 struct update_row_info { 678 Rdb_transaction *tx; 679 const uchar *new_data; 680 const uchar *old_data; 681 rocksdb::Slice new_pk_slice; 682 rocksdb::Slice old_pk_slice; 683 rocksdb::Slice old_pk_rec; 684 685 // "unpack_info" data for the new PK value 686 Rdb_string_writer *new_pk_unpack_info; 687 688 longlong hidden_pk_id; 689 bool skip_unique_check; 690 }; 691 692 /* 693 Used to check for duplicate entries during fast unique secondary index 694 creation. 695 */ 696 struct unique_sk_buf_info { 697 bool sk_buf_switch = false; 698 rocksdb::Slice sk_memcmp_key; 699 rocksdb::Slice sk_memcmp_key_old; 700 uchar *dup_sk_buf; 701 uchar *dup_sk_buf_old; 702 703 /* 704 This method is meant to be called back to back during inplace creation 705 of unique indexes. It will switch between two buffers, which 706 will each store the memcmp form of secondary keys, which are then 707 converted to slices in sk_memcmp_key or sk_memcmp_key_old. 
708 709 Switching buffers on each iteration allows us to retain the 710 sk_memcmp_key_old value for duplicate comparison. 711 */ 712 inline uchar *swap_and_get_sk_buf() { 713 sk_buf_switch = !sk_buf_switch; 714 return sk_buf_switch ? dup_sk_buf : dup_sk_buf_old; 715 } 716 }; 717 718 int create_cfs(const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, 719 std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) 720 const MY_ATTRIBUTE((__warn_unused_result__)); 721 722 int create_key_def(const TABLE *const table_arg, const uint i, 723 const Rdb_tbl_def *const tbl_def_arg, 724 std::shared_ptr<Rdb_key_def> *const new_key_def, 725 const struct key_def_cf_info &cf_info, uint64 ttl_duration, 726 const std::string &ttl_column) const 727 MY_ATTRIBUTE((__warn_unused_result__)); 728 729 int create_inplace_key_defs( 730 const TABLE *const table_arg, Rdb_tbl_def *vtbl_def_arg, 731 const TABLE *const old_table_arg, 732 const Rdb_tbl_def *const old_tbl_def_arg, 733 const std::array<key_def_cf_info, MAX_INDEXES + 1> &cf, 734 uint64 ttl_duration, const std::string &ttl_column) const 735 MY_ATTRIBUTE((__warn_unused_result__)); 736 737 std::unordered_map<std::string, uint> get_old_key_positions( 738 const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg, 739 const TABLE *old_table_arg, const Rdb_tbl_def *old_tbl_def_arg) const; 740 741 int compare_key_parts(const KEY *const old_key, 742 const KEY *const new_key) const 743 MY_ATTRIBUTE((__warn_unused_result__)); 744 745 int compare_keys(const KEY *const old_key, const KEY *const new_key) const 746 MY_ATTRIBUTE((__warn_unused_result__)); 747 748 bool should_hide_ttl_rec(const Rdb_key_def &kd, 749 const rocksdb::Slice &ttl_rec_val, 750 const int64_t curr_ts) 751 MY_ATTRIBUTE((__warn_unused_result__)); 752 int rocksdb_skip_expired_records(const Rdb_key_def &kd, 753 rocksdb::Iterator *const iter, 754 bool seek_backward); 755 756 int index_first_intern(uchar *buf) MY_ATTRIBUTE((__warn_unused_result__)); 757 int 
index_last_intern(uchar *buf) MY_ATTRIBUTE((__warn_unused_result__)); 758 759 enum icp_result check_index_cond() const; 760 int find_icp_matching_index_rec(const bool move_forward, uchar *const buf) 761 MY_ATTRIBUTE((__warn_unused_result__)); 762 763 void calc_updated_indexes(); 764 int update_write_row(const uchar *const old_data, const uchar *const new_data, 765 const bool skip_unique_check) 766 MY_ATTRIBUTE((__warn_unused_result__)); 767 int get_pk_for_update(struct update_row_info *const row_info); 768 int check_and_lock_unique_pk(const uint key_id, 769 const struct update_row_info &row_info, 770 bool *const found, const bool skip_unique_check) 771 MY_ATTRIBUTE((__warn_unused_result__)); 772 int check_and_lock_sk(const uint key_id, 773 const struct update_row_info &row_info, 774 bool *const found, const bool skip_unique_check) 775 MY_ATTRIBUTE((__warn_unused_result__)); 776 int check_uniqueness_and_lock(const struct update_row_info &row_info, 777 bool pk_changed, const bool skip_unique_check) 778 MY_ATTRIBUTE((__warn_unused_result__)); 779 bool over_bulk_load_threshold(int *err) 780 MY_ATTRIBUTE((__warn_unused_result__)); 781 int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &key_def, 782 const rocksdb::Slice *key, 783 struct unique_sk_buf_info *sk_info) 784 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 785 int bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd, 786 const rocksdb::Slice &key, const rocksdb::Slice &value, 787 bool sort) 788 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 789 int update_write_pk(const Rdb_key_def &kd, 790 const struct update_row_info &row_info, 791 const bool pk_changed) 792 MY_ATTRIBUTE((__warn_unused_result__)); 793 int update_write_sk(const TABLE *const table_arg, const Rdb_key_def &kd, 794 const struct update_row_info &row_info, 795 const bool bulk_load_sk) 796 MY_ATTRIBUTE((__warn_unused_result__)); 797 int update_write_indexes(const struct update_row_info &row_info, 798 const bool 
pk_changed) 799 MY_ATTRIBUTE((__warn_unused_result__)); 800 801 int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter, 802 const bool using_full_key, const rocksdb::Slice &key_slice, 803 const int64_t ttl_filter_ts) 804 MY_ATTRIBUTE((__warn_unused_result__)); 805 int read_before_key(const Rdb_key_def &kd, const bool using_full_key, 806 const rocksdb::Slice &key_slice, 807 const int64_t ttl_filter_ts) 808 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 809 int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice, 810 const int64_t ttl_filter_ts) 811 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 812 int position_to_correct_key(const Rdb_key_def &kd, 813 const enum ha_rkey_function &find_flag, 814 const bool full_key_match, const uchar *const key, 815 const key_part_map &keypart_map, 816 const rocksdb::Slice &key_slice, 817 bool *const move_forward, 818 const int64_t ttl_filter_ts) 819 MY_ATTRIBUTE((__warn_unused_result__)); 820 821 int read_row_from_primary_key(uchar *const buf) 822 MY_ATTRIBUTE((__warn_unused_result__)); 823 int read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd, 824 bool move_forward) 825 MY_ATTRIBUTE((__warn_unused_result__)); 826 827 int calc_eq_cond_len(const Rdb_key_def &kd, 828 const enum ha_rkey_function &find_flag, 829 const rocksdb::Slice &slice, 830 const int bytes_changed_by_succ, 831 const key_range *const end_key, 832 uint *const end_key_packed_size) 833 MY_ATTRIBUTE((__warn_unused_result__)); 834 835 Rdb_tbl_def *get_table_if_exists(const char *const tablename) 836 MY_ATTRIBUTE((__warn_unused_result__)); 837 void read_thd_vars(THD *const thd) MY_ATTRIBUTE((__nonnull__)); 838 839 bool contains_foreign_key(THD *const thd) 840 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 841 842 int inplace_populate_sk( 843 TABLE *const table_arg, 844 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) 845 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); 846 847 int 
finalize_bulk_load(bool print_client_error = true) 848 MY_ATTRIBUTE((__warn_unused_result__)); 849 850 void inc_table_n_rows(); 851 void dec_table_n_rows(); 852 853 bool should_skip_invalidated_record(const int rc) const; 854 bool should_recreate_snapshot(const int rc, const bool is_new_snapshot) const; 855 856 bool can_assume_tracked(THD *thd); 857 858 public: 859 void set_pk_can_be_decoded(bool flag) { m_pk_can_be_decoded = flag; } 860 int index_init(uint idx, bool sorted) override 861 MY_ATTRIBUTE((__warn_unused_result__)); 862 int index_end() override MY_ATTRIBUTE((__warn_unused_result__)); 863 864 void unlock_row() override; 865 866 /** @brief 867 Unlike index_init(), rnd_init() can be called two consecutive times 868 without rnd_end() in between (it only makes sense if scan=1). In this 869 case, the second call should prepare for the new table scan (e.g if 870 rnd_init() allocates the cursor, the second call should position the 871 cursor to the start of the table; no need to deallocate and allocate 872 it again. This is a required method. 873 */ 874 int rnd_init(bool scan) override MY_ATTRIBUTE((__warn_unused_result__)); 875 int rnd_end() override MY_ATTRIBUTE((__warn_unused_result__)); 876 877 int rnd_next(uchar *const buf) override 878 MY_ATTRIBUTE((__warn_unused_result__)); 879 int rnd_next_with_direction(uchar *const buf, bool move_forward) 880 MY_ATTRIBUTE((__warn_unused_result__)); 881 882 int rnd_pos(uchar *const buf, uchar *const pos) override 883 MY_ATTRIBUTE((__warn_unused_result__)); 884 void position(const uchar *const record) override; 885 int info(uint) override; 886 887 /* This function will always return success, therefore no annotation related 888 * to checking the return value. Can't change the signature because it's 889 * required by the interface. 
*/ 890 int extra(enum ha_extra_function operation) override; 891 892 int start_stmt(THD *const thd, thr_lock_type lock_type) override 893 MY_ATTRIBUTE((__warn_unused_result__)); 894 int external_lock(THD *const thd, int lock_type) override 895 MY_ATTRIBUTE((__warn_unused_result__)); 896 int truncate() override MY_ATTRIBUTE((__warn_unused_result__)); 897 898 int reset() override { 899 DBUG_ENTER_FUNC(); 900 901 /* Free blob data */ 902 m_retrieved_record.Reset(); 903 904 DBUG_RETURN(HA_EXIT_SUCCESS); 905 } 906 907 int check(THD *const thd, HA_CHECK_OPT *const check_opt) override 908 MY_ATTRIBUTE((__warn_unused_result__)); 909 int remove_rows(Rdb_tbl_def *const tbl); 910 ha_rows records_in_range(uint inx, key_range *const min_key, 911 key_range *const max_key) override 912 MY_ATTRIBUTE((__warn_unused_result__)); 913 int delete_non_partitioned_table(const char *const from) 914 MY_ATTRIBUTE((__warn_unused_result__)); 915 int delete_partitioned_table(const char *const from, 916 const std::string &partition_info_str) 917 MY_ATTRIBUTE((__warn_unused_result__)); 918 919 int delete_table(Rdb_tbl_def *const tbl); 920 int delete_table(const char *const from) override 921 MY_ATTRIBUTE((__warn_unused_result__)); 922 int create(const char *const name, TABLE *const form, 923 HA_CREATE_INFO *const create_info) override 924 MY_ATTRIBUTE((__warn_unused_result__)); 925 int create_table(const std::string &table_name, const TABLE *table_arg, 926 ulonglong auto_increment_value); 927 int truncate_table(Rdb_tbl_def *tbl_def, TABLE *table_arg, 928 ulonglong auto_increment_value); 929 bool check_if_incompatible_data(HA_CREATE_INFO *const info, 930 uint table_changes) override 931 MY_ATTRIBUTE((__warn_unused_result__)); 932 933 THR_LOCK_DATA **store_lock(THD *const thd, THR_LOCK_DATA **to, 934 enum thr_lock_type lock_type) override 935 MY_ATTRIBUTE((__warn_unused_result__)); 936 937 my_bool register_query_cache_table(THD *const thd, char *const table_key, 938 size_t key_length, 939 
                                     qc_engine_callback *const engine_callback,
                                     ulonglong *const engine_data) override {
    DBUG_ENTER_FUNC();

    /* Currently, we don't support query cache */
    DBUG_RETURN(FALSE);
  }

  bool get_error_message(const int error, String *const buf) override;

  /* Map a rocksdb::Status to a MySQL handler error code; msg, when given,
     supplies the error text. */
  static int rdb_error_to_mysql(const rocksdb::Status &s,
                                const char *msg = nullptr)
      MY_ATTRIBUTE((__warn_unused_result__));

  void get_auto_increment(ulonglong offset, ulonglong increment,
                          ulonglong nb_desired_values,
                          ulonglong *const first_value,
                          ulonglong *const nb_reserved_values) override;
  void update_create_info(HA_CREATE_INFO *const create_info) override;
  int optimize(THD *const thd, HA_CHECK_OPT *const check_opt) override
      MY_ATTRIBUTE((__warn_unused_result__));
  int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override
      MY_ATTRIBUTE((__warn_unused_result__));

  /* In-place ALTER TABLE support (see Rdb_inplace_alter_ctx below for the
     context carried between these calls). */
  enum_alter_inplace_result check_if_supported_inplace_alter(
      TABLE *altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool prepare_inplace_alter_table(
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool inplace_alter_table(
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool commit_inplace_alter_table(
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info, bool commit) override;

  bool is_read_free_rpl_table() const;
  int adjust_handler_stats_sst_and_memtable();
  int adjust_handler_stats_table_scan();

  /* (Re)build the row decoder; check_build_decoder() is consulted before
     read operations (see m_need_build_decoder below). */
  void build_decoder();
  void check_build_decoder();

  static void inc_covered_sk_lookup();

#if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
 public:
  /* Read-free replication hooks, invoked around replicated row events. */
  virtual void rpl_before_delete_rows() override;
  virtual void rpl_after_delete_rows() override;
  virtual void rpl_before_update_rows() override;
  virtual void rpl_after_update_rows() override;
  virtual bool rpl_lookup_rows() override;

  virtual bool use_read_free_rpl() const;  // MyRocks only

 private:
  /* Flags tracking if we are inside different replication operation */
  bool m_in_rpl_delete_rows;
  bool m_in_rpl_update_rows;
#endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR

  bool m_force_skip_unique_check;

  /* Need to build decoder on next read operation */
  bool m_need_build_decoder;
};

/*
  Helper class for in-place alter, for storing handler context between inplace
  alter calls
*/
struct Rdb_inplace_alter_ctx : public my_core::inplace_alter_handler_ctx {
  /* The new table definition */
  Rdb_tbl_def *const m_new_tdef;

  /* Stores the original key definitions */
  std::shared_ptr<Rdb_key_def> *const m_old_key_descr;

  /* Stores the new key definitions */
  std::shared_ptr<Rdb_key_def> *m_new_key_descr;

  /* Stores the old number of key definitions */
  const uint m_old_n_keys;

  /* Stores the new number of key definitions */
  const uint m_new_n_keys;

  /* Stores the added key glids */
  const std::unordered_set<std::shared_ptr<Rdb_key_def>> m_added_indexes;

  /* Stores the dropped key glids */
  const std::unordered_set<GL_INDEX_ID> m_dropped_index_ids;

  /* Stores number of keys to add */
  const uint m_n_added_keys;

  /* Stores number of keys to drop */
  const uint m_n_dropped_keys;

  /* Stores the largest current auto increment value in the index */
  const ulonglong m_max_auto_incr;

  /* NOTE(review): added_indexes/dropped_index_ids are taken by value and then
     copied into the const members; std::move would avoid one copy of each
     set. Left as-is here (comment-only pass). */
  Rdb_inplace_alter_ctx(
      Rdb_tbl_def *new_tdef, std::shared_ptr<Rdb_key_def> *old_key_descr,
      std::shared_ptr<Rdb_key_def> *new_key_descr, uint old_n_keys,
      uint new_n_keys,
      std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes,
      std::unordered_set<GL_INDEX_ID> dropped_index_ids, uint n_added_keys,
      uint n_dropped_keys, ulonglong max_auto_incr)
      : my_core::inplace_alter_handler_ctx(),
        m_new_tdef(new_tdef),
        m_old_key_descr(old_key_descr),
        m_new_key_descr(new_key_descr),
        m_old_n_keys(old_n_keys),
        m_new_n_keys(new_n_keys),
        m_added_indexes(added_indexes),
        m_dropped_index_ids(dropped_index_ids),
        m_n_added_keys(n_added_keys),
        m_n_dropped_keys(n_dropped_keys),
        m_max_auto_incr(max_auto_incr) {}

  ~Rdb_inplace_alter_ctx() {}

 private:
  /* Disable Copying */
  Rdb_inplace_alter_ctx(const Rdb_inplace_alter_ctx &);
  Rdb_inplace_alter_ctx &operator=(const Rdb_inplace_alter_ctx &);
};

/*
  Helper class to control access/init to handlerton instance.
  Contains a flag that is set if the handlerton is in an initialized, usable
  state, plus a reader-writer lock to protect it without serializing reads.
  Since we don't have static initializers for the opaque mysql_rwlock type,
  use constructor and destructor functions to create and destroy
  the lock before and after main(), respectively.
*/
struct Rdb_hton_init_state {
  /* RAII guard: acquires the read or write lock on construction and
     releases it on destruction. */
  struct Scoped_lock {
    Scoped_lock(Rdb_hton_init_state &state, bool write) : m_state(state) {
      if (write)
        m_state.lock_write();
      else
        m_state.lock_read();
    }
    ~Scoped_lock() { m_state.unlock(); }

   private:
    /* Copying disabled (pre-C++11 idiom; NOTE(review): could use
       "= delete"). */
    Scoped_lock(const Scoped_lock &sl) : m_state(sl.m_state) {}
    void operator=(const Scoped_lock &) {}

    Rdb_hton_init_state &m_state;
  };

  Rdb_hton_init_state() : m_initialized(false) {
    /*
      m_rwlock can not be instrumented as it must be initialized before
      mysql_mutex_register() call to protect some globals from race condition.
    */
    mysql_rwlock_init(0, &m_rwlock);
  }

  ~Rdb_hton_init_state() { mysql_rwlock_destroy(&m_rwlock); }

  void lock_read() { mysql_rwlock_rdlock(&m_rwlock); }

  void lock_write() { mysql_rwlock_wrlock(&m_rwlock); }

  void unlock() { mysql_rwlock_unlock(&m_rwlock); }

  /*
    Must be called with either a read or write lock held, unable to enforce
    behavior as mysql_rwlock has no means of determining if a thread has a lock
  */
  bool initialized() const { return m_initialized; }

  /*
    Must be called with only a write lock held, unable to enforce behavior as
    mysql_rwlock has no means of determining if a thread has a lock
  */
  void set_initialized(bool init) { m_initialized = init; }

 private:
  mysql_rwlock_t m_rwlock;
  bool m_initialized;
};

// file name indicating RocksDB data corruption
std::string rdb_corruption_marker_file_name();

}  // namespace myrocks