1 /*
2 Copyright (c) 2012,2013 Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16 #pragma once
17
18 #ifdef USE_PRAGMA_INTERFACE
19 #pragma interface /* gcc class implementation */
20 #endif
21
22 /* C++ standard header files */
23 #include <cinttypes>
24 #include <set>
25 #include <string>
26 #include <unordered_set>
27 #include <vector>
28
29 /* MySQL header files */
30 #include "./handler.h" /* handler */
31 #include "./my_global.h" /* ulonglong */
32 #include "./sql_string.h"
33 #include "./ib_ut0counter.h"
34
35 /* RocksDB header files */
36 #include "rocksdb/cache.h"
37 #include "rocksdb/perf_context.h"
38 #include "rocksdb/statistics.h"
39 #include "rocksdb/utilities/options_util.h"
40 #include "rocksdb/utilities/transaction_db.h"
41 #include "rocksdb/utilities/write_batch_with_index.h"
42
43 /* MyRocks header files */
44 #include "./rdb_comparator.h"
45 #include "./rdb_index_merge.h"
46 #include "./rdb_perf_context.h"
47 #include "./rdb_sst_info.h"
48 #include "./rdb_utils.h"
49
/**
  @note MyRocks Coding Conventions:
  MyRocks code follows the baseline MySQL coding conventions, available at
  http://dev.mysql.com/doc/internals/en/coding-guidelines.html, with several
  refinements (@see /storage/rocksdb/README file).
*/
63
64 namespace myrocks {
65
/*
 * class for exporting transaction information for
 * information_schema.rocksdb_trx
 * One instance describes one active transaction; instances are produced by
 * rdb_get_all_trx_info() below. Field names mirror the I_S table columns.
 */
struct Rdb_trx_info {
  std::string name;           // transaction name, if one was assigned
  ulonglong trx_id;
  ulonglong write_count;
  ulonglong lock_count;
  int timeout_sec;            // lock-wait timeout, in seconds
  std::string state;
  std::string waiting_key;    // key this trx is waiting on, if blocked
  ulonglong waiting_cf_id;    // column family id of waiting_key
  int is_replication;
  int skip_trx_api;
  int read_only;
  int deadlock_detect;
  int num_ongoing_bulk_load;
  ulong thread_id;            // server connection (thread) id
  std::string query_str;      // text of the currently running query
};
87
/* Collects a snapshot of all active RocksDB transactions (for I_S). */
std::vector<Rdb_trx_info> rdb_get_all_trx_info();
89
/*
  This is
  - the name of the default Column Family (the CF which stores indexes which
    didn't explicitly specify which CF they are in)
  - the name used to set the default column family parameter for per-cf
    arguments.
*/
const char *const DEFAULT_CF_NAME = "default";

/*
  This is the name of the Column Family used for storing the data dictionary.
*/
const char *const DEFAULT_SYSTEM_CF_NAME = "__system__";

/*
  This is the name of the hidden primary key for tables with no pk.
  (Its storage size is ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN, defined below.)
*/
const char *const HIDDEN_PK_NAME = "HIDDEN_PK_ID";
108
/*
  Column family name which means "put this index into its own column family".
  See Rdb_cf_manager::get_per_index_cf_name().
  Names starting with '$' are reserved; looks_like_per_index_cf_typo() below
  flags likely misspellings of this name.
*/
const char *const PER_INDEX_CF_NAME = "$per_index_cf";

/*
  Name for the background thread.
*/
const char *const BG_THREAD_NAME = "myrocks-bg";

/*
  Name for the drop index thread.
*/
const char *const INDEX_THREAD_NAME = "myrocks-index";

/*
  Default, minimal valid, and maximum valid sampling rate values (in percent)
  when collecting statistics about a table.
*/
#define RDB_DEFAULT_TBL_STATS_SAMPLE_PCT 10
#define RDB_TBL_STATS_SAMPLE_PCT_MIN 1
#define RDB_TBL_STATS_SAMPLE_PCT_MAX 100

/*
  Default and maximum values for rocksdb-compaction-sequential-deletes and
  rocksdb-compaction-sequential-deletes-window to add basic boundary checking.
*/
#define DEFAULT_COMPACTION_SEQUENTIAL_DELETES 0
#define MAX_COMPACTION_SEQUENTIAL_DELETES 2000000

#define DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW 0
#define MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW 2000000

/*
  Default and maximum values for various compaction and flushing related
  options. Numbers are based on the hardware we currently use and our internal
  benchmarks which indicate that parallelization helps with the speed of
  compactions.

  Ideally of course we'll use heuristic technique to determine the number of
  CPU-s and derive the values from there. This however has its own set of
  problems and we'll choose simplicity for now.
*/
#define MAX_BACKGROUND_COMPACTIONS 64
#define MAX_BACKGROUND_FLUSHES 64

#define DEFAULT_SUBCOMPACTIONS 1
#define MAX_SUBCOMPACTIONS 64

/*
  Defines the field sizes for serializing XID object to a string
  representation.
  string byte format: [field_size: field_value, ...]
  [
    8: XID.formatID,
    1: XID.gtrid_length,
    1: XID.bqual_length,
    XID.gtrid_length + XID.bqual_length: XID.data
  ]
*/
#define RDB_FORMATID_SZ 8
#define RDB_GTRID_SZ 1
#define RDB_BQUAL_SZ 1
/* Total size of the fixed-width header that precedes XID.data */
#define RDB_XIDHDR_LEN (RDB_FORMATID_SZ + RDB_GTRID_SZ + RDB_BQUAL_SZ)

/*
  To fix an unhandled exception we specify the upper bound as LONGLONGMAX
  instead of ULONGLONGMAX because the latter is -1 and causes an exception
  when cast to jlong (signed) of JNI

  The reason behind the cast issue is the lack of unsigned int support in
  Java.
*/
#define MAX_RATE_LIMITER_BYTES_PER_SEC static_cast<uint64_t>(LONGLONG_MAX)

/*
  Hidden PK column (for tables with no primary key) is a longlong (aka 8
  bytes). static_assert() in code will validate this assumption.
*/
#define ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN sizeof(longlong)

/*
  MyRocks specific error codes. NB! Please make sure that you will update
  HA_ERR_ROCKSDB_LAST when adding new ones.
*/
#define HA_ERR_ROCKSDB_UNIQUE_NOT_SUPPORTED (HA_ERR_LAST + 1)
#define HA_ERR_ROCKSDB_PK_REQUIRED (HA_ERR_LAST + 2)
#define HA_ERR_ROCKSDB_TOO_MANY_LOCKS (HA_ERR_LAST + 3)
#define HA_ERR_ROCKSDB_LAST HA_ERR_ROCKSDB_TOO_MANY_LOCKS

/*
  Heuristic check for a misspelled attempt to use the reserved per-index CF
  name: the name starts with the reserved '$' prefix but is not exactly
  PER_INDEX_CF_NAME.

  @param name  column family name to check (may be nullptr)
  @return true if the name looks like a typo of PER_INDEX_CF_NAME
*/
inline bool looks_like_per_index_cf_typo(const char *const name) {
  return (name && name[0] == '$' && strcmp(name, PER_INDEX_CF_NAME) != 0);
}
201
/**
  @brief
  Rdb_table_handler is a reference-counted structure storing information for
  each open table. All the objects are stored in a global hash map.

  //TODO: join this with Rdb_tbl_def ?
*/
struct Rdb_table_handler {
  char *m_table_name;        // name of the table this handler is for
  uint m_table_name_length;  // length of m_table_name
  int m_ref_count;           // number of users sharing this object

  my_core::THR_LOCK m_thr_lock; ///< MySQL latch needed by m_db_lock

  /* Stores cumulative table statistics */
  my_io_perf_atomic_t m_io_perf_read;
  Rdb_atomic_perf_counters m_table_perf_context;
};
220
/* Forward declarations of MyRocks classes referenced below. */
class Rdb_key_def;
class Rdb_tbl_def;
class Rdb_transaction;
class Rdb_transaction_impl;
class Rdb_writebatch_impl;
class Rdb_field_encoder;

/* Engine name, e.g. as reported by ha_rocksdb::table_type(). */
const char *const rocksdb_hton_name = "ROCKSDB";
229
/*
  Globally-unique id of an index: (column family id, index id).
  Ordered lexicographically: first by cf_id, then by index_id.
*/
typedef struct _gl_index_id_s {
  uint32_t cf_id;
  uint32_t index_id;

  bool operator==(const struct _gl_index_id_s &other) const {
    return cf_id == other.cf_id && index_id == other.index_id;
  }
  bool operator!=(const struct _gl_index_id_s &other) const {
    return !(*this == other);
  }
  bool operator<(const struct _gl_index_id_s &other) const {
    if (cf_id != other.cf_id) return cf_id < other.cf_id;
    return index_id < other.index_id;
  }
  // The remaining comparisons are all derived from operator< for a
  // strict total order.
  bool operator<=(const struct _gl_index_id_s &other) const {
    return !(other < *this);
  }
  bool operator>(const struct _gl_index_id_s &other) const {
    return other < *this;
  }
  bool operator>=(const struct _gl_index_id_s &other) const {
    return !(*this < other);
  }
} GL_INDEX_ID;
256
/* Row-operation counter types; used to index the stats arrays below. */
enum operation_type {
  ROWS_DELETED = 0,
  ROWS_INSERTED,
  ROWS_READ,
  ROWS_UPDATED,
  ROWS_MAX  // count of entries; used as array size, not a real operation
};
264
/*
  Indexer policy for ib_counter_t: use the scheduler-based indexer when
  sched_getcpu() is available, otherwise fall back to a thread-id based one.
  (See ib_ut0counter.h for the policy classes.)
*/
#if defined(HAVE_SCHED_GETCPU)
#define RDB_INDEXER get_sched_indexer_t
#else
#define RDB_INDEXER thread_id_indexer_t
#endif

/* Global statistics struct used inside MyRocks */
struct st_global_stats {
  // One sharded counter per operation_type (indexed by ROWS_*).
  ib_counter_t<ulonglong, 64, RDB_INDEXER> rows[ROWS_MAX];

  // system_rows_ stats are only for system
  // tables. They are not counted in rows_* stats.
  ib_counter_t<ulonglong, 64, RDB_INDEXER> system_rows[ROWS_MAX];
};
279
/*
  Struct used for exporting status to MySQL.
  A flattened snapshot of st_global_stats: one plain field per
  (rows | system_rows) x operation_type combination.
*/
struct st_export_stats {
  ulonglong rows_deleted;
  ulonglong rows_inserted;
  ulonglong rows_read;
  ulonglong rows_updated;

  ulonglong system_rows_deleted;
  ulonglong system_rows_inserted;
  ulonglong system_rows_read;
  ulonglong system_rows_updated;
};
292
293 } // namespace myrocks
294
295 #include "./rdb_buff.h"
296
297 /* Provide hash function for GL_INDEX_ID so we can include it in sets */
298 namespace std {
299 template <> struct hash<myrocks::GL_INDEX_ID> {
300 std::size_t operator()(const myrocks::GL_INDEX_ID &gl_index_id) const {
301 const uint64_t val =
302 ((uint64_t)gl_index_id.cf_id << 32 | (uint64_t)gl_index_id.index_id);
303 return std::hash<uint64_t>()(val);
304 }
305 };
306 } // namespace std
307
308 namespace myrocks {
309
310 /**
311 @brief
312 Class definition for ROCKSDB storage engine plugin handler
313 */
314
315 class ha_rocksdb : public my_core::handler {
316 my_core::THR_LOCK_DATA m_db_lock; ///< MySQL database lock
317
318 Rdb_table_handler *m_table_handler; ///< Open table handler
319
320 /* Iterator used for range scans and for full table/index scans */
321 rocksdb::Iterator *m_scan_it;
322
323 /* Whether m_scan_it was created with skip_bloom=true */
324 bool m_scan_it_skips_bloom;
325
326 const rocksdb::Snapshot *m_scan_it_snapshot;
327
328 Rdb_tbl_def *m_tbl_def;
329
330 /* Primary Key encoder from KeyTupleFormat to StorageFormat */
331 std::shared_ptr<Rdb_key_def> m_pk_descr;
332
333 /* Array of index descriptors */
334 std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
335
336 bool check_keyread_allowed(uint inx, uint part, bool all_parts) const;
337
338 /*
339 Number of key parts in PK. This is the same as
340 table->key_info[table->s->primary_key].keyparts
341 */
342 uint m_pk_key_parts;
343
344 /*
345 TRUE <=> Primary Key columns can be decoded from the index
346 */
347 mutable bool m_pk_can_be_decoded;
348
349 /*
350 TRUE <=> Some fields in the PK may require unpack_info.
351 */
352 bool m_maybe_unpack_info;
353
354 uchar *m_pk_tuple; /* Buffer for storing PK in KeyTupleFormat */
355 uchar *m_pk_packed_tuple; /* Buffer for storing PK in StorageFormat */
356 // ^^ todo: change it to 'char*'? TODO: ^ can we join this with last_rowkey?
357
358 /*
359 Temporary buffers for storing the key part of the Key/Value pair
360 for secondary indexes.
361 */
362 uchar *m_sk_packed_tuple;
363
364 /*
365 Temporary buffers for storing end key part of the Key/Value pair.
366 This is used for range scan only.
367 */
368 uchar *m_end_key_packed_tuple;
369
370 Rdb_string_writer m_sk_tails;
371 Rdb_string_writer m_pk_unpack_info;
372
373 /*
374 ha_rockdb->index_read_map(.. HA_READ_KEY_EXACT or similar) will save here
375 mem-comparable form of the index lookup tuple.
376 */
377 uchar *m_sk_match_prefix;
378 uint m_sk_match_length;
379
380 /* Buffer space for the above */
381 uchar *m_sk_match_prefix_buf;
382
383 /* Second buffers, used by UPDATE. */
384 uchar *m_sk_packed_tuple_old;
385 Rdb_string_writer m_sk_tails_old;
386
387 /* Buffers used for duplicate checking during unique_index_creation */
388 uchar *m_dup_sk_packed_tuple;
389 uchar *m_dup_sk_packed_tuple_old;
390
391 /*
392 Temporary space for packing VARCHARs (we provide it to
393 pack_record()/pack_index_tuple() calls).
394 */
395 uchar *m_pack_buffer;
396
397 /* rowkey of the last record we've read, in StorageFormat. */
398 String m_last_rowkey;
399
400 /* Buffer used by convert_record_to_storage_format() */
401 String m_storage_record;
402
403 /*
404 Last retrieved record, in table->record[0] data format.
405
406 This is used only when we get the record with rocksdb's Get() call (The
407 other option is when we get a rocksdb::Slice from an iterator)
408 */
409 std::string m_retrieved_record;
410
411 /* Type of locking to apply to rows */
412 enum { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE } m_lock_rows;
413
414 /* TRUE means we're doing an index-only read. FALSE means otherwise. */
415 bool m_keyread_only;
416
417 bool m_skip_scan_it_next_call;
418
419 /* TRUE means we are accessing the first row after a snapshot was created */
420 bool m_rnd_scan_is_new_snapshot;
421
422 /* TRUE means the replication slave will use Read Free Replication */
423 bool m_use_read_free_rpl;
424
425 /**
426 @brief
427 This is a bitmap of indexes (i.e. a set) whose keys (in future, values) may
428 be changed by this statement. Indexes that are not in the bitmap do not need
429 to be updated.
430 @note Valid inside UPDATE statements, IIF(m_update_scope_is_valid == true).
431 */
432 my_core::key_map m_update_scope;
433 bool m_update_scope_is_valid;
434
435 /* SST information used for bulk loading the primary key */
436 std::shared_ptr<Rdb_sst_info> m_sst_info;
437 Rdb_transaction *m_bulk_load_tx;
438 /* Mutex to protect finalizing bulk load */
439 mysql_mutex_t m_bulk_load_mutex;
440
441 /*
442 MySQL index number for duplicate key error
443 */
444 int m_dupp_errkey;
445
446 int create_key_defs(const TABLE *const table_arg,
447 Rdb_tbl_def *const tbl_def_arg,
448 const TABLE *const old_table_arg = nullptr,
449 const Rdb_tbl_def *const old_tbl_def_arg = nullptr) const
450 __attribute__((__nonnull__(2, 3), __warn_unused_result__));
451 int secondary_index_read(const int keyno, uchar *const buf)
452 __attribute__((__nonnull__, __warn_unused_result__));
453 void setup_iterator_for_rnd_scan();
454 void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *const slice)
455 __attribute__((__nonnull__)) {
456 setup_scan_iterator(kd, slice, false, false, 0);
457 }
458 bool is_ascending(const Rdb_key_def &keydef,
459 enum ha_rkey_function find_flag) const
460 __attribute__((__nonnull__, __warn_unused_result__));
461 void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice,
462 const bool use_all_keys, const bool is_ascending,
463 const uint eq_cond_len) __attribute__((__nonnull__));
464 void release_scan_iterator(void);
465
466 rocksdb::Status
467 get_for_update(Rdb_transaction *const tx,
468 rocksdb::ColumnFamilyHandle *const column_family,
469 const rocksdb::Slice &key, std::string *const value) const;
470
471 int get_row_by_rowid(uchar *const buf, const char *const rowid,
472 const uint rowid_size)
473 __attribute__((__nonnull__, __warn_unused_result__));
474 int get_row_by_rowid(uchar *const buf, const uchar *const rowid,
475 const uint rowid_size)
476 __attribute__((__nonnull__, __warn_unused_result__)) {
477 return get_row_by_rowid(buf, reinterpret_cast<const char *>(rowid),
478 rowid_size);
479 }
480
481 void update_auto_incr_val();
482 void load_auto_incr_value();
483 longlong update_hidden_pk_val();
484 int load_hidden_pk_value() __attribute__((__warn_unused_result__));
485 int read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id)
486 __attribute__((__nonnull__, __warn_unused_result__));
487 bool can_use_single_delete(const uint &index) const
488 __attribute__((__warn_unused_result__));
489 bool skip_unique_check() const __attribute__((__warn_unused_result__));
490 bool commit_in_the_middle() __attribute__((__warn_unused_result__));
491 bool do_bulk_commit(Rdb_transaction *const tx)
492 __attribute__((__nonnull__, __warn_unused_result__));
493 bool has_hidden_pk(const TABLE *const table) const
494 __attribute__((__nonnull__, __warn_unused_result__));
495
496 void update_row_stats(const operation_type &type);
497
498 void set_last_rowkey(const uchar *const old_data);
499
500 /*
501 Array of table->s->fields elements telling how to store fields in the
502 record.
503 */
504 Rdb_field_encoder *m_encoder_arr;
505
506 /* Describes instructions on how to decode the field */
507 class READ_FIELD {
508 public:
509 /* Points to Rdb_field_encoder describing the field */
510 Rdb_field_encoder *m_field_enc;
511 /* if true, decode the field, otherwise skip it */
512 bool m_decode;
513 /* Skip this many bytes before reading (or skipping) this field */
514 int m_skip;
515 };
516
517 /*
518 This tells which table fields should be decoded (or skipped) when
519 decoding table row from (pk, encoded_row) pair. (Secondary keys are
520 just always decoded in full currently)
521 */
522 std::vector<READ_FIELD> m_decoders_vect;
523
524 /* Setup field_decoders based on type of scan and table->read_set */
525 void setup_read_decoders();
526
527 /*
528 Number of bytes in on-disk (storage) record format that are used for
529 storing SQL NULL flags.
530 */
531 uint m_null_bytes_in_rec;
532
533 void get_storage_type(Rdb_field_encoder *const encoder, const uint &kp);
534 void setup_field_converters();
535 int alloc_key_buffers(const TABLE *const table_arg,
536 const Rdb_tbl_def *const tbl_def_arg,
537 bool alloc_alter_buffers = false)
538 __attribute__((__nonnull__, __warn_unused_result__));
539 void free_key_buffers();
540
541 // the buffer size should be at least 2*Rdb_key_def::INDEX_NUMBER_SIZE
542 rocksdb::Range get_range(const int &i, uchar buf[]) const;
543
544 /*
545 Perf timers for data reads
546 */
547 Rdb_io_perf m_io_perf;
548
549 /*
550 A counter of how many row checksums were checked for this table. Note that
551 this does not include checksums for secondary index entries.
552 */
553 my_core::ha_rows m_row_checksums_checked;
554
555 /*
556 Update stats
557 */
558 void update_stats(void);
559
560 public:
561 /*
562 Controls whether writes include checksums. This is updated from the session
563 variable
564 at the start of each query.
565 */
566 bool m_store_row_debug_checksums;
567
568 /* Same as above but for verifying checksums when reading */
569 bool m_verify_row_debug_checksums;
570 int m_checksums_pct;
571
  ha_rocksdb(my_core::handlerton *const hton,
             my_core::TABLE_SHARE *const table_arg);
  ~ha_rocksdb() {
    // Make sure any in-flight bulk load is finalized before the handler
    // goes away; a non-zero result is a bug (checked in debug builds).
    int err __attribute__((__unused__));
    err = finalize_bulk_load();
    DBUG_ASSERT(err == 0);
    mysql_mutex_destroy(&m_bulk_load_mutex);
  }
580
  /** @brief
    The name that will be used for display purposes.
  */
  const char *table_type() const override {
    DBUG_ENTER_FUNC();

    // Always "ROCKSDB" (see rocksdb_hton_name above).
    DBUG_RETURN(rocksdb_hton_name);
  }
589
  /* The following is only used by SHOW KEYS: */
  const char *index_type(uint inx) override {
    DBUG_ENTER_FUNC();

    // Reported for every index regardless of 'inx': all MyRocks indexes
    // live in RocksDB LSM trees.
    DBUG_RETURN("LSMTREE");
  }
596
597 /** @brief
598 The file extensions.
599 */
600 const char **bas_ext() const override;
601
602 /*
603 See if this is the same base table - this should only be true for different
604 partitions of the same table.
605 */
606 bool same_table(const ha_rocksdb &other) const;
607
  /** @brief
    This is a list of flags that indicate what functionality the storage engine
    implements. The current table flags are documented in handler.h
  */
  ulonglong table_flags() const override {
    DBUG_ENTER_FUNC();

    /*
      HA_BINLOG_STMT_CAPABLE
        We are saying that this engine is just statement capable to have
        an engine that can only handle statement-based logging. This is
        used in testing.
      HA_REC_NOT_IN_SEQ
        If we don't set it, filesort crashes, because it assumes rowids are
        1..8 byte numbers
    */
    // HA_PRIMARY_KEY_IN_READ_INDEX is advertised only when the PK columns
    // can actually be decoded from the index (m_pk_can_be_decoded).
    DBUG_RETURN(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
                HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
                (m_pk_can_be_decoded ? HA_PRIMARY_KEY_IN_READ_INDEX : 0) |
                HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
                HA_PARTIAL_COLUMN_READ);
  }
630
631 bool init_with_fields() override;
632
633 /** @brief
634 This is a bitmap of flags that indicates how the storage engine
635 implements indexes. The current index flags are documented in
636 handler.h. If you do not implement indexes, just return zero here.
637
638 @details
639 part is the key part to check. First key part is 0.
640 If all_parts is set, MySQL wants to know the flags for the combined
641 index, up to and including 'part'.
642 */
643 ulong index_flags(uint inx, uint part, bool all_parts) const override;
644
  /* Any index may be used for scanning. */
  const key_map *keys_to_use_for_scanning() override {
    DBUG_ENTER_FUNC();

    DBUG_RETURN(&key_map_full);
  }
650
  bool primary_key_is_clustered() override {
    DBUG_ENTER_FUNC();

    // The row data is stored with the primary key (see
    // convert_record_to_storage_format()), so the PK is clustered.
    DBUG_RETURN(true);
  }
656
  /*
    Decide whether to write a debug checksum for the current row: checksums
    must be enabled for the session, and rows are sampled so roughly
    m_checksums_pct percent of them get one.
    NOTE(review): uses rand() — distribution/thread-safety depend on the
    platform's rand(); confirm this is acceptable here.
  */
  bool should_store_row_debug_checksums() const {
    return m_store_row_debug_checksums && (rand() % 100 < m_checksums_pct);
  }
660
661 int rename_table(const char *const from, const char *const to) override
662 __attribute__((__nonnull__, __warn_unused_result__));
663
664 int convert_record_from_storage_format(const rocksdb::Slice *const key,
665 const rocksdb::Slice *const value,
666 uchar *const buf)
667 __attribute__((__nonnull__, __warn_unused_result__));
668
669 int convert_record_from_storage_format(const rocksdb::Slice *const key,
670 uchar *const buf)
671 __attribute__((__nonnull__, __warn_unused_result__));
672
673 void convert_record_to_storage_format(const rocksdb::Slice &pk_packed_slice,
674 Rdb_string_writer *const pk_unpack_info,
675 rocksdb::Slice *const packed_rec)
676 __attribute__((__nonnull__));
677
678 static const char *get_key_name(const uint index,
679 const TABLE *const table_arg,
680 const Rdb_tbl_def *const tbl_def_arg)
681 __attribute__((__nonnull__, __warn_unused_result__));
682
683 static const char *get_key_comment(const uint index,
684 const TABLE *const table_arg,
685 const Rdb_tbl_def *const tbl_def_arg)
686 __attribute__((__nonnull__, __warn_unused_result__));
687
688 static bool is_hidden_pk(const uint index, const TABLE *const table_arg,
689 const Rdb_tbl_def *const tbl_def_arg)
690 __attribute__((__nonnull__, __warn_unused_result__));
691
692 static uint pk_index(const TABLE *const table_arg,
693 const Rdb_tbl_def *const tbl_def_arg)
694 __attribute__((__nonnull__, __warn_unused_result__));
695
696 static bool is_pk(const uint index, const TABLE *table_arg,
697 const Rdb_tbl_def *tbl_def_arg)
698 __attribute__((__nonnull__, __warn_unused_result__));
699
  /** @brief
    unireg.cc will call max_supported_record_length(), max_supported_keys(),
    max_supported_key_parts(), uint max_supported_key_length()
    to make sure that the storage engine can handle the data it is about to
    send. Return *real* limits of your storage engine here; MySQL will do
    min(your_limits, MySQL_limits) automatically.
  */
  uint max_supported_record_length() const override {
    DBUG_ENTER_FUNC();

    // Engine-side cap only; the server applies min() with its own limit.
    DBUG_RETURN(HA_MAX_REC_LENGTH);
  }
712
  /* Up to the server-wide maximum number of indexes per table. */
  uint max_supported_keys() const override {
    DBUG_ENTER_FUNC();

    DBUG_RETURN(MAX_INDEXES);
  }
718
  /* Up to the server-wide maximum number of columns per index. */
  uint max_supported_key_parts() const override {
    DBUG_ENTER_FUNC();

    DBUG_RETURN(MAX_REF_PARTS);
  }
724
  uint max_supported_key_part_length() const override {
    DBUG_ENTER_FUNC();

    // Hard-coded cap for a single key part — presumably bytes; TODO confirm.
    DBUG_RETURN(2048);
  }
730
  /** @brief
    unireg.cc will call this to make sure that the storage engine can handle
    the data it is about to send. Return *real* limits of your storage engine
    here; MySQL will do min(your_limits, MySQL_limits) automatically.

    @details
    There is no need to implement ..._key_... methods if your engine doesn't
    support indexes.
  */
  uint max_supported_key_length() const override {
    DBUG_ENTER_FUNC();

    // 16KB — an arbitrary, generous cap.
    DBUG_RETURN(16 * 1024); /* just to return something*/
  }
745
  /**
    TODO: return actual upper bound of number of records in the table.
    (e.g. save number of records seen on full table scan and/or use file size
    as upper bound)
  */
  ha_rows estimate_rows_upper_bound() override {
    DBUG_ENTER_FUNC();

    // HA_POS_ERROR tells the caller the estimate is unavailable.
    DBUG_RETURN(HA_POS_ERROR);
  }
756
757 /* At the moment, we're ok with default handler::index_init() implementation.
758 */
759 int index_read_map(uchar *const buf, const uchar *const key,
760 key_part_map keypart_map,
761 enum ha_rkey_function find_flag) override
762 __attribute__((__warn_unused_result__));
763
764 int index_read_map_impl(uchar *const buf, const uchar *const key,
765 key_part_map keypart_map,
766 enum ha_rkey_function find_flag,
767 const key_range *end_key)
768 __attribute__((__warn_unused_result__));
769
770 int index_read_last_map(uchar *const buf, const uchar *const key,
771 key_part_map keypart_map) override
772 __attribute__((__warn_unused_result__));
773
774 int read_range_first(const key_range *const start_key,
775 const key_range *const end_key, bool eq_range,
776 bool sorted) override
777 __attribute__((__warn_unused_result__));
778
  /*
    Rough cost model for a full table scan: one cost unit per 20 rows plus a
    fixed startup cost. Deleted rows are included in the count — presumably
    because they must still be iterated over; TODO confirm.
  */
  virtual double scan_time() override {
    DBUG_ENTER_FUNC();

    DBUG_RETURN(
        static_cast<double>((stats.records + stats.deleted) / 20.0 + 10));
  }
785
786 virtual double read_time(uint, uint, ha_rows rows) override;
787
788 int open(const char *const name, int mode, uint test_if_locked) override
789 __attribute__((__warn_unused_result__));
790 int close(void) override __attribute__((__warn_unused_result__));
791
792 int write_row(uchar *const buf) override
793 __attribute__((__warn_unused_result__));
794 int update_row(const uchar *const old_data, uchar *const new_data) override
795 __attribute__((__warn_unused_result__));
796 int delete_row(const uchar *const buf) override
797 __attribute__((__warn_unused_result__));
798 rocksdb::Status delete_or_singledelete(uint index, Rdb_transaction *const tx,
799 rocksdb::ColumnFamilyHandle *const cf,
800 const rocksdb::Slice &key)
801 __attribute__((__warn_unused_result__));
802
803 int index_next(uchar *const buf) override
804 __attribute__((__warn_unused_result__));
805 int index_next_with_direction(uchar *const buf, bool move_forward)
806 __attribute__((__warn_unused_result__));
807 int index_prev(uchar *const buf) override
808 __attribute__((__warn_unused_result__));
809
810 int index_first(uchar *const buf) override
811 __attribute__((__warn_unused_result__));
812 int index_last(uchar *const buf) override
813 __attribute__((__warn_unused_result__));
814
815 class Item *idx_cond_push(uint keyno, class Item *const idx_cond) override;
816 /*
817 Default implementation from cancel_pushed_idx_cond() suits us
818 */
819 private:
  /*
    Column family info resolved for one key definition; filled in by
    create_cfs() and consumed by create_key_def() / create_inplace_key_defs()
    below.
  */
  struct key_def_cf_info {
    rocksdb::ColumnFamilyHandle *cf_handle;
    bool is_reverse_cf;  // CF stores keys in reverse order
    bool is_auto_cf;     // CF was chosen automatically (per-index CF)
  };
825
  /*
    Bundle of the inputs that update_write_row() and its helpers need
    (see get_pk_for_update(), check_and_lock_unique_pk(), update_pk(),
    update_sk(), update_indexes() below, which all take this struct).
  */
  struct update_row_info {
    Rdb_transaction *tx;
    const uchar *new_data;  // new row image being written
    const uchar *old_data;  // previous row image (UPDATE only)
    rocksdb::Slice new_pk_slice;
    rocksdb::Slice old_pk_slice;

    // "unpack_info" data for the new PK value
    Rdb_string_writer *new_pk_unpack_info;

    longlong hidden_pk_id;
    bool skip_unique_check;
  };
839
  /*
    Used to check for duplicate entries during fast unique secondary index
    creation.
  */
  struct unique_sk_buf_info {
    bool sk_buf_switch = false;        // which of the two buffers is current
    rocksdb::Slice sk_memcmp_key;      // memcmp form of the current key
    rocksdb::Slice sk_memcmp_key_old;  // memcmp form of the previous key
    uchar *dup_sk_buf;                 // backing storage, buffer A
    uchar *dup_sk_buf_old;             // backing storage, buffer B

    /*
      This method is meant to be called back to back during inplace creation
      of unique indexes. It will switch between two buffers, which
      will each store the memcmp form of secondary keys, which are then
      converted to slices in sk_memcmp_key or sk_memcmp_key_old.

      Switching buffers on each iteration allows us to retain the
      sk_memcmp_key_old value for duplicate comparison.
    */
    inline uchar *swap_and_get_sk_buf() {
      sk_buf_switch = !sk_buf_switch;
      return sk_buf_switch ? dup_sk_buf : dup_sk_buf_old;
    }
  };
865
866 int create_cfs(const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
867 std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs)
868 const __attribute__((__nonnull__, __warn_unused_result__));
869
870 int create_key_def(const TABLE *const table_arg, const uint &i,
871 const Rdb_tbl_def *const tbl_def_arg,
872 std::shared_ptr<Rdb_key_def> *const new_key_def,
873 const struct key_def_cf_info &cf_info) const
874 __attribute__((__nonnull__, __warn_unused_result__));
875
876 int create_inplace_key_defs(
877 const TABLE *const table_arg, Rdb_tbl_def *vtbl_def_arg,
878 const TABLE *const old_table_arg,
879 const Rdb_tbl_def *const old_tbl_def_arg,
880 const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const
881 __attribute__((__nonnull__, __warn_unused_result__));
882
883 std::unordered_map<std::string, uint>
884 get_old_key_positions(const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg,
885 const TABLE *old_table_arg,
886 const Rdb_tbl_def *old_tbl_def_arg) const
887 __attribute__((__nonnull__));
888
889 int compare_key_parts(const KEY *const old_key,
890 const KEY *const new_key) const;
891 __attribute__((__nonnull__, __warn_unused_result__));
892
893 int index_first_intern(uchar *buf)
894 __attribute__((__nonnull__, __warn_unused_result__));
895 int index_last_intern(uchar *buf)
896 __attribute__((__nonnull__, __warn_unused_result__));
897
898 enum icp_result check_index_cond() const;
899 int find_icp_matching_index_rec(const bool &move_forward, uchar *const buf)
900 __attribute__((__nonnull__, __warn_unused_result__));
901
902 void calc_updated_indexes();
903 int update_write_row(const uchar *const old_data, const uchar *const new_data,
904 const bool skip_unique_check)
905 __attribute__((__warn_unused_result__));
906 int get_pk_for_update(struct update_row_info *const row_info);
907 int check_and_lock_unique_pk(const uint &key_id,
908 const struct update_row_info &row_info,
909 bool *const found, bool *const pk_changed)
910 __attribute__((__warn_unused_result__));
911 int check_and_lock_sk(const uint &key_id,
912 const struct update_row_info &row_info,
913 bool *const found) const
914 __attribute__((__warn_unused_result__));
915 int check_uniqueness_and_lock(const struct update_row_info &row_info,
916 bool *const pk_changed)
917 __attribute__((__warn_unused_result__));
918 bool over_bulk_load_threshold(int *err)
919 __attribute__((__warn_unused_result__));
920 int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &index,
921 const rocksdb::Slice *key,
922 struct unique_sk_buf_info *sk_info)
923 __attribute__((__nonnull__, __warn_unused_result__));
924 int bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
925 const rocksdb::Slice &key, const rocksdb::Slice &value)
926 __attribute__((__nonnull__, __warn_unused_result__));
927 int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info,
928 const bool &pk_changed) __attribute__((__warn_unused_result__));
929 int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
930 const struct update_row_info &row_info)
931 __attribute__((__warn_unused_result__));
932 int update_indexes(const struct update_row_info &row_info,
933 const bool &pk_changed)
934 __attribute__((__warn_unused_result__));
935
  /*
    Read-path / iterator-positioning helpers.
    NOTE(review): declarations only; descriptions inferred from names --
    confirm against ha_rocksdb.cc.
  */

  /* Positions the iterator on exactly key_slice; using_full_key says
     whether the whole key (vs. a prefix) was packed into the slice. */
  int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter,
                     const bool &using_full_key,
                     const rocksdb::Slice &key_slice) const
      __attribute__((__nonnull__, __warn_unused_result__));
  /* Positions the scan iterator on the last entry before key_slice. */
  int read_before_key(const Rdb_key_def &kd, const bool &using_full_key,
                      const rocksdb::Slice &key_slice)
      __attribute__((__nonnull__, __warn_unused_result__));
  /* Positions the scan iterator at/after key_slice. */
  int read_after_key(const Rdb_key_def &kd, const bool &using_full_key,
                     const rocksdb::Slice &key_slice)
      __attribute__((__nonnull__, __warn_unused_result__));

  /* Translates a MySQL HA_READ_* find_flag into the appropriate iterator
     positioning; *move_forward reports the resulting scan direction. */
  int position_to_correct_key(
      const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
      const bool &full_key_match, const uchar *const key,
      const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
      bool *const move_forward) __attribute__((__warn_unused_result__));

  /* Decodes the current primary-key entry into buf (record format). */
  int read_row_from_primary_key(uchar *const buf)
      __attribute__((__nonnull__, __warn_unused_result__));
  /* Decodes a row reached through secondary key kd; presumably fetches the
     PK record when the SK is not covering. */
  int read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd,
                                  bool move_forward)
      __attribute__((__nonnull__, __warn_unused_result__));

  /* Computes the equality-condition prefix length used to decide when a
     range scan has left [start, end_key]; also packs the end key and
     returns its size through *end_key_packed_size. */
  int calc_eq_cond_len(const Rdb_key_def &kd,
                       const enum ha_rkey_function &find_flag,
                       const rocksdb::Slice &slice,
                       const int &bytes_changed_by_succ,
                       const key_range *const end_key,
                       uint *const end_key_packed_size)
      __attribute__((__warn_unused_result__));
966
  /* Looks up the Rdb_tbl_def for tablename; the name suggests it returns
     nullptr rather than erroring when the table is absent -- verify. */
  Rdb_tbl_def *get_table_if_exists(const char *const tablename)
      __attribute__((__nonnull__, __warn_unused_result__));
  /* Caches the session's rocksdb_* variables into this handler instance. */
  void read_thd_vars(THD *const thd) __attribute__((__nonnull__));
  /* Returns the session-configured temporary directory (used by the
     bulk-load/inplace-alter paths, presumably). */
  const char *thd_rocksdb_tmpdir()
      __attribute__((__nonnull__, __warn_unused_result__));

  /* True when the current DDL statement involves a foreign key --
     presumably used to reject unsupported FK DDL; confirm in caller. */
  bool contains_foreign_key(THD *const thd)
      __attribute__((__nonnull__, __warn_unused_result__));

  /* Populates the given set of newly added secondary indexes during
     inplace ALTER TABLE. */
  int inplace_populate_sk(
      TABLE *const table_arg,
      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes)
      __attribute__((__nonnull__, __warn_unused_result__));
980
 public:
  /* handler API: index-scan setup/teardown. */
  int index_init(uint idx, bool sorted) override
      __attribute__((__warn_unused_result__));
  int index_end() override __attribute__((__warn_unused_result__));

  void unlock_row() override;

  /** @brief
    Unlike index_init(), rnd_init() can be called two consecutive times
    without rnd_end() in between (it only makes sense if scan=1). In this
    case, the second call should prepare for the new table scan (e.g if
    rnd_init() allocates the cursor, the second call should position the
    cursor to the start of the table; no need to deallocate and allocate
    it again. This is a required method.
  */
  int rnd_init(bool scan) override __attribute__((__warn_unused_result__));
  int rnd_end() override __attribute__((__warn_unused_result__));

  int rnd_next(uchar *const buf) override
      __attribute__((__warn_unused_result__));
  /* Like rnd_next(), but can also scan backwards (move_forward == false);
     not part of the handler interface, hence no override. */
  int rnd_next_with_direction(uchar *const buf, bool move_forward)
      __attribute__((__warn_unused_result__));

  int rnd_pos(uchar *const buf, uchar *const pos) override
      __attribute__((__warn_unused_result__));
  void position(const uchar *const record) override;
  int info(uint) override;

  /* This function will always return success, therefore no annotation related
   * to checking the return value. Can't change the signature because it's
   * required by the interface. */
  int extra(enum ha_extra_function operation) override;

  int start_stmt(THD *const thd, thr_lock_type lock_type) override
      __attribute__((__warn_unused_result__));
  int external_lock(THD *const thd, int lock_type) override
      __attribute__((__warn_unused_result__));
  int truncate() override __attribute__((__warn_unused_result__));
  /* handler API: reset per-statement handler state. Only releases the
     buffer holding the last retrieved record (blob data lives in it, per
     the comment below); always reports success. */
  int reset() override {
    DBUG_ENTER_FUNC();

    /* Free blob data */
    m_retrieved_record.clear();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }
1028
  int check(THD *const thd, HA_CHECK_OPT *const check_opt) override
      __attribute__((__warn_unused_result__));
  /* Deletes every row belonging to tbl -- presumably backing the
     TRUNCATE/DROP paths; confirm in ha_rocksdb.cc. */
  void remove_rows(Rdb_tbl_def *const tbl);
  ha_rows records_in_range(uint inx, key_range *const min_key,
                           key_range *const max_key) override
      __attribute__((__warn_unused_result__));
  int delete_table(const char *const from) override
      __attribute__((__warn_unused_result__));
  int create(const char *const name, TABLE *const form,
             HA_CREATE_INFO *const create_info) override
      __attribute__((__warn_unused_result__));
  bool check_if_incompatible_data(HA_CREATE_INFO *const info,
                                  uint table_changes) override
      __attribute__((__warn_unused_result__));

  THR_LOCK_DATA **store_lock(THD *const thd, THR_LOCK_DATA **to,
                             enum thr_lock_type lock_type) override
      __attribute__((__warn_unused_result__));
1047
  /* handler API: query-cache registration hook. MyRocks does not support
     the query cache (see comment in the body), so this unconditionally
     answers FALSE ("do not cache") and leaves the out-parameters alone. */
  my_bool register_query_cache_table(THD *const thd, char *const table_key,
                                     uint key_length,
                                     qc_engine_callback *const engine_callback,
                                     ulonglong *const engine_data) override {
    DBUG_ENTER_FUNC();

    /* Currently, we don't support query cache */
    DBUG_RETURN(FALSE);
  }
1057
  bool get_error_message(const int error, String *const buf) override
      __attribute__((__nonnull__));

  void get_auto_increment(ulonglong offset, ulonglong increment,
                          ulonglong nb_desired_values,
                          ulonglong *const first_value,
                          ulonglong *const nb_reserved_values) override;
  void update_create_info(HA_CREATE_INFO *const create_info) override;
  int optimize(THD *const thd, HA_CHECK_OPT *const check_opt) override
      __attribute__((__warn_unused_result__));
  int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override
      __attribute__((__warn_unused_result__));
  /* Recomputes index statistics for table_arg -- presumably the shared
     implementation behind optimize()/analyze(); confirm in the .cc. */
  int calculate_stats(const TABLE *const table_arg, THD *const thd,
                      HA_CHECK_OPT *const check_opt)
      __attribute__((__warn_unused_result__));

  /* Inplace ALTER TABLE hooks. State is carried between the phases via
     Rdb_inplace_alter_ctx (declared further down in this file). */
  enum_alter_inplace_result check_if_supported_inplace_alter(
      TABLE *altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool prepare_inplace_alter_table(
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool inplace_alter_table(
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool
  commit_inplace_alter_table(TABLE *const altered_table,
                             my_core::Alter_inplace_info *const ha_alter_info,
                             bool commit) override;

  /* Finishes any pending bulk load; the return code must be checked. */
  int finalize_bulk_load() __attribute__((__warn_unused_result__));

  /* Enables read-free replication when the table matches the whitelist --
     presumably a table-name pattern list; verify against the caller. */
  void set_use_read_free_rpl(const char *const whitelist);
1094
1095 public:
1096 virtual void rpl_before_delete_rows() override;
1097 virtual void rpl_after_delete_rows() override;
1098 virtual void rpl_before_update_rows() override;
1099 virtual void rpl_after_update_rows() override;
1100 virtual bool use_read_free_rpl();
1101
1102 private:
1103 /* Flags tracking if we are inside different replication operation */
1104 bool m_in_rpl_delete_rows;
1105 bool m_in_rpl_update_rows;
1106
1107 };
1108
1109 /*
1110 Helper class for in-place alter, for storing handler context between inplace
1111 alter calls
1112 */
1113 struct Rdb_inplace_alter_ctx : public my_core::inplace_alter_handler_ctx {
1114 /* The new table definition */
1115 Rdb_tbl_def *const m_new_tdef;
1116
1117 /* Stores the original key definitions */
1118 std::shared_ptr<Rdb_key_def> *const m_old_key_descr;
1119
1120 /* Stores the new key definitions */
1121 std::shared_ptr<Rdb_key_def> *m_new_key_descr;
1122
1123 /* Stores the old number of key definitions */
1124 const uint m_old_n_keys;
1125
1126 /* Stores the new number of key definitions */
1127 const uint m_new_n_keys;
1128
1129 /* Stores the added key glids */
1130 const std::unordered_set<std::shared_ptr<Rdb_key_def>> m_added_indexes;
1131
1132 /* Stores the dropped key glids */
1133 const std::unordered_set<GL_INDEX_ID> m_dropped_index_ids;
1134
1135 /* Stores number of keys to add */
1136 const uint m_n_added_keys;
1137
1138 /* Stores number of keys to drop */
1139 const uint m_n_dropped_keys;
1140
1141 Rdb_inplace_alter_ctx(
1142 Rdb_tbl_def *new_tdef, std::shared_ptr<Rdb_key_def> *old_key_descr,
1143 std::shared_ptr<Rdb_key_def> *new_key_descr, uint old_n_keys,
1144 uint new_n_keys,
1145 std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes,
1146 std::unordered_set<GL_INDEX_ID> dropped_index_ids, uint n_added_keys,
1147 uint n_dropped_keys)
1148 : my_core::inplace_alter_handler_ctx(), m_new_tdef(new_tdef),
1149 m_old_key_descr(old_key_descr), m_new_key_descr(new_key_descr),
1150 m_old_n_keys(old_n_keys), m_new_n_keys(new_n_keys),
1151 m_added_indexes(added_indexes), m_dropped_index_ids(dropped_index_ids),
1152 m_n_added_keys(n_added_keys), m_n_dropped_keys(n_dropped_keys) {}
1153
1154 ~Rdb_inplace_alter_ctx() {}
1155
1156 private:
1157 /* Disable Copying */
1158 Rdb_inplace_alter_ctx(const Rdb_inplace_alter_ctx &);
1159 Rdb_inplace_alter_ctx &operator=(const Rdb_inplace_alter_ctx &);
1160 };
1161
1162 } // namespace myrocks
1163