1 /*
2    Copyright (c) 2012,2013 Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
16 #pragma once
17 
18 #ifdef USE_PRAGMA_INTERFACE
19 #pragma interface /* gcc class implementation */
20 #endif
21 
22 /* C++ standard header files */
23 #include <cinttypes>
24 #include <set>
25 #include <string>
26 #include <unordered_set>
27 #include <vector>
28 
29 /* MySQL header files */
30 #include "./handler.h"   /* handler */
31 #include "./my_global.h" /* ulonglong */
32 #include "./sql_string.h"
33 #include "./ib_ut0counter.h"
34 
35 /* RocksDB header files */
36 #include "rocksdb/cache.h"
37 #include "rocksdb/perf_context.h"
38 #include "rocksdb/statistics.h"
39 #include "rocksdb/utilities/options_util.h"
40 #include "rocksdb/utilities/transaction_db.h"
41 #include "rocksdb/utilities/write_batch_with_index.h"
42 
43 /* MyRocks header files */
44 #include "./rdb_comparator.h"
45 #include "./rdb_index_merge.h"
46 #include "./rdb_perf_context.h"
47 #include "./rdb_sst_info.h"
48 #include "./rdb_utils.h"
49 
/**
  @note MyRocks Coding Conventions:
  MyRocks code follows the baseline MySQL coding conventions, available at
  http://dev.mysql.com/doc/internals/en/coding-guidelines.html, with several
  refinements (@see /storage/rocksdb/README file).
*/
63 
64 namespace myrocks {
65 
/*
 * Struct for exporting transaction information to
 * information_schema.rocksdb_trx
 */
70 struct Rdb_trx_info {
71   std::string name;
72   ulonglong trx_id;
73   ulonglong write_count;
74   ulonglong lock_count;
75   int timeout_sec;
76   std::string state;
77   std::string waiting_key;
78   ulonglong waiting_cf_id;
79   int is_replication;
80   int skip_trx_api;
81   int read_only;
82   int deadlock_detect;
83   int num_ongoing_bulk_load;
84   ulong thread_id;
85   std::string query_str;
86 };
87 
88 std::vector<Rdb_trx_info> rdb_get_all_trx_info();
89 
90 /*
91   This is
92   - the name of the default Column Family (the CF which stores indexes which
93     didn't explicitly specify which CF they are in)
94   - the name used to set the default column family parameter for per-cf
95     arguments.
96 */
97 const char *const DEFAULT_CF_NAME = "default";
98 
99 /*
100   This is the name of the Column Family used for storing the data dictionary.
101 */
102 const char *const DEFAULT_SYSTEM_CF_NAME = "__system__";
103 
104 /*
105   This is the name of the hidden primary key for tables with no pk.
106 */
107 const char *const HIDDEN_PK_NAME = "HIDDEN_PK_ID";
108 
109 /*
110   Column family name which means "put this index into its own column family".
111   See Rdb_cf_manager::get_per_index_cf_name().
112 */
113 const char *const PER_INDEX_CF_NAME = "$per_index_cf";
114 
115 /*
116   Name for the background thread.
117 */
118 const char *const BG_THREAD_NAME = "myrocks-bg";
119 
120 /*
121   Name for the drop index thread.
122 */
123 const char *const INDEX_THREAD_NAME = "myrocks-index";
124 
125 /*
126   Default, minimal valid, and maximum valid sampling rate values when collecting
127   statistics about table.
128 */
129 #define RDB_DEFAULT_TBL_STATS_SAMPLE_PCT 10
130 #define RDB_TBL_STATS_SAMPLE_PCT_MIN 1
131 #define RDB_TBL_STATS_SAMPLE_PCT_MAX 100
132 
133 /*
134   Default and maximum values for rocksdb-compaction-sequential-deletes and
135   rocksdb-compaction-sequential-deletes-window to add basic boundary checking.
136 */
137 #define DEFAULT_COMPACTION_SEQUENTIAL_DELETES 0
138 #define MAX_COMPACTION_SEQUENTIAL_DELETES 2000000
139 
140 #define DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW 0
141 #define MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW 2000000
142 
143 /*
144   Default and maximum values for various compaction and flushing related
145   options. Numbers are based on the hardware we currently use and our internal
146   benchmarks which indicate that parallelization helps with the speed of
147   compactions.
148 
149   Ideally of course we'll use heuristic technique to determine the number of
150   CPU-s and derive the values from there. This however has its own set of
151   problems and we'll choose simplicity for now.
152 */
153 #define MAX_BACKGROUND_COMPACTIONS 64
154 #define MAX_BACKGROUND_FLUSHES 64
155 
156 #define DEFAULT_SUBCOMPACTIONS 1
157 #define MAX_SUBCOMPACTIONS 64
158 
159 /*
160   Defines the field sizes for serializing XID object to a string representation.
161   string byte format: [field_size: field_value, ...]
162   [
163     8: XID.formatID,
164     1: XID.gtrid_length,
165     1: XID.bqual_length,
166     XID.gtrid_length + XID.bqual_length: XID.data
167   ]
168 */
169 #define RDB_FORMATID_SZ 8
170 #define RDB_GTRID_SZ 1
171 #define RDB_BQUAL_SZ 1
172 #define RDB_XIDHDR_LEN (RDB_FORMATID_SZ + RDB_GTRID_SZ + RDB_BQUAL_SZ)
173 
174 /*
175   To fix an unhandled exception we specify the upper bound as LONGLONGMAX
176   instead of ULONGLONGMAX because the latter is -1 and causes an exception when
177   cast to jlong (signed) of JNI
178 
179   The reason behind the cast issue is the lack of unsigned int support in Java.
180 */
181 #define MAX_RATE_LIMITER_BYTES_PER_SEC static_cast<uint64_t>(LONGLONG_MAX)
182 
183 /*
184   Hidden PK column (for tables with no primary key) is a longlong (aka 8 bytes).
185   static_assert() in code will validate this assumption.
186 */
187 #define ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN sizeof(longlong)
188 
189 /*
190   MyRocks specific error codes. NB! Please make sure that you will update
191   HA_ERR_ROCKSDB_LAST when adding new ones.
192 */
193 #define HA_ERR_ROCKSDB_UNIQUE_NOT_SUPPORTED (HA_ERR_LAST + 1)
194 #define HA_ERR_ROCKSDB_PK_REQUIRED (HA_ERR_LAST + 2)
195 #define HA_ERR_ROCKSDB_TOO_MANY_LOCKS (HA_ERR_LAST + 3)
196 #define HA_ERR_ROCKSDB_LAST HA_ERR_ROCKSDB_TOO_MANY_LOCKS
197 
looks_like_per_index_cf_typo(const char * const name)198 inline bool looks_like_per_index_cf_typo(const char *const name) {
199   return (name && name[0] == '$' && strcmp(name, PER_INDEX_CF_NAME));
200 }
201 
202 /**
203   @brief
204   Rdb_table_handler is a reference-counted structure storing information for
205   each open table. All the objects are stored in a global hash map.
206 
207   //TODO: join this with Rdb_tbl_def ?
208 */
209 struct Rdb_table_handler {
210   char *m_table_name;
211   uint m_table_name_length;
212   int m_ref_count;
213 
214   my_core::THR_LOCK m_thr_lock; ///< MySQL latch needed by m_db_lock
215 
216   /* Stores cumulative table statistics */
217   my_io_perf_atomic_t m_io_perf_read;
218   Rdb_atomic_perf_counters m_table_perf_context;
219 };
220 
221 class Rdb_key_def;
222 class Rdb_tbl_def;
223 class Rdb_transaction;
224 class Rdb_transaction_impl;
225 class Rdb_writebatch_impl;
226 class Rdb_field_encoder;
227 
228 const char *const rocksdb_hton_name = "ROCKSDB";
229 
typedef struct _gl_index_id_s {
  uint32_t cf_id;
  uint32_t index_id;

  /* Two ids are equal only when both components match. */
  bool operator==(const struct _gl_index_id_s &other) const {
    return index_id == other.index_id && cf_id == other.cf_id;
  }
  bool operator!=(const struct _gl_index_id_s &other) const {
    return !(*this == other);
  }

  /*
    Ordering is lexicographic on (cf_id, index_id): cf_id decides first and
    index_id breaks ties. The remaining relations are derived from operator<.
  */
  bool operator<(const struct _gl_index_id_s &other) const {
    if (cf_id != other.cf_id) {
      return cf_id < other.cf_id;
    }
    return index_id < other.index_id;
  }
  bool operator<=(const struct _gl_index_id_s &other) const {
    return !(other < *this);
  }
  bool operator>(const struct _gl_index_id_s &other) const {
    return other < *this;
  }
  bool operator>=(const struct _gl_index_id_s &other) const {
    return !(*this < other);
  }
} GL_INDEX_ID;
256 
enum operation_type {
  ROWS_DELETED = 0, /* 0 */
  ROWS_INSERTED,    /* 1 */
  ROWS_READ,        /* 2 */
  ROWS_UPDATED,     /* 3 */
  ROWS_MAX          /* count of the values above; keep this one last */
};
264 
265 #if defined(HAVE_SCHED_GETCPU)
266 #define RDB_INDEXER get_sched_indexer_t
267 #else
268 #define RDB_INDEXER thread_id_indexer_t
269 #endif
270 
271 /* Global statistics struct used inside MyRocks */
272 struct st_global_stats {
273   ib_counter_t<ulonglong, 64, RDB_INDEXER> rows[ROWS_MAX];
274 
275   // system_rows_ stats are only for system
276   // tables. They are not counted in rows_* stats.
277   ib_counter_t<ulonglong, 64, RDB_INDEXER> system_rows[ROWS_MAX];
278 };
279 
280 /* Struct used for exporting status to MySQL */
281 struct st_export_stats {
282   ulonglong rows_deleted;
283   ulonglong rows_inserted;
284   ulonglong rows_read;
285   ulonglong rows_updated;
286 
287   ulonglong system_rows_deleted;
288   ulonglong system_rows_inserted;
289   ulonglong system_rows_read;
290   ulonglong system_rows_updated;
291 };
292 
293 } // namespace myrocks
294 
295 #include "./rdb_buff.h"
296 
297 /* Provide hash function for GL_INDEX_ID so we can include it in sets */
298 namespace std {
299 template <> struct hash<myrocks::GL_INDEX_ID> {
300   std::size_t operator()(const myrocks::GL_INDEX_ID &gl_index_id) const {
301     const uint64_t val =
302         ((uint64_t)gl_index_id.cf_id << 32 | (uint64_t)gl_index_id.index_id);
303     return std::hash<uint64_t>()(val);
304   }
305 };
306 } // namespace std
307 
308 namespace myrocks {
309 
310 /**
311   @brief
312   Class definition for ROCKSDB storage engine plugin handler
313 */
314 
315 class ha_rocksdb : public my_core::handler {
316   my_core::THR_LOCK_DATA m_db_lock; ///< MySQL database lock
317 
318   Rdb_table_handler *m_table_handler; ///< Open table handler
319 
320   /* Iterator used for range scans and for full table/index scans */
321   rocksdb::Iterator *m_scan_it;
322 
323   /* Whether m_scan_it was created with skip_bloom=true */
324   bool m_scan_it_skips_bloom;
325 
326   const rocksdb::Snapshot *m_scan_it_snapshot;
327 
328   Rdb_tbl_def *m_tbl_def;
329 
330   /* Primary Key encoder from KeyTupleFormat to StorageFormat */
331   std::shared_ptr<Rdb_key_def> m_pk_descr;
332 
333   /* Array of index descriptors */
334   std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
335 
336   bool check_keyread_allowed(uint inx, uint part, bool all_parts) const;
337 
338   /*
339     Number of key parts in PK. This is the same as
340       table->key_info[table->s->primary_key].keyparts
341   */
342   uint m_pk_key_parts;
343 
344   /*
345     TRUE <=> Primary Key columns can be decoded from the index
346   */
347   mutable bool m_pk_can_be_decoded;
348 
349   /*
350    TRUE <=> Some fields in the PK may require unpack_info.
351   */
352   bool m_maybe_unpack_info;
353 
354   uchar *m_pk_tuple;        /* Buffer for storing PK in KeyTupleFormat */
355   uchar *m_pk_packed_tuple; /* Buffer for storing PK in StorageFormat */
356   // ^^ todo: change it to 'char*'? TODO: ^ can we join this with last_rowkey?
357 
358   /*
359     Temporary buffers for storing the key part of the Key/Value pair
360     for secondary indexes.
361   */
362   uchar *m_sk_packed_tuple;
363 
364   /*
365     Temporary buffers for storing end key part of the Key/Value pair.
366     This is used for range scan only.
367   */
368   uchar *m_end_key_packed_tuple;
369 
370   Rdb_string_writer m_sk_tails;
371   Rdb_string_writer m_pk_unpack_info;
372 
  /*
    ha_rocksdb->index_read_map(.. HA_READ_KEY_EXACT or similar) will save the
    mem-comparable form of the index lookup tuple here.
  */
377   uchar *m_sk_match_prefix;
378   uint m_sk_match_length;
379 
380   /* Buffer space for the above */
381   uchar *m_sk_match_prefix_buf;
382 
383   /* Second buffers, used by UPDATE. */
384   uchar *m_sk_packed_tuple_old;
385   Rdb_string_writer m_sk_tails_old;
386 
387   /* Buffers used for duplicate checking during unique_index_creation */
388   uchar *m_dup_sk_packed_tuple;
389   uchar *m_dup_sk_packed_tuple_old;
390 
391   /*
392     Temporary space for packing VARCHARs (we provide it to
393     pack_record()/pack_index_tuple() calls).
394   */
395   uchar *m_pack_buffer;
396 
397   /* rowkey of the last record we've read, in StorageFormat. */
398   String m_last_rowkey;
399 
400   /* Buffer used by convert_record_to_storage_format() */
401   String m_storage_record;
402 
403   /*
404     Last retrieved record, in table->record[0] data format.
405 
406     This is used only when we get the record with rocksdb's Get() call (The
407     other option is when we get a rocksdb::Slice from an iterator)
408   */
409   std::string m_retrieved_record;
410 
411   /* Type of locking to apply to rows */
412   enum { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE } m_lock_rows;
413 
414   /* TRUE means we're doing an index-only read. FALSE means otherwise. */
415   bool m_keyread_only;
416 
417   bool m_skip_scan_it_next_call;
418 
419   /* TRUE means we are accessing the first row after a snapshot was created */
420   bool m_rnd_scan_is_new_snapshot;
421 
422   /* TRUE means the replication slave will use Read Free Replication */
423   bool m_use_read_free_rpl;
424 
  /**
    @brief
    This is a bitmap of indexes (i.e. a set) whose keys (in future, values) may
    be changed by this statement. Indexes that are not in the bitmap do not need
    to be updated.
    @note Valid inside UPDATE statements, if and only if
    m_update_scope_is_valid == true.
  */
432   my_core::key_map m_update_scope;
433   bool m_update_scope_is_valid;
434 
435   /* SST information used for bulk loading the primary key */
436   std::shared_ptr<Rdb_sst_info> m_sst_info;
437   Rdb_transaction *m_bulk_load_tx;
438   /* Mutex to protect finalizing bulk load */
439   mysql_mutex_t m_bulk_load_mutex;
440 
441   /*
442     MySQL index number for duplicate key error
443   */
444   int m_dupp_errkey;
445 
446   int create_key_defs(const TABLE *const table_arg,
447                       Rdb_tbl_def *const tbl_def_arg,
448                       const TABLE *const old_table_arg = nullptr,
449                       const Rdb_tbl_def *const old_tbl_def_arg = nullptr) const
450       __attribute__((__nonnull__(2, 3), __warn_unused_result__));
451   int secondary_index_read(const int keyno, uchar *const buf)
452       __attribute__((__nonnull__, __warn_unused_result__));
453   void setup_iterator_for_rnd_scan();
454   void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *const slice)
455       __attribute__((__nonnull__)) {
456     setup_scan_iterator(kd, slice, false, false, 0);
457   }
458   bool is_ascending(const Rdb_key_def &keydef,
459                     enum ha_rkey_function find_flag) const
460       __attribute__((__nonnull__, __warn_unused_result__));
461   void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice,
462                            const bool use_all_keys, const bool is_ascending,
463                            const uint eq_cond_len) __attribute__((__nonnull__));
464   void release_scan_iterator(void);
465 
466   rocksdb::Status
467   get_for_update(Rdb_transaction *const tx,
468                  rocksdb::ColumnFamilyHandle *const column_family,
469                  const rocksdb::Slice &key, std::string *const value) const;
470 
471   int get_row_by_rowid(uchar *const buf, const char *const rowid,
472                        const uint rowid_size)
473       __attribute__((__nonnull__, __warn_unused_result__));
474   int get_row_by_rowid(uchar *const buf, const uchar *const rowid,
475                        const uint rowid_size)
476       __attribute__((__nonnull__, __warn_unused_result__)) {
477     return get_row_by_rowid(buf, reinterpret_cast<const char *>(rowid),
478                             rowid_size);
479   }
480 
481   void update_auto_incr_val();
482   void load_auto_incr_value();
483   longlong update_hidden_pk_val();
484   int load_hidden_pk_value() __attribute__((__warn_unused_result__));
485   int read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id)
486       __attribute__((__nonnull__, __warn_unused_result__));
487   bool can_use_single_delete(const uint &index) const
488       __attribute__((__warn_unused_result__));
489   bool skip_unique_check() const __attribute__((__warn_unused_result__));
490   bool commit_in_the_middle() __attribute__((__warn_unused_result__));
491   bool do_bulk_commit(Rdb_transaction *const tx)
492       __attribute__((__nonnull__, __warn_unused_result__));
493   bool has_hidden_pk(const TABLE *const table) const
494       __attribute__((__nonnull__, __warn_unused_result__));
495 
496   void update_row_stats(const operation_type &type);
497 
498   void set_last_rowkey(const uchar *const old_data);
499 
500   /*
501     Array of table->s->fields elements telling how to store fields in the
502     record.
503   */
504   Rdb_field_encoder *m_encoder_arr;
505 
506   /* Describes instructions on how to decode the field */
507   class READ_FIELD {
508   public:
509     /* Points to Rdb_field_encoder describing the field */
510     Rdb_field_encoder *m_field_enc;
511     /* if true, decode the field, otherwise skip it */
512     bool m_decode;
513     /* Skip this many bytes before reading (or skipping) this field */
514     int m_skip;
515   };
516 
517   /*
518     This tells which table fields should be decoded (or skipped) when
519     decoding table row from (pk, encoded_row) pair. (Secondary keys are
520     just always decoded in full currently)
521   */
522   std::vector<READ_FIELD> m_decoders_vect;
523 
524   /* Setup field_decoders based on type of scan and table->read_set */
525   void setup_read_decoders();
526 
527   /*
528     Number of bytes in on-disk (storage) record format that are used for
529     storing SQL NULL flags.
530   */
531   uint m_null_bytes_in_rec;
532 
533   void get_storage_type(Rdb_field_encoder *const encoder, const uint &kp);
534   void setup_field_converters();
535   int alloc_key_buffers(const TABLE *const table_arg,
536                         const Rdb_tbl_def *const tbl_def_arg,
537                         bool alloc_alter_buffers = false)
538       __attribute__((__nonnull__, __warn_unused_result__));
539   void free_key_buffers();
540 
541   // the buffer size should be at least 2*Rdb_key_def::INDEX_NUMBER_SIZE
542   rocksdb::Range get_range(const int &i, uchar buf[]) const;
543 
544   /*
545     Perf timers for data reads
546   */
547   Rdb_io_perf m_io_perf;
548 
549   /*
550     A counter of how many row checksums were checked for this table. Note that
551     this does not include checksums for secondary index entries.
552   */
553   my_core::ha_rows m_row_checksums_checked;
554 
555   /*
556     Update stats
557   */
558   void update_stats(void);
559 
560 public:
  /*
    Controls whether writes include checksums. This is updated from the session
    variable at the start of each query.
  */
566   bool m_store_row_debug_checksums;
567 
568   /* Same as above but for verifying checksums when reading */
569   bool m_verify_row_debug_checksums;
570   int m_checksums_pct;
571 
572   ha_rocksdb(my_core::handlerton *const hton,
573              my_core::TABLE_SHARE *const table_arg);
574   ~ha_rocksdb() {
575     int err __attribute__((__unused__));
576     err = finalize_bulk_load();
577     DBUG_ASSERT(err == 0);
578     mysql_mutex_destroy(&m_bulk_load_mutex);
579   }
580 
581   /** @brief
582     The name that will be used for display purposes.
583    */
584   const char *table_type() const override {
585     DBUG_ENTER_FUNC();
586 
587     DBUG_RETURN(rocksdb_hton_name);
588   }
589 
590   /* The following is only used by SHOW KEYS: */
591   const char *index_type(uint inx) override {
592     DBUG_ENTER_FUNC();
593 
594     DBUG_RETURN("LSMTREE");
595   }
596 
597   /** @brief
598     The file extensions.
599    */
600   const char **bas_ext() const override;
601 
602   /*
603     See if this is the same base table - this should only be true for different
604     partitions of the same table.
605   */
606   bool same_table(const ha_rocksdb &other) const;
607 
608   /** @brief
609     This is a list of flags that indicate what functionality the storage engine
610     implements. The current table flags are documented in handler.h
611   */
612   ulonglong table_flags() const override {
613     DBUG_ENTER_FUNC();
614 
615     /*
616       HA_BINLOG_STMT_CAPABLE
617         We are saying that this engine is just statement capable to have
618         an engine that can only handle statement-based logging. This is
619         used in testing.
620       HA_REC_NOT_IN_SEQ
621         If we don't set it, filesort crashes, because it assumes rowids are
622         1..8 byte numbers
623     */
624     DBUG_RETURN(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
625                 HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
626                 (m_pk_can_be_decoded ? HA_PRIMARY_KEY_IN_READ_INDEX : 0) |
627                 HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
628                 HA_PARTIAL_COLUMN_READ);
629   }
630 
631   bool init_with_fields() override;
632 
633   /** @brief
634     This is a bitmap of flags that indicates how the storage engine
635     implements indexes. The current index flags are documented in
636     handler.h. If you do not implement indexes, just return zero here.
637 
638     @details
639     part is the key part to check. First key part is 0.
640     If all_parts is set, MySQL wants to know the flags for the combined
641     index, up to and including 'part'.
642   */
643   ulong index_flags(uint inx, uint part, bool all_parts) const override;
644 
645   const key_map *keys_to_use_for_scanning() override {
646     DBUG_ENTER_FUNC();
647 
648     DBUG_RETURN(&key_map_full);
649   }
650 
651   bool primary_key_is_clustered() override {
652     DBUG_ENTER_FUNC();
653 
654     DBUG_RETURN(true);
655   }
656 
657   bool should_store_row_debug_checksums() const {
658     return m_store_row_debug_checksums && (rand() % 100 < m_checksums_pct);
659   }
660 
661   int rename_table(const char *const from, const char *const to) override
662       __attribute__((__nonnull__, __warn_unused_result__));
663 
664   int convert_record_from_storage_format(const rocksdb::Slice *const key,
665                                          const rocksdb::Slice *const value,
666                                          uchar *const buf)
667       __attribute__((__nonnull__, __warn_unused_result__));
668 
669   int convert_record_from_storage_format(const rocksdb::Slice *const key,
670                                          uchar *const buf)
671       __attribute__((__nonnull__, __warn_unused_result__));
672 
673   void convert_record_to_storage_format(const rocksdb::Slice &pk_packed_slice,
674                                         Rdb_string_writer *const pk_unpack_info,
675                                         rocksdb::Slice *const packed_rec)
676       __attribute__((__nonnull__));
677 
678   static const char *get_key_name(const uint index,
679                                   const TABLE *const table_arg,
680                                   const Rdb_tbl_def *const tbl_def_arg)
681       __attribute__((__nonnull__, __warn_unused_result__));
682 
683   static const char *get_key_comment(const uint index,
684                                      const TABLE *const table_arg,
685                                      const Rdb_tbl_def *const tbl_def_arg)
686       __attribute__((__nonnull__, __warn_unused_result__));
687 
688   static bool is_hidden_pk(const uint index, const TABLE *const table_arg,
689                            const Rdb_tbl_def *const tbl_def_arg)
690       __attribute__((__nonnull__, __warn_unused_result__));
691 
692   static uint pk_index(const TABLE *const table_arg,
693                        const Rdb_tbl_def *const tbl_def_arg)
694       __attribute__((__nonnull__, __warn_unused_result__));
695 
696   static bool is_pk(const uint index, const TABLE *table_arg,
697                     const Rdb_tbl_def *tbl_def_arg)
698       __attribute__((__nonnull__, __warn_unused_result__));
699 
  /** @brief
    unireg.cc will call max_supported_record_length(), max_supported_keys(),
    max_supported_key_parts(), and max_supported_key_length()
    to make sure that the storage engine can handle the data it is about to
    send. Return *real* limits of your storage engine here; MySQL will do
    min(your_limits, MySQL_limits) automatically.
   */
707   uint max_supported_record_length() const override {
708     DBUG_ENTER_FUNC();
709 
710     DBUG_RETURN(HA_MAX_REC_LENGTH);
711   }
712 
713   uint max_supported_keys() const override {
714     DBUG_ENTER_FUNC();
715 
716     DBUG_RETURN(MAX_INDEXES);
717   }
718 
719   uint max_supported_key_parts() const override {
720     DBUG_ENTER_FUNC();
721 
722     DBUG_RETURN(MAX_REF_PARTS);
723   }
724 
725   uint max_supported_key_part_length() const override {
726     DBUG_ENTER_FUNC();
727 
728     DBUG_RETURN(2048);
729   }
730 
731   /** @brief
732     unireg.cc will call this to make sure that the storage engine can handle
733     the data it is about to send. Return *real* limits of your storage engine
734     here; MySQL will do min(your_limits, MySQL_limits) automatically.
735 
736       @details
737     There is no need to implement ..._key_... methods if your engine doesn't
738     support indexes.
739    */
740   uint max_supported_key_length() const override {
741     DBUG_ENTER_FUNC();
742 
743     DBUG_RETURN(16 * 1024); /* just to return something*/
744   }
745 
746   /**
747     TODO: return actual upper bound of number of records in the table.
748     (e.g. save number of records seen on full table scan and/or use file size
749     as upper bound)
750   */
751   ha_rows estimate_rows_upper_bound() override {
752     DBUG_ENTER_FUNC();
753 
754     DBUG_RETURN(HA_POS_ERROR);
755   }
756 
757   /* At the moment, we're ok with default handler::index_init() implementation.
758    */
759   int index_read_map(uchar *const buf, const uchar *const key,
760                      key_part_map keypart_map,
761                      enum ha_rkey_function find_flag) override
762       __attribute__((__warn_unused_result__));
763 
764   int index_read_map_impl(uchar *const buf, const uchar *const key,
765                           key_part_map keypart_map,
766                           enum ha_rkey_function find_flag,
767                           const key_range *end_key)
768       __attribute__((__warn_unused_result__));
769 
770   int index_read_last_map(uchar *const buf, const uchar *const key,
771                           key_part_map keypart_map) override
772       __attribute__((__warn_unused_result__));
773 
774   int read_range_first(const key_range *const start_key,
775                        const key_range *const end_key, bool eq_range,
776                        bool sorted) override
777       __attribute__((__warn_unused_result__));
778 
779   virtual double scan_time() override {
780     DBUG_ENTER_FUNC();
781 
782     DBUG_RETURN(
783         static_cast<double>((stats.records + stats.deleted) / 20.0 + 10));
784   }
785 
786   virtual double read_time(uint, uint, ha_rows rows) override;
787 
788   int open(const char *const name, int mode, uint test_if_locked) override
789       __attribute__((__warn_unused_result__));
790   int close(void) override __attribute__((__warn_unused_result__));
791 
792   int write_row(uchar *const buf) override
793       __attribute__((__warn_unused_result__));
794   int update_row(const uchar *const old_data, uchar *const new_data) override
795       __attribute__((__warn_unused_result__));
796   int delete_row(const uchar *const buf) override
797       __attribute__((__warn_unused_result__));
798   rocksdb::Status delete_or_singledelete(uint index, Rdb_transaction *const tx,
799                                          rocksdb::ColumnFamilyHandle *const cf,
800                                          const rocksdb::Slice &key)
801       __attribute__((__warn_unused_result__));
802 
803   int index_next(uchar *const buf) override
804       __attribute__((__warn_unused_result__));
805   int index_next_with_direction(uchar *const buf, bool move_forward)
806       __attribute__((__warn_unused_result__));
807   int index_prev(uchar *const buf) override
808       __attribute__((__warn_unused_result__));
809 
810   int index_first(uchar *const buf) override
811       __attribute__((__warn_unused_result__));
812   int index_last(uchar *const buf) override
813       __attribute__((__warn_unused_result__));
814 
815   class Item *idx_cond_push(uint keyno, class Item *const idx_cond) override;
816   /*
817     Default implementation from cancel_pushed_idx_cond() suits us
818   */
819 private:
820   struct key_def_cf_info {
821     rocksdb::ColumnFamilyHandle *cf_handle;
822     bool is_reverse_cf;
823     bool is_auto_cf;
824   };
825 
826   struct update_row_info {
827     Rdb_transaction *tx;
828     const uchar *new_data;
829     const uchar *old_data;
830     rocksdb::Slice new_pk_slice;
831     rocksdb::Slice old_pk_slice;
832 
833     // "unpack_info" data for the new PK value
834     Rdb_string_writer *new_pk_unpack_info;
835 
836     longlong hidden_pk_id;
837     bool skip_unique_check;
838   };
839 
840   /*
841     Used to check for duplicate entries during fast unique secondary index
842     creation.
843   */
844   struct unique_sk_buf_info {
845     bool sk_buf_switch = false;
846     rocksdb::Slice sk_memcmp_key;
847     rocksdb::Slice sk_memcmp_key_old;
848     uchar *dup_sk_buf;
849     uchar *dup_sk_buf_old;
850 
851     /*
852       This method is meant to be called back to back during inplace creation
853       of unique indexes.  It will switch between two buffers, which
854       will each store the memcmp form of secondary keys, which are then
855       converted to slices in sk_memcmp_key or sk_memcmp_key_old.
856 
857       Switching buffers on each iteration allows us to retain the
858       sk_memcmp_key_old value for duplicate comparison.
859     */
860     inline uchar *swap_and_get_sk_buf() {
861       sk_buf_switch = !sk_buf_switch;
862       return sk_buf_switch ? dup_sk_buf : dup_sk_buf_old;
863     }
864   };
865 
  /*
    Declaration only; the body is not in this header.

    Takes the table, its Rdb_tbl_def and a pointer to an array of
    MAX_INDEXES + 1 key_def_cf_info entries. The array is passed through a
    pointer to non-const, so it is presumably filled in by the call
    (NOTE(review): confirm against the definition).

    @return int status; __warn_unused_result__ flags callers that drop it.
  */
  int create_cfs(const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
                 std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs)
      const __attribute__((__nonnull__, __warn_unused_result__));

  /*
    Declaration only. i is taken by const reference, cf_info by const
    reference, and new_key_def is a pointer to a non-const shared_ptr,
    presumably the output slot for the created key definition
    (NOTE(review): confirm against the definition).

    @return int status; must not be ignored (__warn_unused_result__).
  */
  int create_key_def(const TABLE *const table_arg, const uint &i,
                     const Rdb_tbl_def *const tbl_def_arg,
                     std::shared_ptr<Rdb_key_def> *const new_key_def,
                     const struct key_def_cf_info &cf_info) const
      __attribute__((__nonnull__, __warn_unused_result__));

  /*
    Declaration only. Receives both the new and the old TABLE / Rdb_tbl_def
    pair plus a read-only array of MAX_INDEXES + 1 key_def_cf_info entries.

    NOTE(review): the second parameter is spelled vtbl_def_arg and is the only
    pointer here that is not itself const-qualified; the sibling declarations
    use "Rdb_tbl_def *const tbl_def_arg". Parameter names in a declaration do
    not affect callers, but the spelling looks unintended.

    @return int status; must not be ignored (__warn_unused_result__).
  */
  int create_inplace_key_defs(
      const TABLE *const table_arg, Rdb_tbl_def *vtbl_def_arg,
      const TABLE *const old_table_arg,
      const Rdb_tbl_def *const old_tbl_def_arg,
      const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const
      __attribute__((__nonnull__, __warn_unused_result__));

  /*
    Declaration only. Returns, by value, a map from std::string to uint built
    from the new and old TABLE / Rdb_tbl_def pair. Judging by the name the
    keys are key names and the values their positions in the old table;
    NOTE(review): confirm against the definition.

    Only __nonnull__ is attached, so ignoring the result is not diagnosed.
  */
  std::unordered_map<std::string, uint>
  get_old_key_positions(const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg,
                        const TABLE *old_table_arg,
                        const Rdb_tbl_def *old_tbl_def_arg) const
      __attribute__((__nonnull__));
888 
889   int compare_key_parts(const KEY *const old_key,
890                         const KEY *const new_key) const;
891   __attribute__((__nonnull__, __warn_unused_result__));
892 
  /*
    Declarations only. Both take a non-null record buffer and return an int
    status that must be checked (__warn_unused_result__). The names suggest
    they position on the first / last index entry; NOTE(review): confirm
    against the definitions.
  */
  int index_first_intern(uchar *buf)
      __attribute__((__nonnull__, __warn_unused_result__));
  int index_last_intern(uchar *buf)
      __attribute__((__nonnull__, __warn_unused_result__));

  /* Declaration only: const, argument-less query returning an icp_result. */
  enum icp_result check_index_cond() const;
  /*
    Declaration only. move_forward is passed by const reference and buf must
    be non-null; the int status must be checked (__warn_unused_result__).
  */
  int find_icp_matching_index_rec(const bool &move_forward, uchar *const buf)
      __attribute__((__nonnull__, __warn_unused_result__));
901 
  /*
    Helpers of the row write / update path. Only declarations are visible in
    this header. Unless noted otherwise each returns an int status that
    callers must check (__warn_unused_result__).
  */
  void calc_updated_indexes();
  int update_write_row(const uchar *const old_data, const uchar *const new_data,
                       const bool skip_unique_check)
      __attribute__((__warn_unused_result__));
  /*
    NOTE(review): unlike the neighbouring int-returning helpers this one has
    no __warn_unused_result__, so a dropped result is not diagnosed.
  */
  int get_pk_for_update(struct update_row_info *const row_info);
  /*
    found and pk_changed point to non-const bools, presumably outputs
    (NOTE(review): confirm against the definition).
  */
  int check_and_lock_unique_pk(const uint &key_id,
                               const struct update_row_info &row_info,
                               bool *const found, bool *const pk_changed)
      __attribute__((__warn_unused_result__));
  /* Const member, in contrast to check_and_lock_unique_pk() above. */
  int check_and_lock_sk(const uint &key_id,
                        const struct update_row_info &row_info,
                        bool *const found) const
      __attribute__((__warn_unused_result__));
  int check_uniqueness_and_lock(const struct update_row_info &row_info,
                                bool *const pk_changed)
      __attribute__((__warn_unused_result__));
  /*
    Returns bool rather than an int status; err points to a non-const int,
    presumably an error-code output (NOTE(review): confirm).
  */
  bool over_bulk_load_threshold(int *err)
      __attribute__((__warn_unused_result__));
  /* All pointer arguments must be non-null (__nonnull__). */
  int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &index,
                         const rocksdb::Slice *key,
                         struct unique_sk_buf_info *sk_info)
      __attribute__((__nonnull__, __warn_unused_result__));
  int bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
                    const rocksdb::Slice &key, const rocksdb::Slice &value)
      __attribute__((__nonnull__, __warn_unused_result__));
  int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info,
                const bool &pk_changed) __attribute__((__warn_unused_result__));
  int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
                const struct update_row_info &row_info)
      __attribute__((__warn_unused_result__));
  int update_indexes(const struct update_row_info &row_info,
                     const bool &pk_changed)
      __attribute__((__warn_unused_result__));
935 
  /*
    Key lookup helpers; declarations only. Each takes the key definition and
    the key as a rocksdb::Slice, and returns an int status that must be
    checked (__warn_unused_result__).
  */

  /* Const member; works on the iterator passed in, which must be non-null. */
  int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter,
                     const bool &using_full_key,
                     const rocksdb::Slice &key_slice) const
      __attribute__((__nonnull__, __warn_unused_result__));
  /* Non-const counterparts that take no iterator argument. */
  int read_before_key(const Rdb_key_def &kd, const bool &using_full_key,
                      const rocksdb::Slice &key_slice)
      __attribute__((__nonnull__, __warn_unused_result__));
  int read_after_key(const Rdb_key_def &kd, const bool &using_full_key,
                     const rocksdb::Slice &key_slice)
      __attribute__((__nonnull__, __warn_unused_result__));

  /*
    Declaration only. move_forward points to a non-const bool, presumably an
    output (NOTE(review): confirm against the definition). No __nonnull__ is
    attached here, only __warn_unused_result__.
  */
  int position_to_correct_key(
      const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
      const bool &full_key_match, const uchar *const key,
      const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
      bool *const move_forward) __attribute__((__warn_unused_result__));
952 
  /*
    Declarations only. Both take a non-null record buffer and return an int
    status that must be checked (__warn_unused_result__). The secondary-key
    variant additionally takes the key definition and a direction flag.
  */
  int read_row_from_primary_key(uchar *const buf)
      __attribute__((__nonnull__, __warn_unused_result__));
  int read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd,
                                  bool move_forward)
      __attribute__((__nonnull__, __warn_unused_result__));

  /*
    Declaration only. end_key_packed_size points to a non-const uint,
    presumably an output (NOTE(review): confirm against the definition).
    The int result must not be ignored (__warn_unused_result__).
  */
  int calc_eq_cond_len(const Rdb_key_def &kd,
                       const enum ha_rkey_function &find_flag,
                       const rocksdb::Slice &slice,
                       const int &bytes_changed_by_succ,
                       const key_range *const end_key,
                       uint *const end_key_packed_size)
      __attribute__((__warn_unused_result__));
966 
  /*
    Declaration only. Takes a non-null table name and returns an Rdb_tbl_def
    pointer that must not be ignored. The name implies a null result when the
    table does not exist; NOTE(review): confirm against the definition.
  */
  Rdb_tbl_def *get_table_if_exists(const char *const tablename)
      __attribute__((__nonnull__, __warn_unused_result__));
  /* Declaration only; thd must be non-null. */
  void read_thd_vars(THD *const thd) __attribute__((__nonnull__));
  /*
    Declaration only; returns a C string that must not be ignored.
    NOTE(review): the function has no explicit pointer parameter, so
    __nonnull__ appears to have nothing to apply to here.
  */
  const char *thd_rocksdb_tmpdir()
      __attribute__((__nonnull__, __warn_unused_result__));

  /* Declaration only; thd must be non-null, the bool must be checked. */
  bool contains_foreign_key(THD *const thd)
      __attribute__((__nonnull__, __warn_unused_result__));

  /*
    Declaration only. Takes the table and a read-only set of key definitions
    (shared pointers to Rdb_key_def) and returns an int status that must be
    checked (__warn_unused_result__).
  */
  int inplace_populate_sk(
      TABLE *const table_arg,
      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes)
      __attribute__((__nonnull__, __warn_unused_result__));
980 
981 public:
  /*
    Overrides of base-class virtuals (marked override); bodies are not in
    this header. The int results must be checked (__warn_unused_result__).
  */
  int index_init(uint idx, bool sorted) override
      __attribute__((__warn_unused_result__));
  int index_end() override __attribute__((__warn_unused_result__));

  /* Override returning nothing; declaration only. */
  void unlock_row() override;
987 
  /** @brief
    Unlike index_init(), rnd_init() can be called two consecutive times
    without rnd_end() in between (it only makes sense if scan=1). In this
    case, the second call should prepare for the new table scan (e.g. if
    rnd_init() allocates the cursor, the second call should position the
    cursor to the start of the table; there is no need to deallocate and
    allocate it again). This is a required method.

    Both functions return an int status that must be checked
    (__warn_unused_result__).
  */
  int rnd_init(bool scan) override __attribute__((__warn_unused_result__));
  int rnd_end() override __attribute__((__warn_unused_result__));
998 
  /*
    Declarations only. rnd_next() is an override; rnd_next_with_direction()
    is not marked override and adds a direction flag. Both return an int
    status that must be checked (__warn_unused_result__).
  */
  int rnd_next(uchar *const buf) override
      __attribute__((__warn_unused_result__));
  int rnd_next_with_direction(uchar *const buf, bool move_forward)
      __attribute__((__warn_unused_result__));

  /*
    Overrides; declarations only. position() and info() carry no
    __warn_unused_result__ (position() returns nothing).
  */
  int rnd_pos(uchar *const buf, uchar *const pos) override
      __attribute__((__warn_unused_result__));
  void position(const uchar *const record) override;
  int info(uint) override;
1008 
  /* This function will always return success, therefore no annotation related
   * to checking the return value. Can't change the signature because it's
   * required by the interface. */
  int extra(enum ha_extra_function operation) override;

  /*
    Overrides; declarations only. Each returns an int status that must be
    checked (__warn_unused_result__).
  */
  int start_stmt(THD *const thd, thr_lock_type lock_type) override
      __attribute__((__warn_unused_result__));
  int external_lock(THD *const thd, int lock_type) override
      __attribute__((__warn_unused_result__));
  int truncate() override __attribute__((__warn_unused_result__));
1019 
  /*
    Override defined inline: clears m_retrieved_record and always returns
    HA_EXIT_SUCCESS. The DBUG_ENTER_FUNC / DBUG_RETURN pair brackets the body.
  */
  int reset() override {
    DBUG_ENTER_FUNC();

    /* Free blob data */
    m_retrieved_record.clear();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }
1028 
  /*
    Table-level operations; declarations only. All except remove_rows() are
    overrides whose result must be checked (__warn_unused_result__).
  */
  int check(THD *const thd, HA_CHECK_OPT *const check_opt) override
      __attribute__((__warn_unused_result__));
  /* Not an override; returns nothing. */
  void remove_rows(Rdb_tbl_def *const tbl);
  ha_rows records_in_range(uint inx, key_range *const min_key,
                           key_range *const max_key) override
      __attribute__((__warn_unused_result__));
  int delete_table(const char *const from) override
      __attribute__((__warn_unused_result__));
  int create(const char *const name, TABLE *const form,
             HA_CREATE_INFO *const create_info) override
      __attribute__((__warn_unused_result__));
  bool check_if_incompatible_data(HA_CREATE_INFO *const info,
                                  uint table_changes) override
      __attribute__((__warn_unused_result__));

  /*
    Override; declaration only. Takes and returns a THR_LOCK_DATA** and the
    result must not be ignored (__warn_unused_result__).
  */
  THR_LOCK_DATA **store_lock(THD *const thd, THR_LOCK_DATA **to,
                             enum thr_lock_type lock_type) override
      __attribute__((__warn_unused_result__));
1047 
  /*
    Override defined inline. None of the arguments are used; the function
    unconditionally returns FALSE.
  */
  my_bool register_query_cache_table(THD *const thd, char *const table_key,
                                     uint key_length,
                                     qc_engine_callback *const engine_callback,
                                     ulonglong *const engine_data) override {
    DBUG_ENTER_FUNC();

    /* Currently, we don't support query cache */
    DBUG_RETURN(FALSE);
  }
1057 
  /*
    Override; declaration only. buf must be non-null (__nonnull__); the bool
    result carries no __warn_unused_result__.
  */
  bool get_error_message(const int error, String *const buf) override
      __attribute__((__nonnull__));

  /*
    Override; declaration only. first_value and nb_reserved_values point to
    non-const ulonglong, presumably outputs (NOTE(review): confirm against
    the definition).
  */
  void get_auto_increment(ulonglong offset, ulonglong increment,
                          ulonglong nb_desired_values,
                          ulonglong *const first_value,
                          ulonglong *const nb_reserved_values) override;
  void update_create_info(HA_CREATE_INFO *const create_info) override;
  /*
    optimize() and analyze() are overrides; calculate_stats() is not and
    additionally takes the table. All three return an int status that must be
    checked (__warn_unused_result__).
  */
  int optimize(THD *const thd, HA_CHECK_OPT *const check_opt) override
      __attribute__((__warn_unused_result__));
  int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override
      __attribute__((__warn_unused_result__));
  int calculate_stats(const TABLE *const table_arg, THD *const thd,
                      HA_CHECK_OPT *const check_opt)
      __attribute__((__warn_unused_result__));
1073 
  /*
    In-place ALTER entry points; all four are overrides and only declared
    here. Each receives the altered TABLE and the Alter_inplace_info; none
    carries __warn_unused_result__.
  */
  enum_alter_inplace_result check_if_supported_inplace_alter(
      TABLE *altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool prepare_inplace_alter_table(
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  bool inplace_alter_table(
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;

  /* Takes an extra bool named commit. */
  bool
  commit_inplace_alter_table(TABLE *const altered_table,
                             my_core::Alter_inplace_info *const ha_alter_info,
                             bool commit) override;

  /* Not an override; the int status must be checked. */
  int finalize_bulk_load() __attribute__((__warn_unused_result__));

  /* Not an override; takes the whitelist as a C string. Declaration only. */
  void set_use_read_free_rpl(const char *const whitelist);
1094 
1095 public:
  /*
    Replication hooks; declarations only. The four before/after functions
    override base-class virtuals.
  */
  virtual void rpl_before_delete_rows() override;
  virtual void rpl_after_delete_rows() override;
  virtual void rpl_before_update_rows() override;
  virtual void rpl_after_update_rows() override;
  /*
    NOTE(review): declared virtual but, unlike the four hooks above, not
    marked override; confirm whether the base class declares it.
  */
  virtual bool use_read_free_rpl();
1101 
1102 private:
  /*
    Flags tracking whether we are inside a replication delete-rows or
    update-rows operation. They have no in-class initializer;
    NOTE(review): confirm the constructor initializes both.
  */
  bool m_in_rpl_delete_rows;
  bool m_in_rpl_update_rows;
1106 
1107 };
1108 
1109 /*
1110   Helper class for in-place alter, for storing handler context between inplace
1111   alter calls
1112 */
1113 struct Rdb_inplace_alter_ctx : public my_core::inplace_alter_handler_ctx {
1114   /* The new table definition */
1115   Rdb_tbl_def *const m_new_tdef;
1116 
1117   /* Stores the original key definitions */
1118   std::shared_ptr<Rdb_key_def> *const m_old_key_descr;
1119 
1120   /* Stores the new key definitions */
1121   std::shared_ptr<Rdb_key_def> *m_new_key_descr;
1122 
1123   /* Stores the old number of key definitions */
1124   const uint m_old_n_keys;
1125 
1126   /* Stores the new number of key definitions */
1127   const uint m_new_n_keys;
1128 
1129   /* Stores the added key glids */
1130   const std::unordered_set<std::shared_ptr<Rdb_key_def>> m_added_indexes;
1131 
1132   /* Stores the dropped key glids */
1133   const std::unordered_set<GL_INDEX_ID> m_dropped_index_ids;
1134 
1135   /* Stores number of keys to add */
1136   const uint m_n_added_keys;
1137 
1138   /* Stores number of keys to drop */
1139   const uint m_n_dropped_keys;
1140 
1141   Rdb_inplace_alter_ctx(
1142       Rdb_tbl_def *new_tdef, std::shared_ptr<Rdb_key_def> *old_key_descr,
1143       std::shared_ptr<Rdb_key_def> *new_key_descr, uint old_n_keys,
1144       uint new_n_keys,
1145       std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes,
1146       std::unordered_set<GL_INDEX_ID> dropped_index_ids, uint n_added_keys,
1147       uint n_dropped_keys)
1148       : my_core::inplace_alter_handler_ctx(), m_new_tdef(new_tdef),
1149         m_old_key_descr(old_key_descr), m_new_key_descr(new_key_descr),
1150         m_old_n_keys(old_n_keys), m_new_n_keys(new_n_keys),
1151         m_added_indexes(added_indexes), m_dropped_index_ids(dropped_index_ids),
1152         m_n_added_keys(n_added_keys), m_n_dropped_keys(n_dropped_keys) {}
1153 
1154   ~Rdb_inplace_alter_ctx() {}
1155 
1156 private:
1157   /* Disable Copying */
1158   Rdb_inplace_alter_ctx(const Rdb_inplace_alter_ctx &);
1159   Rdb_inplace_alter_ctx &operator=(const Rdb_inplace_alter_ctx &);
1160 };
1161 
1162 } // namespace myrocks
1163