1 /*
2    Copyright (c) 2012,2013 Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
16 #pragma once
17 
18 #ifdef USE_PRAGMA_INTERFACE
19 #pragma interface /* gcc class implementation */
20 #endif
21 
22 #define ROCKSDB_INCLUDE_RFR 1
23 
24 /* C++ standard header files */
25 #include <cinttypes>
26 #include <set>
27 #include <string>
28 #include <unordered_map>
29 #include <unordered_set>
30 #include <vector>
31 
32 /* MySQL header files */
33 #include "field.h"
34 #include "handler.h"   /* handler */
35 #include "my_global.h" /* ulonglong */
36 #include "my_icp.h"
37 #include "sql_bitmap.h"
38 #include "sql_string.h"
39 
40 /* RocksDB header files */
41 #include "rocksdb/cache.h"
42 #include "rocksdb/merge_operator.h"
43 #include "rocksdb/perf_context.h"
44 #include "rocksdb/sst_file_manager.h"
45 #include "rocksdb/statistics.h"
46 #include "rocksdb/utilities/options_util.h"
47 #include "rocksdb/utilities/transaction_db.h"
48 #include "rocksdb/utilities/write_batch_with_index.h"
49 
50 /* MyRocks header files */
51 #include "./rdb_buff.h"
52 #include "./rdb_comparator.h"
53 #include "./rdb_global.h"
54 #include "./rdb_index_merge.h"
55 #include "./rdb_perf_context.h"
56 #include "./rdb_sst_info.h"
57 #include "./rdb_utils.h"
58 
59 /**
60   @note MyRocks Coding Conventions:
61   MyRocks code follows the baseline MySQL coding conventions, available at
62   http://dev.mysql.com/doc/internals/en/coding-guidelines.html, with several
63   refinements (@see /storage/rocksdb/README file).
64 */
65 
66 /**
67   @note MyRocks Coding Conventions:
68   MyRocks code follows the baseline MySQL coding conventions, available at
69   http://dev.mysql.com/doc/internals/en/coding-guidelines.html, with several
70   refinements (@see /storage/rocksdb/README file).
71 */
72 
73 namespace myrocks {
74 
75 class Rdb_converter;
76 class Rdb_key_def;
77 class Rdb_tbl_def;
78 class Rdb_transaction;
79 class Rdb_transaction_impl;
80 class Rdb_writebatch_impl;
81 class Rdb_field_encoder;
82 
83 extern char *rocksdb_read_free_rpl_tables;
84 extern ulong rocksdb_max_row_locks;
85 #if defined(HAVE_PSI_INTERFACE)
86 extern PSI_rwlock_key key_rwlock_read_free_rpl_tables;
87 #endif
88 extern Regex rdb_read_free_regex_handler;
89 
90 /**
91   @brief
92   Rdb_table_handler is a reference-counted structure storing information for
93   each open table. All the objects are stored in a global hash map.
94 
95   TODO: consider joining this with Rdb_tbl_def.
96 */
97 struct Rdb_table_handler {
98   char *m_table_name;
99   uint m_table_name_length;  ///< presumably the length of m_table_name; confirm
100   int m_ref_count;  ///< Reference count (the structure is reference-counted)
101 
102   my_core::THR_LOCK m_thr_lock;  ///< MySQL latch needed by ha_rocksdb::m_db_lock
103 
104   /* Stores cumulative table statistics */
105   my_io_perf_atomic_t m_io_perf_read;
106   Rdb_atomic_perf_counters m_table_perf_context;
107 
108   /* Stores cached memtable estimate statistics */
109   std::atomic_uint m_mtcache_lock;  ///< NOTE(review): guards m_mtcache_*? confirm
110   uint64_t m_mtcache_count;
111   uint64_t m_mtcache_size;
112   uint64_t m_mtcache_last_update;
113 };
114 
115 }  // namespace myrocks
116 
117 /* std::hash specialization so GL_INDEX_ID can be stored in hashed sets */
118 namespace std {
119 template <>
120 struct hash<myrocks::GL_INDEX_ID> {
121   std::size_t operator()(const myrocks::GL_INDEX_ID &gl_index_id) const {
122     uint64_t packed = static_cast<uint64_t>(gl_index_id.cf_id) << 32;
123     packed |= static_cast<uint64_t>(gl_index_id.index_id);  // cf_id sits above
124     return std::hash<uint64_t>{}(packed);
125   }
126 };
127 }  // namespace std
128 
129 namespace myrocks {
130 enum table_cardinality_scan_type {  // values spelled out; same as implicit
131   SCAN_TYPE_NONE = 0,
132   SCAN_TYPE_MEMTABLE_ONLY = 1,
133   SCAN_TYPE_FULL_TABLE = 2,
134 };
135 
136 /**
137   @brief
138   Class definition for ROCKSDB storage engine plugin handler
139 */
140 
141 class ha_rocksdb : public my_core::handler {
142   my_core::THR_LOCK_DATA m_db_lock;  ///< MySQL database lock
143 
144   Rdb_table_handler *m_table_handler;  ///< Open table handler
145 
146   /* Iterator used for range scans and for full table/index scans */
147   rocksdb::Iterator *m_scan_it;
148 
149   /* Whether m_scan_it was created with skip_bloom=true */
150   bool m_scan_it_skips_bloom;
151 
152   const rocksdb::Snapshot *m_scan_it_snapshot;  // presumably m_scan_it's; confirm
153 
154   /* Buffers used for upper/lower bounds for m_scan_it. */
155   uchar *m_scan_it_lower_bound;
156   uchar *m_scan_it_upper_bound;
157   rocksdb::Slice m_scan_it_lower_bound_slice;
158   rocksdb::Slice m_scan_it_upper_bound_slice;
159 
160   Rdb_tbl_def *m_tbl_def;
161 
162   /* Primary Key encoder from KeyTupleFormat to StorageFormat */
163   std::shared_ptr<Rdb_key_def> m_pk_descr;
164 
165   /* Array of index descriptors (raw pointer to shared_ptr elements) */
166   std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
167 
168   static bool check_keyread_allowed(bool &pk_can_be_decoded,  // non-const ref
169                                     const TABLE_SHARE *table_share, uint inx,
170                                     uint part, bool all_parts);
171 
172   /*
173     Number of key parts in PK. This is the same as
174       table->key_info[table->s->primary_key].keyparts
175   */
176   uint m_pk_key_parts;
177 
178   /*
179     true <=> Primary Key columns can be decoded from the index. It should be
180     enabled by default and may be disabled in init_with_fields() after initial
181     keys info is loaded and it turns out the feature isn't supported for a
182     particular table. Declared mutable, so const member functions may set it.
183   */
184   mutable bool m_pk_can_be_decoded;
185 
186   uchar *m_pk_tuple;        /* Buffer for storing PK in KeyTupleFormat */
187   uchar *m_pk_packed_tuple; /* Buffer for storing PK in StorageFormat */
188   // TODO: change m_pk_packed_tuple to 'char*'? Join it with m_last_rowkey?
189 
190   /*
191     Temporary buffers for storing the key part of the Key/Value pair
192     for secondary indexes.
193   */
194   uchar *m_sk_packed_tuple;
195 
196   /*
197     Temporary buffers for storing the end key part of the Key/Value pair.
198     This is used for range scans only.
199   */
200   uchar *m_end_key_packed_tuple;
201 
202   Rdb_string_writer m_sk_tails;
203   Rdb_string_writer m_pk_unpack_info;
204 
205   /*
206     ha_rocksdb->index_read_map(.. HA_READ_KEY_EXACT or similar) will save here
207     the mem-comparable form of the index lookup tuple.
208   */
209   uchar *m_sk_match_prefix;
210   uint m_sk_match_length;
211 
212   /* Buffer space for the above */
213   uchar *m_sk_match_prefix_buf;
214 
215   /* Second buffers, used by UPDATE. */
216   uchar *m_sk_packed_tuple_old;
217   Rdb_string_writer m_sk_tails_old;
218 
219   /* Buffers used for duplicate checking during unique index creation */
220   uchar *m_dup_sk_packed_tuple;
221   uchar *m_dup_sk_packed_tuple_old;
222 
223   /*
224     Temporary space for packing VARCHARs (we provide it to
225     pack_record()/pack_index_tuple() calls).
226   */
227   uchar *m_pack_buffer;
228 
229   /* Class to convert between MySQL format and RocksDB format */
230   std::unique_ptr<Rdb_converter> m_converter;
231 
232   /*
233     Pointer to the original TTL timestamp value (8 bytes) during UPDATE.
234   */
235   char *m_ttl_bytes;
236   /*
237     The TTL timestamp value can change if the explicit TTL column is
238     updated. If we detect this when updating the PK, we indicate it here so
239     we know we must always update any SKs.
240   */
241   bool m_ttl_bytes_updated;
242 
243   /* Rowkey of the last record we've read, in StorageFormat. */
244   String m_last_rowkey;
245 
246   /*
247     Last retrieved record, in table->record[0] data format.
248 
249     This is used only when we get the record with rocksdb's Get() call (the
250     other option is when we get a rocksdb::Slice from an iterator).
251   */
252   rocksdb::PinnableSlice m_retrieved_record;
253 
254   /* Type of locking to apply to rows */
255   enum { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE } m_lock_rows;
256 
257   /* true means we're doing an index-only read. false means otherwise. */
258   bool m_keyread_only;
259 
260   bool m_skip_scan_it_next_call;  // NOTE(review): undocumented flag; confirm use
261 
262   /* true means we are accessing the first row after a snapshot was created */
263   bool m_rnd_scan_is_new_snapshot;
264 
265   /*
266     TRUE means INSERT ON DUPLICATE KEY UPDATE. In such a case we can optimize
267     by remembering the failed attempt (if there is one that violates the
268     uniqueness check) in write_row, so that the following index_read can skip
269     the lock check and the read entirely.
270    */
271   bool m_insert_with_update;
272 
273   /*
274     TRUE if last time the insertion failed due to duplicate key error.
275     (m_dupp_errkey holds the key# that we've had the error for)
276   */
277   bool m_dup_key_found;
278 
279 #ifndef NDEBUG
280   /*
281     Last retrieved record (for duplicate PK) or index tuple (for duplicate
282     unique SK). Used for sanity checking; only present when NDEBUG is unset.
283   */
284   String m_dup_key_retrieved_record;
285 #endif
286 
287   /**
288     @brief
289     This is a bitmap of indexes (i.e. a set) whose keys (in future, values) may
290     be changed by this statement. Indexes that are not in the bitmap do not need
291     to be updated.
292     @note Valid inside UPDATE statements, iff old_pk_slice is set.
293   */
294   my_core::key_map m_update_scope;
295 
296   /* SST information used for bulk loading the primary key */
297   std::shared_ptr<Rdb_sst_info> m_sst_info;
298 
299   /*
300     MySQL index number for duplicate key error (see m_dup_key_found)
301   */
302   uint m_dupp_errkey;
303 
304   int create_key_defs(const TABLE *const table_arg,
305                       Rdb_tbl_def *const tbl_def_arg,
306                       const TABLE *const old_table_arg = nullptr,
307                       const Rdb_tbl_def *const old_tbl_def_arg = nullptr) const
308       MY_ATTRIBUTE((__warn_unused_result__));  // old_* args default to nullptr
309   int secondary_index_read(const int keyno, uchar *const buf)
310       MY_ATTRIBUTE((__warn_unused_result__));
311   void setup_iterator_for_rnd_scan();
312   bool is_ascending(const Rdb_key_def &keydef,
313                     enum ha_rkey_function find_flag) const
314       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
315   static void setup_iterator_bounds(const Rdb_key_def &kd,
316                                     const rocksdb::Slice &eq_cond,
317                                     size_t bound_len, uchar *const lower_bound,
318                                     uchar *const upper_bound,
319                                     rocksdb::Slice *lower_bound_slice,
320                                     rocksdb::Slice *upper_bound_slice);
321   static bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
322                                    const rocksdb::Slice &eq_cond,
323                                    const bool use_all_keys);
324   void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice,
325                            const bool use_all_keys, const uint eq_cond_len);
326   void release_scan_iterator(void);  // presumably releases m_scan_it; confirm
327 
328   rocksdb::Status get_for_update(Rdb_transaction *const tx,
329                                  const Rdb_key_def &kd,
330                                  const rocksdb::Slice &key,
331                                  rocksdb::PinnableSlice *value) const;
332   /* Row lookup by rowid given as char*; the uchar* overload forwards here. */
333   int get_row_by_rowid(uchar *const buf, const char *const rowid,
334                        const uint rowid_size, const bool skip_ttl_check = true,
335                        const bool skip_lookup = false)
336       MY_ATTRIBUTE((__warn_unused_result__));
337   int get_row_by_rowid(uchar *const buf, const uchar *const rowid,
338                        const uint rowid_size, const bool skip_ttl_check = true,
339                        const bool skip_lookup = false)
340       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)) {
341     const char *const rid = reinterpret_cast<const char *>(rowid);  // same bytes
342     return get_row_by_rowid(buf, rid, rowid_size, skip_ttl_check, skip_lookup);
343   }
344 
345   void load_auto_incr_value();  // -- auto-increment helpers --
346   ulonglong load_auto_incr_value_from_index();
347   void update_auto_incr_val(ulonglong val);
348   void update_auto_incr_val_from_field();
349   rocksdb::Status get_datadic_auto_incr(Rdb_transaction *const tx,
350                                         const GL_INDEX_ID &gl_index_id,
351                                         ulonglong *new_val) const;
352   longlong update_hidden_pk_val();  // -- hidden primary key helpers --
353   int load_hidden_pk_value() MY_ATTRIBUTE((__warn_unused_result__));
354   int read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id)
355       MY_ATTRIBUTE((__warn_unused_result__));
356   bool can_use_single_delete(const uint index) const
357       MY_ATTRIBUTE((__warn_unused_result__));
358   bool is_blind_delete_enabled();
359   bool skip_unique_check() const MY_ATTRIBUTE((__warn_unused_result__));
360   bool commit_in_the_middle() MY_ATTRIBUTE((__warn_unused_result__));
361   bool do_bulk_commit(Rdb_transaction *const tx)
362       MY_ATTRIBUTE((__warn_unused_result__));
363   bool has_hidden_pk(const TABLE *const table) const
364       MY_ATTRIBUTE((__warn_unused_result__));
365 
366   void update_row_stats(const operation_type &type);
367 
368   void set_last_rowkey(const uchar *const old_data);
369 
370   int alloc_key_buffers(const TABLE *const table_arg,
371                         const Rdb_tbl_def *const tbl_def_arg,
372                         bool alloc_alter_buffers = false)
373       MY_ATTRIBUTE((__warn_unused_result__));
374   void free_key_buffers();
375 
376   // buf must hold at least 2*Rdb_key_def::INDEX_NUMBER_SIZE elements (uchar)
377   rocksdb::Range get_range(const int i, uchar buf[]) const;
378 
379   void records_in_range_internal(uint inx, key_range *const min_key,
380                                  key_range *const max_key, int64 disk_size,
381                                  int64 rows, ulonglong *total_size,
382                                  ulonglong *row_count);
383 
384   /*
385     Perf timers for data reads
386   */
387   Rdb_io_perf m_io_perf;
388 
389   /*
390     Update stats
391   */
392   void update_stats(void);
393 
394  public:
395   /*
396     Controls whether writes include checksums. This is updated from the session
397     variable at the start of each query. It is combined with m_checksums_pct in
398     should_store_row_debug_checksums().
399   */
400   bool m_store_row_debug_checksums;
401 
402   int m_checksums_pct;  ///< Threshold compared against rand() % 100
403 
404   ha_rocksdb(my_core::handlerton *const hton,
405              my_core::TABLE_SHARE *const table_arg);
406   virtual ~ha_rocksdb() override;
407 
408   /** @brief
409     Engine name used for display purposes (rocksdb_hton_name).
410    */
411   const char *table_type() const override {
412     DBUG_ENTER_FUNC();
413     const char *const display_name = rocksdb_hton_name;
414     DBUG_RETURN(display_name);
415   }
416 
417   /* Index type string; per the original note, only SHOW KEYS uses this. */
418   const char *index_type(uint inx) override {
419     DBUG_ENTER_FUNC();
420     const char *const type_name = "LSMTREE";  // same answer for every inx
421     DBUG_RETURN(type_name);
422   }
423 
424   /** @brief
425     The file extensions.
426    */
427   const char **bas_ext() const override;
428 
429   /*
430     Returns the table's base name
431   */
432   const std::string &get_table_basename() const;
433 
434   /** @brief
435     This is a list of flags that indicate what functionality the storage engine
436     implements. The current table flags are documented in handler.h.
437   */
438   Table_flags table_flags() const override;
439 
440   static Table_flags table_flags(const bool pk_can_be_decoded) {
441     DBUG_ENTER_FUNC();
442     /*
443       Flags advertised for every table.
444       HA_BINLOG_STMT_CAPABLE: per the original note, the engine is declared
445         statement capable so that statement-based logging can be exercised
446         in testing.
447       HA_REC_NOT_IN_SEQ: if it is not set, filesort crashes, because it
448         assumes rowids are 1..8 byte numbers.
449     */
450     Table_flags flags = HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
451                         HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
452                         HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
453                         HA_PARTIAL_COLUMN_READ | HA_ONLINE_ANALYZE;
454     /* HA_PRIMARY_KEY_IN_READ_INDEX is added only when the PK is decodable. */
455     if (pk_can_be_decoded) flags |= HA_PRIMARY_KEY_IN_READ_INDEX;
456     DBUG_RETURN(flags);
457   }
458 
459   bool init_with_fields() override;  // may disable m_pk_can_be_decoded
460 
461   static ulong index_flags(bool &pk_can_be_decoded,
462                            const TABLE_SHARE *table_share, uint inx, uint part,
463                            bool all_parts);
464 
465   /** @brief
466     This is a bitmap of flags that indicates how the storage engine
467     implements indexes. The current index flags are documented in
468     handler.h. If you do not implement indexes, just return zero here.
469 
470     @details
471     part is the key part to check. The first key part is 0.
472     If all_parts is set, MySQL wants to know the flags for the combined
473     index, up to and including 'part'.
474   */
475   ulong index_flags(uint inx, uint part, bool all_parts) const override;
476 
477   bool rpl_can_handle_stm_event() const override;
478 
479   const key_map *keys_to_use_for_scanning() override {
480     DBUG_ENTER_FUNC();
481     const key_map *const all_keys = &key_map_full;  // always the full key map
482     DBUG_RETURN(all_keys);
483   }
484 
485   bool primary_key_is_clustered() const override {
486     DBUG_ENTER_FUNC();
487     const bool clustered = true;  // unconditional answer
488     DBUG_RETURN(clustered);
489   }
490 
491   bool should_store_row_debug_checksums() const {  // rand() only if enabled
492     return m_store_row_debug_checksums ? rand() % 100 < m_checksums_pct : false;
493   }
494 
495   MY_NODISCARD
496   int rename_partitioned_table(const char *const from, const char *const to,
497                                const std::string &partition_string);
498 
499   MY_NODISCARD
500   int rename_non_partitioned_table(const char *const from,
501                                    const char *const to);
502   /* Handler-interface override; relation to the two helpers above: confirm. */
503   MY_NODISCARD
504   int rename_table(const char *const from, const char *const to) override;
505 
506   int convert_record_from_storage_format(const rocksdb::Slice *const key,
507                                          const rocksdb::Slice *const value,
508                                          uchar *const buf)
509       MY_ATTRIBUTE((__warn_unused_result__));
510   /* Overload without a value slice; all pointer arguments are __nonnull__. */
511   int convert_record_from_storage_format(const rocksdb::Slice *const key,
512                                          uchar *const buf)
513       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
514 
515   static const std::vector<std::string> parse_into_tokens(const std::string &s,
516                                                           const char delim);
517 
518   static const std::string generate_cf_name(
519       const uint index, const TABLE *const table_arg,
520       const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found);
521   /* The static helpers below take the table and its definition explicitly. */
522   static const char *get_key_name(const uint index,
523                                   const TABLE *const table_arg,
524                                   const Rdb_tbl_def *const tbl_def_arg)
525       MY_ATTRIBUTE((__warn_unused_result__));
526 
527   static const char *get_key_comment(const uint index,
528                                      const TABLE *const table_arg,
529                                      const Rdb_tbl_def *const tbl_def_arg)
530       MY_ATTRIBUTE((__warn_unused_result__));
531 
532   static const std::string get_table_comment(const TABLE *const table_arg)
533       MY_ATTRIBUTE((__warn_unused_result__));
534 
535   static bool is_hidden_pk(const uint index, const TABLE *const table_arg,
536                            const Rdb_tbl_def *const tbl_def_arg)
537       MY_ATTRIBUTE((__warn_unused_result__));
538 
539   static uint pk_index(const TABLE *const table_arg,
540                        const Rdb_tbl_def *const tbl_def_arg)
541       MY_ATTRIBUTE((__warn_unused_result__));
542 
543   static bool is_pk(const uint index, const TABLE *table_arg,
544                     const Rdb_tbl_def *tbl_def_arg)
545       MY_ATTRIBUTE((__warn_unused_result__));
546   /** @brief
547     Record-length limit reported to the server. unireg.cc queries
548     max_supported_record_length(), max_supported_keys(),
549     max_supported_key_parts() and max_supported_key_length() to check that the
550     engine can handle the data it is about to send. Report the engine's *real*
551     limits; MySQL applies min(your_limits, MySQL_limits) automatically.
552    */
553   uint max_supported_record_length() const override {
554     DBUG_ENTER_FUNC();
555     const uint record_length_limit = HA_MAX_REC_LENGTH;
556     DBUG_RETURN(record_length_limit);
557   }
558 
559   uint max_supported_keys() const override {
560     DBUG_ENTER_FUNC();
561     const uint index_limit = MAX_INDEXES;  // reported as-is
562     DBUG_RETURN(index_limit);
563   }
564 
565   uint max_supported_key_parts() const override {
566     DBUG_ENTER_FUNC();
567     const uint key_part_limit = MAX_REF_PARTS;  // reported as-is
568     DBUG_RETURN(key_part_limit);
569   }
570 
571   uint max_supported_key_part_length(HA_CREATE_INFO *) const override;  // out-of-line
572 
573   /** @brief
574     Key-length limit reported to the server. unireg.cc calls this to make sure
575     that the storage engine can handle the data it is about to send. Return
576     the engine's *real* limits here; MySQL will do
577     min(your_limits, MySQL_limits) automatically.
578 
579       @details
580     The ..._key_... methods are only needed by engines that support indexes.
581    */
582   uint max_supported_key_length() const override {
583     DBUG_ENTER_FUNC();
584     const uint key_length_limit = 16 * 1024; /* just to return something */
585     DBUG_RETURN(key_length_limit);
586   }
587 
588   /**
589     TODO: return an actual upper bound on the number of records in the table
590     (e.g. save the number of records seen on a full table scan and/or use the
591     file size as an upper bound). For now HA_POS_ERROR is always returned.
592   */
593   ha_rows estimate_rows_upper_bound() override {
594     DBUG_ENTER_FUNC();
595     const ha_rows no_estimate = HA_POS_ERROR;
596     DBUG_RETURN(no_estimate);
597   }
598 
599   /* At the moment, we're ok with the default handler::index_init()
600      implementation, so it is not overridden here. */
601   int index_read_map(uchar *const buf, const uchar *const key,
602                      key_part_map keypart_map,
603                      enum ha_rkey_function find_flag) override
604       MY_ATTRIBUTE((__warn_unused_result__));
605   /* Same leading arguments as index_read_map() plus an end_key range. */
606   int index_read_map_impl(uchar *const buf, const uchar *const key,
607                           key_part_map keypart_map,
608                           enum ha_rkey_function find_flag,
609                           const key_range *end_key)
610       MY_ATTRIBUTE((__warn_unused_result__));
611 
612   int index_read_last_map(uchar *const buf, const uchar *const key,
613                           key_part_map keypart_map) override
614       MY_ATTRIBUTE((__warn_unused_result__));
615 
616   int read_range_first(const key_range *const start_key,
617                        const key_range *const end_key, bool eq_range,
618                        bool sorted) override
619       MY_ATTRIBUTE((__warn_unused_result__));
620 
621   virtual double scan_time() override {
622     DBUG_ENTER_FUNC();
623     // (records + deleted) / 20 + 10, evaluated in floating point.
624     const double row_total = static_cast<double>(stats.records + stats.deleted);
625     DBUG_RETURN(row_total / 20.0 + 10);
626   }
627 
628   virtual double read_time(uint, uint, ha_rows rows) override;
629   virtual void print_error(int error, myf errflag) override;
630   /* Table open/close and row-modification overrides of the handler API. */
631   int open(const char *const name, int mode, uint test_if_locked) override
632       MY_ATTRIBUTE((__warn_unused_result__));
633   int close(void) override MY_ATTRIBUTE((__warn_unused_result__));
634 
635   int write_row(uchar *const buf) override
636       MY_ATTRIBUTE((__warn_unused_result__));
637   int update_row(const uchar *const old_data, uchar *const new_data) override
638       MY_ATTRIBUTE((__warn_unused_result__));
639   int delete_row(const uchar *const buf) override
640       MY_ATTRIBUTE((__warn_unused_result__));
641   void update_table_stats_if_needed();
642   rocksdb::Status delete_or_singledelete(uint index, Rdb_transaction *const tx,
643                                          rocksdb::ColumnFamilyHandle *const cf,
644                                          const rocksdb::Slice &key)
645       MY_ATTRIBUTE((__warn_unused_result__));
646 
647   int index_next(uchar *const buf) override
648       MY_ATTRIBUTE((__warn_unused_result__));
649   int index_next_with_direction(uchar *const buf, bool move_forward)
650       MY_ATTRIBUTE((__warn_unused_result__));
651   int index_prev(uchar *const buf) override
652       MY_ATTRIBUTE((__warn_unused_result__));
653 
654   int index_first(uchar *const buf) override
655       MY_ATTRIBUTE((__warn_unused_result__));
656   int index_last(uchar *const buf) override
657       MY_ATTRIBUTE((__warn_unused_result__));
658 
659   class Item *idx_cond_push(uint keyno, class Item *const idx_cond) override;
660   /*
661     The default implementation of cancel_pushed_idx_cond() suits us
662   */
663 
664   static bool check_bloom_and_set_bounds(
665       THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
666       const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
667       uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
668       rocksdb::Slice *upper_bound_slice);
669 
670  private:
671   struct key_def_cf_info {  // column family handle plus two flags for a key def
672     std::shared_ptr<rocksdb::ColumnFamilyHandle> cf_handle;
673     bool is_reverse_cf;
674     bool is_per_partition_cf;
675   };
676 
677   struct update_row_info {  // passed to the update/uniqueness helpers below
678     Rdb_transaction *tx;
679     const uchar *new_data;
680     const uchar *old_data;
681     rocksdb::Slice new_pk_slice;
682     rocksdb::Slice old_pk_slice;
683     rocksdb::Slice old_pk_rec;
684 
685     // "unpack_info" data for the new PK value
686     Rdb_string_writer *new_pk_unpack_info;
687 
688     longlong hidden_pk_id;
689     bool skip_unique_check;
690   };
691 
692   /*
693     Used to check for duplicate entries during fast unique secondary index
694     creation. The buffer pointers default to nullptr until the owner sets them.
695   */
696   struct unique_sk_buf_info {
697     bool sk_buf_switch = false;
698     rocksdb::Slice sk_memcmp_key;
699     rocksdb::Slice sk_memcmp_key_old;
700     uchar *dup_sk_buf = nullptr;
701     uchar *dup_sk_buf_old = nullptr;
702 
703     /*
704       Meant to be called back to back during inplace creation of unique
705       indexes. Each call flips sk_buf_switch and returns dup_sk_buf when the
706       flag becomes true, dup_sk_buf_old otherwise. The two buffers each store
707       the memcmp form of secondary keys, which are then converted to slices in
708       sk_memcmp_key or sk_memcmp_key_old. Alternating on each iteration lets
709       us retain the sk_memcmp_key_old value for duplicate comparison.
710       Returns nullptr if the selected buffer pointer was never assigned.
711     */
712     inline uchar *swap_and_get_sk_buf() {
713       sk_buf_switch = !sk_buf_switch;
714       return sk_buf_switch ? dup_sk_buf : dup_sk_buf_old;
715     }
716   };
717 
718   int create_cfs(const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
719                  std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs)
720       const MY_ATTRIBUTE((__warn_unused_result__));
721   /* cf_info: one key_def_cf_info entry (create_cfs takes MAX_INDEXES + 1). */
722   int create_key_def(const TABLE *const table_arg, const uint i,
723                      const Rdb_tbl_def *const tbl_def_arg,
724                      std::shared_ptr<Rdb_key_def> *const new_key_def,
725                      const struct key_def_cf_info &cf_info, uint64 ttl_duration,
726                      const std::string &ttl_column) const
727       MY_ATTRIBUTE((__warn_unused_result__));
728 
729   int create_inplace_key_defs(
730       const TABLE *const table_arg, Rdb_tbl_def *vtbl_def_arg,
731       const TABLE *const old_table_arg,
732       const Rdb_tbl_def *const old_tbl_def_arg,
733       const std::array<key_def_cf_info, MAX_INDEXES + 1> &cf,
734       uint64 ttl_duration, const std::string &ttl_column) const
735       MY_ATTRIBUTE((__warn_unused_result__));
736 
737   std::unordered_map<std::string, uint> get_old_key_positions(
738       const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg,
739       const TABLE *old_table_arg, const Rdb_tbl_def *old_tbl_def_arg) const;
740 
741   int compare_key_parts(const KEY *const old_key,
742                         const KEY *const new_key) const
743       MY_ATTRIBUTE((__warn_unused_result__));
744 
745   int compare_keys(const KEY *const old_key, const KEY *const new_key) const
746       MY_ATTRIBUTE((__warn_unused_result__));
747 
748   bool should_hide_ttl_rec(const Rdb_key_def &kd,
749                            const rocksdb::Slice &ttl_rec_val,
750                            const int64_t curr_ts)
751       MY_ATTRIBUTE((__warn_unused_result__));
752   int rocksdb_skip_expired_records(const Rdb_key_def &kd,
753                                    rocksdb::Iterator *const iter,
754                                    bool seek_backward);
755 
756   int index_first_intern(uchar *buf) MY_ATTRIBUTE((__warn_unused_result__));
757   int index_last_intern(uchar *buf) MY_ATTRIBUTE((__warn_unused_result__));
758   /* Index condition pushdown helpers (see idx_cond_push()). */
759   enum icp_result check_index_cond() const;
760   int find_icp_matching_index_rec(const bool move_forward, uchar *const buf)
761       MY_ATTRIBUTE((__warn_unused_result__));
762 
763   void calc_updated_indexes();
764   int update_write_row(const uchar *const old_data, const uchar *const new_data,
765                        const bool skip_unique_check)
766       MY_ATTRIBUTE((__warn_unused_result__));
767   int get_pk_for_update(struct update_row_info *const row_info);  // non-const
768   int check_and_lock_unique_pk(const uint key_id,
769                                const struct update_row_info &row_info,
770                                bool *const found, const bool skip_unique_check)
771       MY_ATTRIBUTE((__warn_unused_result__));
772   int check_and_lock_sk(const uint key_id,
773                         const struct update_row_info &row_info,
774                         bool *const found, const bool skip_unique_check)
775       MY_ATTRIBUTE((__warn_unused_result__));
776   int check_uniqueness_and_lock(const struct update_row_info &row_info,
777                                 bool pk_changed, const bool skip_unique_check)
778       MY_ATTRIBUTE((__warn_unused_result__));
779   bool over_bulk_load_threshold(int *err)
780       MY_ATTRIBUTE((__warn_unused_result__));
781   int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &key_def,
782                          const rocksdb::Slice *key,
783                          struct unique_sk_buf_info *sk_info)
784       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
785   int bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
786                     const rocksdb::Slice &key, const rocksdb::Slice &value,
787                     bool sort)
788       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
789   int update_write_pk(const Rdb_key_def &kd,
790                       const struct update_row_info &row_info,
791                       const bool pk_changed)
792       MY_ATTRIBUTE((__warn_unused_result__));
793   int update_write_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
794                       const struct update_row_info &row_info,
795                       const bool bulk_load_sk)
796       MY_ATTRIBUTE((__warn_unused_result__));
797   int update_write_indexes(const struct update_row_info &row_info,
798                            const bool pk_changed)
799       MY_ATTRIBUTE((__warn_unused_result__));
800 
/*
  Key lookup / positioning helpers (declarations only). Each one takes the
  key definition kd, a key_slice and a ttl_filter_ts value, and its int
  result must be checked (__warn_unused_result__).
  NOTE(review): ttl_filter_ts is presumably the timestamp used to filter out
  TTL-expired rows -- confirm against the definitions.
*/
801   int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter,
802                      const bool using_full_key, const rocksdb::Slice &key_slice,
803                      const int64_t ttl_filter_ts)
804       MY_ATTRIBUTE((__warn_unused_result__));
/*
  NOTE(review): read_before_key and read_after_key are tagged __nonnull__
  although neither declares a pointer parameter.
*/
805   int read_before_key(const Rdb_key_def &kd, const bool using_full_key,
806                       const rocksdb::Slice &key_slice,
807                       const int64_t ttl_filter_ts)
808       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
809   int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice,
810                      const int64_t ttl_filter_ts)
811       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
/*
  move_forward points to a writable bool;
  NOTE(review): it looks like an out-parameter telling the caller the scan
  direction -- confirm against the definition.
*/
812   int position_to_correct_key(const Rdb_key_def &kd,
813                               const enum ha_rkey_function &find_flag,
814                               const bool full_key_match, const uchar *const key,
815                               const key_part_map &keypart_map,
816                               const rocksdb::Slice &key_slice,
817                               bool *const move_forward,
818                               const int64_t ttl_filter_ts)
819       MY_ATTRIBUTE((__warn_unused_result__));
820 
/*
  Row fetch helpers (declarations only; results must be checked).
  NOTE(review): buf is presumably the output record buffer -- confirm
  against the definitions.
*/
821   int read_row_from_primary_key(uchar *const buf)
822       MY_ATTRIBUTE((__warn_unused_result__));
823   int read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd,
824                                   bool move_forward)
825       MY_ATTRIBUTE((__warn_unused_result__));
826 
/*
  Declaration only; the result must be checked. end_key is read-only, while
  end_key_packed_size points to a writable uint.
  NOTE(review): end_key_packed_size looks like an out-parameter -- confirm.
*/
827   int calc_eq_cond_len(const Rdb_key_def &kd,
828                        const enum ha_rkey_function &find_flag,
829                        const rocksdb::Slice &slice,
830                        const int bytes_changed_by_succ,
831                        const key_range *const end_key,
832                        uint *const end_key_packed_size)
833       MY_ATTRIBUTE((__warn_unused_result__));
834 
/*
  Declaration only; the returned pointer must be checked.
  NOTE(review): the name suggests a NULL result when the table is unknown --
  confirm against the definition.
*/
835   Rdb_tbl_def *get_table_if_exists(const char *const tablename)
836       MY_ATTRIBUTE((__warn_unused_result__));
/* Declaration only; thd is tagged __nonnull__ */
837   void read_thd_vars(THD *const thd) MY_ATTRIBUTE((__nonnull__));
838 
/* Declaration only; thd is tagged __nonnull__ and the result must be checked */
839   bool contains_foreign_key(THD *const thd)
840       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
841 
/*
  Declaration only. indexes is a set of shared key definitions passed by
  const reference; table_arg is the only pointer argument __nonnull__ can
  apply to.
*/
842   int inplace_populate_sk(
843       TABLE *const table_arg,
844       const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes)
845       MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
846 
/* Declaration only. print_client_error defaults to true. */
847   int finalize_bulk_load(bool print_client_error = true)
848       MY_ATTRIBUTE((__warn_unused_result__));
849 
/* Declarations only; no arguments and no result */
850   void inc_table_n_rows();
851   void dec_table_n_rows();
852 
/*
  const predicates evaluated on a return code rc (declarations only).
  Neither is tagged __warn_unused_result__.
*/
853   bool should_skip_invalidated_record(const int rc) const;
854   bool should_recreate_snapshot(const int rc, const bool is_new_snapshot) const;
855 
/* Declaration only; non-const member taking a THD pointer */
856   bool can_assume_tracked(THD *thd);
857 
858  public:
/* Inline setter: stores flag in m_pk_can_be_decoded and does nothing else */
859   void set_pk_can_be_decoded(bool flag) { m_pk_can_be_decoded = flag; }
/*
  Overrides of base-class virtuals (declarations only); both results must be
  checked (__warn_unused_result__).
*/
860   int index_init(uint idx, bool sorted) override
861       MY_ATTRIBUTE((__warn_unused_result__));
862   int index_end() override MY_ATTRIBUTE((__warn_unused_result__));
863 
/* Override of a base-class virtual; declaration only */
864   void unlock_row() override;
865 
866   /** @brief
867     Unlike index_init(), rnd_init() can be called two consecutive times
868     without rnd_end() in between (it only makes sense if scan=1). In this
869     case, the second call should prepare for the new table scan (e.g. if
870     rnd_init() allocates the cursor, the second call should position the
871     cursor to the start of the table; there is no need to deallocate and
872     allocate it again). This is a required method.
873   */
874   int rnd_init(bool scan) override MY_ATTRIBUTE((__warn_unused_result__));
875   int rnd_end() override MY_ATTRIBUTE((__warn_unused_result__));
876 
/*
  rnd_next() overrides a base-class virtual; rnd_next_with_direction() is not
  marked override, i.e. it is a helper specific to this class. Declarations
  only; both results must be checked.
*/
877   int rnd_next(uchar *const buf) override
878       MY_ATTRIBUTE((__warn_unused_result__));
879   int rnd_next_with_direction(uchar *const buf, bool move_forward)
880       MY_ATTRIBUTE((__warn_unused_result__));
881 
/*
  Overrides of base-class virtuals (declarations only). Only rnd_pos() is
  tagged __warn_unused_result__; info() leaves its uint parameter unnamed.
*/
882   int rnd_pos(uchar *const buf, uchar *const pos) override
883       MY_ATTRIBUTE((__warn_unused_result__));
884   void position(const uchar *const record) override;
885   int info(uint) override;
886 
887   /* This function will always return success, therefore no annotation related
888    * to checking the return value. Can't change the signature because it's
889    * required by the interface. */
890   int extra(enum ha_extra_function operation) override;
891 
/*
  Overrides of base-class virtuals (declarations only); unlike extra(), all
  three are tagged __warn_unused_result__.
*/
892   int start_stmt(THD *const thd, thr_lock_type lock_type) override
893       MY_ATTRIBUTE((__warn_unused_result__));
894   int external_lock(THD *const thd, int lock_type) override
895       MY_ATTRIBUTE((__warn_unused_result__));
896   int truncate() override MY_ATTRIBUTE((__warn_unused_result__));
897 
/*
  Override defined inline. It only calls m_retrieved_record.Reset() and then
  reports HA_EXIT_SUCCESS; no other member is touched here.
  DBUG_ENTER_FUNC() and DBUG_RETURN() are macros defined outside this view
  and are used as a pair, so the function must leave through DBUG_RETURN().
*/
898   int reset() override {
899     DBUG_ENTER_FUNC();
900 
901     /* Free blob data */
902     m_retrieved_record.Reset();
903 
904     DBUG_RETURN(HA_EXIT_SUCCESS);
905   }
906 
/*
  Table maintenance and DDL entry points (declarations only). Members marked
  override replace base-class virtuals; the others (remove_rows,
  delete_non_partitioned_table, delete_partitioned_table,
  delete_table(Rdb_tbl_def *), create_table, truncate_table) are specific to
  this class.
*/
907   int check(THD *const thd, HA_CHECK_OPT *const check_opt) override
908       MY_ATTRIBUTE((__warn_unused_result__));
/* Not tagged __warn_unused_result__, unlike its neighbours */
909   int remove_rows(Rdb_tbl_def *const tbl);
910   ha_rows records_in_range(uint inx, key_range *const min_key,
911                            key_range *const max_key) override
912       MY_ATTRIBUTE((__warn_unused_result__));
913   int delete_non_partitioned_table(const char *const from)
914       MY_ATTRIBUTE((__warn_unused_result__));
915   int delete_partitioned_table(const char *const from,
916                                const std::string &partition_info_str)
917       MY_ATTRIBUTE((__warn_unused_result__));
918 
/*
  Two delete_table() overloads: one takes a table definition, the other
  (the override) takes a name. Only the override is tagged
  __warn_unused_result__.
*/
919   int delete_table(Rdb_tbl_def *const tbl);
920   int delete_table(const char *const from) override
921       MY_ATTRIBUTE((__warn_unused_result__));
922   int create(const char *const name, TABLE *const form,
923              HA_CREATE_INFO *const create_info) override
924       MY_ATTRIBUTE((__warn_unused_result__));
/* create_table() and truncate_table() both take an auto_increment_value */
925   int create_table(const std::string &table_name, const TABLE *table_arg,
926                    ulonglong auto_increment_value);
927   int truncate_table(Rdb_tbl_def *tbl_def, TABLE *table_arg,
928                      ulonglong auto_increment_value);
929   bool check_if_incompatible_data(HA_CREATE_INFO *const info,
930                                   uint table_changes) override
931       MY_ATTRIBUTE((__warn_unused_result__));
932 
/* Override; the returned THR_LOCK_DATA ** must be consumed by the caller */
933   THR_LOCK_DATA **store_lock(THD *const thd, THR_LOCK_DATA **to,
934                              enum thr_lock_type lock_type) override
935       MY_ATTRIBUTE((__warn_unused_result__));
936 
/*
  Override defined inline. None of the five parameters is read; the function
  unconditionally returns FALSE. DBUG_ENTER_FUNC() and DBUG_RETURN() are
  macros defined outside this view and are used as a pair.
*/
937   my_bool register_query_cache_table(THD *const thd, char *const table_key,
938                                      size_t key_length,
939                                      qc_engine_callback *const engine_callback,
940                                      ulonglong *const engine_data) override {
941     DBUG_ENTER_FUNC();
942 
943     /* Currently, we don't support query cache */
944     DBUG_RETURN(FALSE);
945   }
946 
/*
  Override (declaration only).
  NOTE(review): buf is presumably filled with the text for `error` --
  confirm against the definition.
*/
947   bool get_error_message(const int error, String *const buf) override;
948 
/*
  Static helper (declaration only): takes a rocksdb::Status plus an optional
  msg (defaults to nullptr) and yields an int that must be checked.
*/
949   static int rdb_error_to_mysql(const rocksdb::Status &s,
950                                 const char *msg = nullptr)
951       MY_ATTRIBUTE((__warn_unused_result__));
952 
/*
  Overrides of base-class virtuals (declarations only). first_value and
  nb_reserved_values point to writable ulonglong values;
  NOTE(review): they look like out-parameters -- confirm.
*/
953   void get_auto_increment(ulonglong offset, ulonglong increment,
954                           ulonglong nb_desired_values,
955                           ulonglong *const first_value,
956                           ulonglong *const nb_reserved_values) override;
957   void update_create_info(HA_CREATE_INFO *const create_info) override;
958   int optimize(THD *const thd, HA_CHECK_OPT *const check_opt) override
959       MY_ATTRIBUTE((__warn_unused_result__));
960   int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override
961       MY_ATTRIBUTE((__warn_unused_result__));
962 
/*
  In-place ALTER hooks, all overrides of base-class virtuals (declarations
  only). Every hook receives the altered TABLE and the
  my_core::Alter_inplace_info describing the change;
  commit_inplace_alter_table() additionally takes a bool commit.
  None of them is tagged __warn_unused_result__.
*/
963   enum_alter_inplace_result check_if_supported_inplace_alter(
964       TABLE *altered_table,
965       my_core::Alter_inplace_info *const ha_alter_info) override;
966 
967   bool prepare_inplace_alter_table(
968       TABLE *const altered_table,
969       my_core::Alter_inplace_info *const ha_alter_info) override;
970 
971   bool inplace_alter_table(
972       TABLE *const altered_table,
973       my_core::Alter_inplace_info *const ha_alter_info) override;
974 
975   bool commit_inplace_alter_table(
976       TABLE *const altered_table,
977       my_core::Alter_inplace_info *const ha_alter_info, bool commit) override;
978 
/*
  Class-specific members (no override), declarations only. The two
  adjust_handler_stats_* functions return int but are not tagged
  __warn_unused_result__.
*/
979   bool is_read_free_rpl_table() const;
980   int adjust_handler_stats_sst_and_memtable();
981   int adjust_handler_stats_table_scan();
982 
/*
  NOTE(review): check_build_decoder() presumably consults
  m_need_build_decoder before calling build_decoder() -- confirm against the
  definitions.
*/
983   void build_decoder();
984   void check_build_decoder();
985 
/* Static member: callable without a handler instance */
986   static void inc_covered_sk_lookup();
987 
/*
  Read-free replication (RFR) section. It is compiled only when
  ROCKSDB_INCLUDE_RFR is defined and non-zero; the top of this header
  defines it as 1. The five rpl_* members override base-class virtuals,
  whereas use_read_free_rpl() is virtual but not an override.
*/
988 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
989  public:
990   virtual void rpl_before_delete_rows() override;
991   virtual void rpl_after_delete_rows() override;
992   virtual void rpl_before_update_rows() override;
993   virtual void rpl_after_update_rows() override;
994   virtual bool rpl_lookup_rows() override;
995 
996   virtual bool use_read_free_rpl() const;  // MyRocks only
997 
998  private:
999   /* Flags tracking if we are inside different replication operation */
1000   bool m_in_rpl_delete_rows;
1001   bool m_in_rpl_update_rows;
1002 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
1003 
/*
  NOTE(review): presumably forces the unique check to be skipped regardless
  of the per-call skip_unique_check arguments -- confirm against the users of
  this flag. It has no in-class initializer here.
*/
1004   bool m_force_skip_unique_check;
1005 
1006   /* Need to build decoder on next read operation */
1007   bool m_need_build_decoder;
1008 };
1009 
1010 /*
1011   Helper class for in-place alter, for storing handler context between inplace
1012   alter calls
  All members except m_new_key_descr are const and are fixed by the
  constructor. The destructor is empty, so the raw pointers kept here
  (m_new_tdef, m_old_key_descr, m_new_key_descr) are not released by this
  struct. Copying is disabled.
1013 */
1014 struct Rdb_inplace_alter_ctx : public my_core::inplace_alter_handler_ctx {
1015   /* The new table definition */
1016   Rdb_tbl_def *const m_new_tdef;
1017 
1018   /* Stores the original key definitions */
1019   std::shared_ptr<Rdb_key_def> *const m_old_key_descr;
1020 
1021   /* Stores the new key definitions */
1022   std::shared_ptr<Rdb_key_def> *m_new_key_descr;
1023 
1024   /* Stores the old number of key definitions */
1025   const uint m_old_n_keys;
1026 
1027   /* Stores the new number of key definitions */
1028   const uint m_new_n_keys;
1029 
1030   /* Stores the key definitions of the indexes being added */
1031   const std::unordered_set<std::shared_ptr<Rdb_key_def>> m_added_indexes;
1032 
1033   /* Stores the ids (GL_INDEX_ID) of the indexes being dropped */
1034   const std::unordered_set<GL_INDEX_ID> m_dropped_index_ids;
1035 
1036   /* Stores number of keys to add */
1037   const uint m_n_added_keys;
1038 
1039   /* Stores number of keys to drop */
1040   const uint m_n_dropped_keys;
1041 
1042   /* Stores the largest current auto increment value in the index */
1043   const ulonglong m_max_auto_incr;
1044 
  /*
    Stores every argument in the member of the same name. The two sets are
    taken by value and copied into the const members.
  */
1045   Rdb_inplace_alter_ctx(
1046       Rdb_tbl_def *new_tdef, std::shared_ptr<Rdb_key_def> *old_key_descr,
1047       std::shared_ptr<Rdb_key_def> *new_key_descr, uint old_n_keys,
1048       uint new_n_keys,
1049       std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes,
1050       std::unordered_set<GL_INDEX_ID> dropped_index_ids, uint n_added_keys,
1051       uint n_dropped_keys, ulonglong max_auto_incr)
1052       : my_core::inplace_alter_handler_ctx(),
1053         m_new_tdef(new_tdef),
1054         m_old_key_descr(old_key_descr),
1055         m_new_key_descr(new_key_descr),
1056         m_old_n_keys(old_n_keys),
1057         m_new_n_keys(new_n_keys),
1058         m_added_indexes(added_indexes),
1059         m_dropped_index_ids(dropped_index_ids),
1060         m_n_added_keys(n_added_keys),
1061         m_n_dropped_keys(n_dropped_keys),
1062         m_max_auto_incr(max_auto_incr) {}
1063 
1064   ~Rdb_inplace_alter_ctx() {}
1065 
1066  private:
1067   /* Disable Copying: deleted, so any attempt is rejected at compile time */
1068   Rdb_inplace_alter_ctx(const Rdb_inplace_alter_ctx &) = delete;
1069   Rdb_inplace_alter_ctx &operator=(const Rdb_inplace_alter_ctx &) = delete;
1070 };
1071 
1072 /*
1073   Helper class to control access/init to handlerton instance.
1074   Contains a flag that is set if the handlerton is in an initialized, usable
1075   state, plus a reader-writer lock to protect it without serializing reads.
1076   Since we don't have static initializers for the opaque mysql_rwlock type,
1077   use constructor and destructor functions to create and destroy
1078   the lock before and after main(), respectively.
  The object owns m_rwlock (initialized in the constructor, destroyed in the
  destructor), so it is neither copyable nor assignable.
1079 */
1080 struct Rdb_hton_init_state {
  /*
    Scope guard: the constructor takes the write lock when `write` is true
    and the read lock otherwise; the destructor calls unlock().
  */
1081   struct Scoped_lock {
1082     Scoped_lock(Rdb_hton_init_state &state, bool write) : m_state(state) {
1083       if (write)
1084         m_state.lock_write();
1085       else
1086         m_state.lock_read();
1087     }
1088     ~Scoped_lock() { m_state.unlock(); }
1089 
1090    private:
    /* Deleted: a copied guard would call unlock() a second time */
1091     Scoped_lock(const Scoped_lock &) = delete;
1092     void operator=(const Scoped_lock &) = delete;
1093 
1094     Rdb_hton_init_state &m_state;
1095   };
1096 
1097   Rdb_hton_init_state() : m_initialized(false) {
1098     /*
1099       m_rwlock can not be instrumented as it must be initialized before
1100       mysql_mutex_register() call to protect some globals from race condition.
1101     */
1102     mysql_rwlock_init(0, &m_rwlock);
1103   }
1104 
1105   ~Rdb_hton_init_state() { mysql_rwlock_destroy(&m_rwlock); }
1106 
  /*
    Deleted: a copy would duplicate m_rwlock, and mysql_rwlock_destroy()
    would then run once per copy on what is a single lock.
  */
  Rdb_hton_init_state(const Rdb_hton_init_state &) = delete;
  Rdb_hton_init_state &operator=(const Rdb_hton_init_state &) = delete;

1107   void lock_read() { mysql_rwlock_rdlock(&m_rwlock); }
1108 
1109   void lock_write() { mysql_rwlock_wrlock(&m_rwlock); }
1110 
  /* Releases m_rwlock, whether it was taken by lock_read() or lock_write() */
1111   void unlock() { mysql_rwlock_unlock(&m_rwlock); }
1112 
1113   /*
1114     Must be called with either a read or write lock held, unable to enforce
1115     behavior as mysql_rwlock has no means of determining if a thread has a lock
1116   */
1117   bool initialized() const { return m_initialized; }
1118 
1119   /*
1120     Must be called with only a write lock held, unable to enforce behavior as
1121     mysql_rwlock has no means of determining if a thread has a lock
1122   */
1123   void set_initialized(bool init) { m_initialized = init; }
1124 
1125  private:
1126   mysql_rwlock_t m_rwlock;
1127   bool m_initialized;
1128 };
1129 
1130 // file name indicating RocksDB data corruption (declared here, defined elsewhere)
1131 std::string rdb_corruption_marker_file_name();
1132 
1133 }  // namespace myrocks
1134