1 /*
2    Copyright (c) 2012,2013 Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
16 #pragma once
17 
18 #define ROCKSDB_INCLUDE_VALIDATE_TABLES 1
19 
20 /* C++ standard header files */
21 #include <algorithm>
22 #include <array>
23 #include <atomic>
24 #include <map>
25 #include <mutex>
26 #include <string>
27 #include <unordered_map>
28 #include <unordered_set>
29 #include <utility>
30 #include <vector>
31 
/* POSIX system header files */
33 #include <arpa/inet.h>
34 
35 /* MyRocks header files */
36 #include "./ha_rocksdb.h"
37 #include "./properties_collector.h"
38 #include "./rdb_buff.h"
39 #include "./rdb_utils.h"
40 
41 namespace myrocks {
42 
// Forward declarations of MyRocks classes (kept in alphabetical order).
class Rdb_cf_manager;
class Rdb_ddl_manager;
class Rdb_dict_manager;
class Rdb_field_packing;
class Rdb_key_def;
48 
49 class Rdb_convert_to_record_key_decoder {
50  public:
51   Rdb_convert_to_record_key_decoder() = default;
52   Rdb_convert_to_record_key_decoder(
53       const Rdb_convert_to_record_key_decoder &decoder) = delete;
54   Rdb_convert_to_record_key_decoder &operator=(
55       const Rdb_convert_to_record_key_decoder &decoder) = delete;
56   static int decode(uchar *const buf, Rdb_field_packing *fpi, TABLE *table,
57                     bool has_unpack_info, Rdb_string_reader *reader,
58                     Rdb_string_reader *unpack_reader);
59   static int skip(const Rdb_field_packing *fpi, const Field *field,
60                   Rdb_string_reader *reader, Rdb_string_reader *unpack_reader);
61 
62  private:
63   static int decode_field(Rdb_field_packing *fpi, TABLE *table, uchar *buf,
64                           Rdb_string_reader *reader,
65                           Rdb_string_reader *unpack_reader);
66 };
67 
68 /*
69   @brief
70   Field packing context.
71   The idea is to ensure that a call to rdb_index_field_pack_t function
72   is followed by a call to rdb_make_unpack_info_t.
73 
74   @detail
75   For some datatypes, unpack_info is produced as a side effect of
76   rdb_index_field_pack_t function call.
77   For other datatypes, packing is just calling make_sort_key(), while
78   rdb_make_unpack_info_t is a custom function.
79   In order to accommodate both cases, we require both calls to be made and
80   unpack_info is passed as context data between the two.
81 */
82 class Rdb_pack_field_context {
83  public:
84   Rdb_pack_field_context(const Rdb_pack_field_context &) = delete;
85   Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete;
86 
Rdb_pack_field_context(Rdb_string_writer * const writer_arg)87   explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg)
88       : writer(writer_arg) {}
89 
90   // NULL means we're not producing unpack_info.
91   Rdb_string_writer *writer;
92 };
93 
94 class Rdb_key_field_iterator {
95  private:
96   TABLE *m_table;
97   Rdb_string_reader *m_reader;
98   Rdb_string_reader *m_unp_reader;
99   uint m_curr_bitmap_pos;
100   const MY_BITMAP *m_covered_bitmap;
101   uchar *m_buf;
102   bool m_has_unpack_info;
103   const Rdb_key_def *m_key_def;
104   bool m_secondary_key;
105   bool m_hidden_pk_exists;
106   bool m_is_hidden_pk;
107   bool m_is_null;
108   Rdb_field_packing *m_fpi;
109   Rdb_field_packing *m_fpi_end;
110 
111  public:
112   Rdb_key_field_iterator(const Rdb_key_field_iterator &) = delete;
113   Rdb_key_field_iterator &operator=(const Rdb_key_field_iterator &) = delete;
114   Rdb_key_field_iterator(const Rdb_key_def *key_def,
115                          Rdb_field_packing *pack_info,
116                          Rdb_string_reader *reader,
117                          Rdb_string_reader *unp_reader, TABLE *table,
118                          bool has_unpack_info, const MY_BITMAP *covered_bitmap,
119                          uchar *buf);
120 
121   int next();
122   bool has_next();
123 };
124 
125 struct Rdb_collation_codec;
126 struct Rdb_index_info;
127 
128 /*
129   C-style "virtual table" allowing different handling of packing logic based
130   on the field type. See Rdb_field_packing::setup() implementation.
131   */
132 using rdb_make_unpack_info_t = void (*)(const Rdb_collation_codec *codec,
133                                         const Field *field,
134                                         Rdb_pack_field_context *pack_ctx);
135 using rdb_index_field_unpack_t = int (*)(Rdb_field_packing *fpi,
136                                          uchar *field_ptr,
137                                          Rdb_string_reader *reader,
138                                          Rdb_string_reader *unpack_reader);
139 using rdb_index_field_skip_t = int (*)(const Rdb_field_packing *fpi,
140                                        Rdb_string_reader *reader);
141 using rdb_index_field_pack_t = void (*)(Rdb_field_packing *fpi, Field *field,
142                                         uchar *buf, uchar **dst,
143                                         Rdb_pack_field_context *pack_ctx);
144 
// Sentinel key length: every bit of a uint set.
constexpr uint RDB_INVALID_KEY_LEN = static_cast<uint>(-1);

// Bytes taken by a single checksum inside a stored record.
constexpr size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);

// Total bytes of checksum data in a record: one tag byte followed by two
// checksums.
constexpr size_t RDB_CHECKSUM_CHUNK_SIZE = 1 + 2 * RDB_CHECKSUM_SIZE;

// Tag byte that opens the checksum data; two CRC32 checksums follow it.
constexpr char RDB_CHECKSUM_DATA_TAG = 0x01;

// Unpack data is variable length. Its header is one tag byte plus a two-byte
// length field, and the stored length counts the header too.
constexpr char RDB_UNPACK_DATA_TAG = 0x02;
constexpr size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
constexpr size_t RDB_UNPACK_HEADER_SIZE =
    sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;

// Covered-bitmap variant of the header: one tag byte, a two-byte length
// field and a two-byte covered bitmap. The stored length counts the header.
constexpr char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
constexpr size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
constexpr size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
constexpr size_t RDB_UNPACK_COVERED_HEADER_SIZE =
    sizeof(RDB_UNPACK_COVERED_DATA_TAG) +
    RDB_UNPACK_COVERED_DATA_LEN_SIZE +
    RDB_COVERED_BITMAP_SIZE;
181 
182 /*
183   Data dictionary index info field sizes.
184 */
185 const constexpr size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16);
186 const constexpr size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar);
187 const constexpr size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16);
188 const constexpr size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32);
189 const constexpr size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16);
190 
// Result codes returned by rdb_index_field_unpack_t functions.
enum {
  UNPACK_SUCCESS = 0,  // zero on success
  UNPACK_FAILURE = 1,  // non-zero on failure
};
196 
197 /*
198   An object of this class represents information about an index in an SQL
199   table. It provides services to encode and decode index tuples.
200 
201   Note: a table (as in, on-disk table) has a single Rdb_key_def object which
202   is shared across multiple TABLE* objects and may be used simultaneously from
203   different threads.
204 
205   There are several data encodings:
206 
207   === SQL LAYER ===
208   SQL layer uses two encodings:
209 
210   - "Table->record format". This is the format that is used for the data in
211      the record buffers, table->record[i]
212 
213   - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index
214     lookup functions, like handler::index_read_map().
215 
216   === Inside RocksDB ===
217   Primary Key is stored as a mapping:
218 
219     index_tuple -> StoredRecord
220 
221   StoredRecord is in Table->record format, except for blobs, which are stored
222   in-place. See ha_rocksdb::convert_record_to_storage_format for details.
223 
224   Secondary indexes are stored as one of two variants:
225 
226     index_tuple -> unpack_info
227     index_tuple -> empty_string
228 
229   index_tuple here is the form of key that can be compared with memcmp(), aka
230   "mem-comparable form".
231 
  unpack_info is extra data that makes it possible to restore the original
  value from its mem-comparable form. It is present only if the index supports
  index-only reads.
235 */
236 
237 class Rdb_key_def {
238  public:
239   /* Convert a key from KeyTupleFormat to mem-comparable form */
240   uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
241                         uchar *const packed_tuple, const uchar *const key_tuple,
242                         const key_part_map &keypart_map) const;
243 
244   uchar *pack_field(Field *const field, Rdb_field_packing *pack_info,
245                     uchar *tuple, uchar *const packed_tuple,
246                     uchar *const pack_buffer,
247                     Rdb_string_writer *const unpack_info,
248                     uint *const n_null_fields) const;
249   /* Convert a key from Table->record format to mem-comparable form */
250   uint pack_record(const TABLE *const tbl, uchar *const pack_buffer,
251                    const uchar *const record, uchar *const packed_tuple,
252                    Rdb_string_writer *const unpack_info,
253                    const bool should_store_row_debug_checksums,
254                    const longlong hidden_pk_id = 0, uint n_key_parts = 0,
255                    uint *const n_null_fields = nullptr,
256                    const char *const ttl_bytes = nullptr) const;
257   /* Pack the hidden primary key into mem-comparable form. */
258   uint pack_hidden_pk(const longlong hidden_pk_id,
259                       uchar *const packed_tuple) const;
260   int unpack_record(TABLE *const table, uchar *const buf,
261                     const rocksdb::Slice *const packed_key,
262                     const rocksdb::Slice *const unpack_info,
263                     const bool verify_row_debug_checksums) const;
264 
265   static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info);
266   int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2,
267                    std::size_t *const column_index) const;
268 
269   size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const;
270 
271   /* Get the key that is the "infimum" for this index */
get_infimum_key(uchar * const key,uint * const size)272   inline void get_infimum_key(uchar *const key, uint *const size) const {
273     rdb_netbuf_store_index(key, m_index_number);
274     *size = INDEX_NUMBER_SIZE;
275   }
276 
277   /* Get the key that is a "supremum" for this index */
get_supremum_key(uchar * const key,uint * const size)278   inline void get_supremum_key(uchar *const key, uint *const size) const {
279     rdb_netbuf_store_index(key, m_index_number + 1);
280     *size = INDEX_NUMBER_SIZE;
281   }
282 
283   /*
284     Get the first key that you need to position at to start iterating.
285     Stores into *key a "supremum" or "infimum" key value for the index.
286     @parameters key    OUT  Big Endian, value is m_index_number or
287                             m_index_number + 1
288     @parameters size   OUT  key size, value is INDEX_NUMBER_SIZE
289     @return Number of bytes in the key that are usable for bloom filter use.
290   */
get_first_key(uchar * const key,uint * const size)291   inline int get_first_key(uchar *const key, uint *const size) const {
292     if (m_is_reverse_cf) {
293       get_supremum_key(key, size);
294       /* Find out how many bytes of infimum are the same as m_index_number */
295       uchar unmodified_key[INDEX_NUMBER_SIZE];
296       rdb_netbuf_store_index(unmodified_key, m_index_number);
297       int i;
298       for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
299         if (key[i] != unmodified_key[i]) {
300           break;
301         }
302       }
303       return i;
304     } else {
305       get_infimum_key(key, size);
306       // For infimum key, its value will be m_index_number
307       // Thus return its own size instead.
308       return INDEX_NUMBER_SIZE;
309     }
310   }
311 
312   /*
313     The same as get_first_key, but get the key for the last entry in the index
314     @parameters key    OUT  Big Endian, value is m_index_number or
315                             m_index_number + 1
316     @parameters size   OUT  key size, value is INDEX_NUMBER_SIZE
317 
318     @return Number of bytes in the key that are usable for bloom filter use.
319   */
get_last_key(uchar * const key,uint * const size)320   inline int get_last_key(uchar *const key, uint *const size) const {
321     if (m_is_reverse_cf) {
322       get_infimum_key(key, size);
323       // For infimum key, its value will be m_index_number
324       // Thus return its own size instead.
325       return INDEX_NUMBER_SIZE;
326     } else {
327       get_supremum_key(key, size);
328       /* Find out how many bytes are the same as m_index_number */
329       uchar unmodified_key[INDEX_NUMBER_SIZE];
330       rdb_netbuf_store_index(unmodified_key, m_index_number);
331       int i;
332       for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
333         if (key[i] != unmodified_key[i]) {
334           break;
335         }
336       }
337       return i;
338     }
339   }
340 
341   /* Make a key that is right after the given key. */
342   static int successor(uchar *const packed_tuple, const uint len);
343 
344   /* Make a key that is right before the given key. */
345   static int predecessor(uchar *const packed_tuple, const uint len);
346 
347   /*
348     This can be used to compare prefixes.
349     if  X is a prefix of Y, then we consider that X = Y.
350   */
351   // b describes the lookup key, which can be a prefix of a.
352   // b might be outside of the index_number range, if successor() is called.
cmp_full_keys(const rocksdb::Slice & a,const rocksdb::Slice & b)353   int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const {
354     assert(covers_key(a));
355 
356     return memcmp(a.data(), b.data(), std::min(a.size(), b.size()));
357   }
358 
359   /* Check if given mem-comparable key belongs to this index */
covers_key(const rocksdb::Slice & slice)360   bool covers_key(const rocksdb::Slice &slice) const {
361     if (slice.size() < INDEX_NUMBER_SIZE) return false;
362 
363     if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE)) {
364       return false;
365     }
366 
367     return true;
368   }
369 
370   void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const;
371 
372   bool covers_lookup(const rocksdb::Slice *const unpack_info,
373                      const MY_BITMAP *const map) const;
374 
use_covered_bitmap_format()375   inline bool use_covered_bitmap_format() const {
376     return m_index_type == INDEX_TYPE_SECONDARY &&
377            m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3;
378   }
379 
is_primary_key()380   inline bool is_primary_key() const {
381     return m_index_type == INDEX_TYPE_PRIMARY ||
382            m_index_type == INDEX_TYPE_HIDDEN_PRIMARY;
383   }
384 
385   /* Indicates that all key parts can be unpacked to cover a secondary lookup */
386   bool can_cover_lookup() const;
387 
388   /*
389     Return true if the passed mem-comparable key
390     - is from this index, and
391     - it matches the passed key prefix (the prefix is also in mem-comparable
392       form)
393   */
value_matches_prefix(const rocksdb::Slice & value,const rocksdb::Slice & prefix)394   bool value_matches_prefix(const rocksdb::Slice &value,
395                             const rocksdb::Slice &prefix) const {
396     return covers_key(value) && !cmp_full_keys(value, prefix);
397   }
398 
get_keyno()399   uint32 get_keyno() const { return m_keyno; }
400 
get_index_number()401   uint32 get_index_number() const { return m_index_number; }
402 
get_gl_index_id()403   GL_INDEX_ID get_gl_index_id() const {
404     const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number};
405     return gl_index_id;
406   }
407 
408   int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader,
409                            const uint part_num) const;
410 
411   /* Must only be called for secondary keys: */
412   uint get_primary_key_tuple(const TABLE *const tbl,
413                              const Rdb_key_def &pk_descr,
414                              const rocksdb::Slice *const key,
415                              uchar *const pk_buffer) const;
416 
417   uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key,
418                            uchar *sk_buffer, uint *n_null_fields) const;
419 
420   /* Return max length of mem-comparable form */
max_storage_fmt_length()421   uint max_storage_fmt_length() const { return m_maxlength; }
422 
get_key_parts()423   uint get_key_parts() const { return m_key_parts; }
424 
get_ttl_field_index()425   uint get_ttl_field_index() const { return m_ttl_field_index; }
426 
427   /*
428     Get a field object for key part #part_no
429 
430     @detail
431       SQL layer thinks unique secondary indexes and indexes in partitioned
432       tables are not "Extended" with Primary Key columns.
433 
434       Internally, we always extend all indexes with PK columns. This function
435       uses our definition of how the index is Extended.
436   */
437   inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const;
438 
get_name()439   const std::string &get_name() const { return m_name; }
440 
get_extractor()441   const rocksdb::SliceTransform *get_extractor() const {
442     return m_prefix_extractor.get();
443   }
444 
445   static size_t get_unpack_header_size(char tag);
446 
447   Rdb_key_def &operator=(const Rdb_key_def &) = delete;
448   Rdb_key_def(const Rdb_key_def &k);
449   Rdb_key_def(uint indexnr_arg, uint keyno_arg,
450               std::shared_ptr<rocksdb::ColumnFamilyHandle> cf_handle_arg,
451               uint16_t index_dict_version_arg, uchar index_type_arg,
452               uint16_t kv_format_version_arg, bool is_reverse_cf_arg,
453               bool is_per_partition_cf, const char *name,
454               Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0,
455               uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0);
456   ~Rdb_key_def();
457 
458   enum {
459     INDEX_NUMBER_SIZE = 4,
460     VERSION_SIZE = 2,
461     CF_NUMBER_SIZE = 4,
462     CF_FLAG_SIZE = 4,
463     PACKED_SIZE = 4,  // one int
464   };
465 
466   // bit flags for combining bools when writing to disk
467   enum {
468     REVERSE_CF_FLAG = 1,
469     AUTO_CF_FLAG = 2,  // Deprecated
470     PER_PARTITION_CF_FLAG = 4,
471   };
472 
473   // bit flags which denote myrocks specific fields stored in the record
474   // currently only used for TTL.
475   enum INDEX_FLAG {
476     TTL_FLAG = 1 << 0,
477 
478     // MAX_FLAG marks where the actual record starts
479     // This flag always needs to be set to the last index flag enum.
480     MAX_FLAG = TTL_FLAG << 1,
481   };
482 
483   // Set of flags to ignore when comparing two CF-s and determining if
484   // they're same.
485   static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG;
486 
487   // Data dictionary types
488   enum DATA_DICT_TYPE {
489     DDL_ENTRY_INDEX_START_NUMBER = 1,
490     INDEX_INFO = 2,
491     CF_DEFINITION = 3,
492     BINLOG_INFO_INDEX_NUMBER = 4,
493     DDL_DROP_INDEX_ONGOING = 5,
494     INDEX_STATISTICS = 6,
495     MAX_INDEX_ID = 7,
496     DDL_CREATE_INDEX_ONGOING = 8,
497     AUTO_INC = 9,
498     DROPPED_CF = 10,
499     END_DICT_INDEX_ID = 255
500   };
501 
502   // Data dictionary schema version. Introduce newer versions
503   // if changing schema layout
504   enum {
505     DDL_ENTRY_INDEX_VERSION = 1,
506     CF_DEFINITION_VERSION = 1,
507     BINLOG_INFO_INDEX_NUMBER_VERSION = 1,
508     DDL_DROP_INDEX_ONGOING_VERSION = 1,
509     MAX_INDEX_ID_VERSION = 1,
510     DDL_CREATE_INDEX_ONGOING_VERSION = 1,
511     AUTO_INCREMENT_VERSION = 1,
512     DROPPED_CF_VERSION = 1,
513     // Version for index stats is stored in IndexStats struct
514   };
515 
516   // Index info version.  Introduce newer versions when changing the
517   // INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the
518   // latest version number.
519   enum {
520     INDEX_INFO_VERSION_INITIAL = 1,  // Obsolete
521     INDEX_INFO_VERSION_KV_FORMAT,
522     INDEX_INFO_VERSION_GLOBAL_ID,
523     // There is no change to data format in this version, but this version
524     // verifies KV format version, whereas previous versions do not. A version
525     // bump is needed to prevent older binaries from skipping the KV version
526     // check inadvertently.
527     INDEX_INFO_VERSION_VERIFY_KV_FORMAT,
528     // This changes the data format to include a 8 byte TTL duration for tables
529     INDEX_INFO_VERSION_TTL,
530     // This changes the data format to include a bitmap before the TTL duration
531     // which will indicate in the future whether TTL or other special fields
532     // are turned on or off.
533     INDEX_INFO_VERSION_FIELD_FLAGS,
534     // This normally point to the latest (currently it does).
535     INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS,
536   };
537 
538   // MyRocks index types
539   enum {
540     INDEX_TYPE_PRIMARY = 1,
541     INDEX_TYPE_SECONDARY = 2,
542     INDEX_TYPE_HIDDEN_PRIMARY = 3,
543   };
544 
545   // Key/Value format version for each index type
546   enum {
547     PRIMARY_FORMAT_VERSION_INITIAL = 10,
548     // This change includes:
549     //  - For columns that can be unpacked with unpack_info, PK
550     //    stores the unpack_info.
551     //  - DECIMAL datatype is no longer stored in the row (because
552     //    it can be decoded from its mem-comparable form)
553     //  - VARCHAR-columns use endspace-padding.
554     PRIMARY_FORMAT_VERSION_UPDATE1 = 11,
555     // This change includes:
556     //  - Binary encoded variable length fields have a new format that avoids
557     //    an inefficient where data that was a multiple of 8 bytes in length
558     //    had an extra 9 bytes of encoded data.
559     PRIMARY_FORMAT_VERSION_UPDATE2 = 12,
560     // This change includes support for TTL
561     //  - This means that when TTL is specified for the table an 8-byte TTL
562     //    field is prepended in front of each value.
563     PRIMARY_FORMAT_VERSION_TTL = 13,
564     PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL,
565 
566     SECONDARY_FORMAT_VERSION_INITIAL = 10,
567     // This change the SK format to include unpack_info.
568     SECONDARY_FORMAT_VERSION_UPDATE1 = 11,
569     // This change includes:
570     //  - Binary encoded variable length fields have a new format that avoids
571     //    an inefficient where data that was a multiple of 8 bytes in length
572     //    had an extra 9 bytes of encoded data.
573     SECONDARY_FORMAT_VERSION_UPDATE2 = 12,
574     // This change includes support for TTL
575     //  - This means that when TTL is specified for the table an 8-byte TTL
576     //    field is prepended in front of each value.
577     SECONDARY_FORMAT_VERSION_TTL = 13,
578     SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL,
579     // This change includes support for covering SK lookups for varchars.  A
580     // 2-byte bitmap is added after the tag-byte to unpack_info only for
581     // records which have covered varchar columns. Currently waiting before
582     // enabling in prod.
583     SECONDARY_FORMAT_VERSION_UPDATE3 = 65535,
584   };
585 
586   void setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def);
587 
588   static uint extract_ttl_duration(const TABLE *const table_arg,
589                                    const Rdb_tbl_def *const tbl_def_arg,
590                                    uint64 *ttl_duration);
591   static uint extract_ttl_col(const TABLE *const table_arg,
592                               const Rdb_tbl_def *const tbl_def_arg,
593                               std::string *ttl_column, uint *ttl_field_index,
594                               bool skip_checks = false);
has_ttl()595   inline bool has_ttl() const { return m_ttl_duration > 0; }
596 
597   static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
598   static uint32 calculate_index_flag_offset(uint32 index_flags,
599                                             enum INDEX_FLAG flag,
600                                             uint *const field_length = nullptr);
601   void write_index_flag_field(Rdb_string_writer *const buf,
602                               const uchar *const val,
603                               enum INDEX_FLAG flag) const;
604 
605   static const std::string gen_qualifier_for_table(
606       const char *const qualifier, const std::string &partition_name = "");
607   static const std::string gen_cf_name_qualifier_for_partition(
608       const std::string &s);
609   static const std::string gen_ttl_duration_qualifier_for_partition(
610       const std::string &s);
611   static const std::string gen_ttl_col_qualifier_for_partition(
612       const std::string &s);
613 
614   static const std::string parse_comment_for_qualifier(
615       const std::string &comment, const TABLE *const table_arg,
616       const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
617       const char *const qualifier);
618 
get_cf()619   rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle.get(); }
get_shared_cf()620   std::shared_ptr<rocksdb::ColumnFamilyHandle> get_shared_cf() const {
621     return m_cf_handle;
622   }
623 
624   /* Check if keypart #kp can be unpacked from index tuple */
625   inline bool can_unpack(const uint kp) const;
626   /* Check if keypart #kp needs unpack info */
627   inline bool has_unpack_info(const uint kp) const;
628 
629   /* Check if given table has a primary key */
630   static bool table_has_hidden_pk(const TABLE *const table);
631 
632   void report_checksum_mismatch(const bool is_key, const char *const data,
633                                 const size_t data_size) const;
634 
635   /* Check if index is at least pk_min if it is a PK,
636     or at least sk_min if SK.*/
637   bool index_format_min_check(const int pk_min, const int sk_min) const;
638 
639   static void pack_tiny(Rdb_field_packing *const fpi, Field *const field,
640                         uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
641                         Rdb_pack_field_context *const pack_ctx
642                             MY_ATTRIBUTE((__unused__)));
643 
644   static void pack_short(Rdb_field_packing *const fpi, Field *const field,
645                          uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
646                          Rdb_pack_field_context *const pack_ctx
647                              MY_ATTRIBUTE((__unused__)));
648 
649   static void pack_medium(Rdb_field_packing *const fpi, Field *const field,
650                           uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
651                           Rdb_pack_field_context *const pack_ctx
652                               MY_ATTRIBUTE((__unused__)));
653 
654   static void pack_long(Rdb_field_packing *const fpi, Field *const field,
655                         uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
656                         Rdb_pack_field_context *const pack_ctx
657                             MY_ATTRIBUTE((__unused__)));
658 
659   static void pack_longlong(Rdb_field_packing *const fpi, Field *const field,
660                             uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
661                             Rdb_pack_field_context *const pack_ctx
662                                 MY_ATTRIBUTE((__unused__)));
663 
664   static void pack_double(Rdb_field_packing *const fpi, Field *const field,
665                           uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
666                           Rdb_pack_field_context *const pack_ctx
667                               MY_ATTRIBUTE((__unused__)));
668 
669   static void pack_float(Rdb_field_packing *const fpi, Field *const field,
670                          uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
671                          Rdb_pack_field_context *const pack_ctx
672                              MY_ATTRIBUTE((__unused__)));
673 
674   static void pack_new_decimal(
675       Rdb_field_packing *const fpi, Field *const field,
676       uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
677       Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
678 
679   static void pack_datetime2(Rdb_field_packing *const fpi, Field *const field,
680                              uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
681                              Rdb_pack_field_context *const pack_ctx
682                                  MY_ATTRIBUTE((__unused__)));
683 
684   static void pack_timestamp2(
685       Rdb_field_packing *const fpi, Field *const field,
686       uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
687       Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
688 
689   static void pack_time2(Rdb_field_packing *const fpi, Field *const field,
690                          uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
691                          Rdb_pack_field_context *const pack_ctx
692                              MY_ATTRIBUTE((__unused__)));
693 
694   static void pack_year(Rdb_field_packing *const fpi, Field *const field,
695                         uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
696                         Rdb_pack_field_context *const pack_ctx
697                             MY_ATTRIBUTE((__unused__)));
698 
699   static void pack_newdate(Rdb_field_packing *const fpi, Field *const field,
700                            uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
701                            Rdb_pack_field_context *const pack_ctx
702                                MY_ATTRIBUTE((__unused__)));
703 
704   static void pack_blob(Rdb_field_packing *const fpi, Field *const field,
705                         uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
706                         Rdb_pack_field_context *const pack_ctx
707                             MY_ATTRIBUTE((__unused__)));
708 
709   static void pack_with_make_sort_key(
710       Rdb_field_packing *const fpi, Field *const field,
711       uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
712       Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
713 
714   static void pack_with_varchar_encoding(
715       Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
716       Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
717 
718   static void pack_with_varchar_space_pad(
719       Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
720       Rdb_pack_field_context *const pack_ctx);
721 
  /*
    Per-type field unpackers. Each takes the field-packing descriptor (fpi),
    a destination pointer, a reader (presumably positioned on the
    mem-comparable key image) and unp_reader (presumably positioned on the
    unpack_info data), and returns an int status whose values are defined by
    the implementation (not visible in this header).
    A parameter tagged MY_ATTRIBUTE((__unused__)) is present only to keep the
    common parameter list.

    Notes on what each function handles are inferred from its name only --
    confirm against the definitions.
  */

  /* Integer unpacker; `length` is a compile-time template parameter. */
  template <int length>
  static int unpack_integer(Rdb_field_packing *const fpi, uchar *const to,
                            Rdb_string_reader *const reader,
                            Rdb_string_reader *const unp_reader
                                MY_ATTRIBUTE((__unused__)));

  /* Presumably unpacks DOUBLE fields; neither fpi nor unp_reader is used. */
  static int unpack_double(
      Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
      uchar *const field_ptr, Rdb_string_reader *const reader,
      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));

  /* Presumably unpacks FLOAT fields. */
  static int unpack_float(Rdb_field_packing *const fpi, uchar *const field_ptr,
                          Rdb_string_reader *const reader,
                          Rdb_string_reader *const unp_reader
                              MY_ATTRIBUTE((__unused__)));

  /* Presumably unpacks fixed-length binary strings. */
  static int unpack_binary_str(Rdb_field_packing *const fpi, uchar *const to,
                               Rdb_string_reader *const reader,
                               Rdb_string_reader *const unp_reader
                                   MY_ATTRIBUTE((__unused__)));

  /* Presumably unpacks binary/utf8 VARCHARs without needing unpack_info. */
  static int unpack_binary_or_utf8_varchar(
      Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));

  /*
    Space-padded counterpart of the function above; here unp_reader is not
    tagged unused.
  */
  static int unpack_binary_or_utf8_varchar_space_pad(
      Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
      Rdb_string_reader *const unp_reader);

  /* Presumably unpacks NEWDATE fields. */
  static int unpack_newdate(
      Rdb_field_packing *const fpi, uchar *const field_ptr,
      Rdb_string_reader *const reader,
      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));

  /* Presumably unpacks fixed-length utf8 strings. */
  static int unpack_utf8_str(
      Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));

  /*
    The four unpackers below take unp_reader without the unused tag, i.e.
    they are expected to consume unpack_info.
  */
  static int unpack_unknown_varchar(Rdb_field_packing *const fpi, uchar *dst,
                                    Rdb_string_reader *const reader,
                                    Rdb_string_reader *const unp_reader);

  static int unpack_simple_varchar_space_pad(
      Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
      Rdb_string_reader *const unp_reader);

  static int unpack_simple(Rdb_field_packing *const fpi, uchar *const dst,
                           Rdb_string_reader *const reader,
                           Rdb_string_reader *const unp_reader);

  static int unpack_unknown(Rdb_field_packing *const fpi, uchar *const dst,
                            Rdb_string_reader *const reader,
                            Rdb_string_reader *const unp_reader);

  /*
    Helper with a different parameter list from the unpackers above: it takes
    no fpi/unp_reader but is parameterized by the value size, exp_digit, two
    byte patterns (zero_pattern, zero_val) and a byte-swapping callback.
    Presumably shared by unpack_float()/unpack_double() -- TODO confirm.
  */
  static int unpack_floating_point(uchar *const dst,
                                   Rdb_string_reader *const reader,
                                   const size_t size, const int exp_digit,
                                   const uchar *const zero_pattern,
                                   const uchar *const zero_val,
                                   void (*swap_func)(uchar *, const uchar *));
782 
  /*
    Producers of unpack_info. Each takes a collation codec, the Field and the
    pack context. A parameter tagged MY_ATTRIBUTE((__unused__)) is present
    only to keep the common parameter list. Definitions are not in this
    header; which field kinds each one serves is implied by its name only.
  */
  static void make_unpack_simple_varchar(
      const Rdb_collation_codec *const codec, const Field *const field,
      Rdb_pack_field_context *const pack_ctx);

  static void make_unpack_simple(const Rdb_collation_codec *const codec,
                                 const Field *const field,
                                 Rdb_pack_field_context *const pack_ctx);

  /* The two "unknown" variants do not use the codec. */
  static void make_unpack_unknown(
      const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
      const Field *const field, Rdb_pack_field_context *const pack_ctx);

  static void make_unpack_unknown_varchar(
      const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)),
      const Field *const field, Rdb_pack_field_context *const pack_ctx);

  /*
    All three parameters are tagged unused, so this is expected to be a
    no-op placeholder (as the name suggests).
  */
  static void dummy_make_unpack_info(
      const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
      const Field *field MY_ATTRIBUTE((__unused__)),
      Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__)));
803 
  /*
    Skip functions: each takes the field-packing descriptor and a reader and
    returns an int status defined by the implementation (not visible here).
    Presumably they advance `reader` past one field's mem-comparable image
    without decoding it; the three variants are named after the encodings
    they handle (fixed/max length, variable length, variable length with
    space padding) -- confirm against the definitions.
  */
  static int skip_max_length(const Rdb_field_packing *const fpi,
                             Rdb_string_reader *const reader);

  static int skip_variable_length(const Rdb_field_packing *const fpi,
                                  Rdb_string_reader *const reader);

  static int skip_variable_space_pad(const Rdb_field_packing *const fpi,
                                     Rdb_string_reader *const reader);
812 
use_legacy_varbinary_format()813   inline bool use_legacy_varbinary_format() const {
814     return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2,
815                                    SECONDARY_FORMAT_VERSION_UPDATE2);
816   }
817 
is_unpack_data_tag(char c)818   static inline bool is_unpack_data_tag(char c) {
819     return c == RDB_UNPACK_DATA_TAG || c == RDB_UNPACK_COVERED_DATA_TAG;
820   }
821 
822  private:
823 #ifndef NDEBUG
is_storage_available(const int offset,const int needed)824   inline bool is_storage_available(const int offset, const int needed) const {
825     const int storage_length = static_cast<int>(max_storage_fmt_length());
826     return (storage_length - offset) >= needed;
827   }
828 #endif  // NDEBUG
829 
  /* Global number of this index (used as prefix in StorageFormat) */
  const uint32 m_index_number;

  /*
    INDEX_NUMBER_SIZE-byte buffer; presumably m_index_number already converted
    to its on-disk (storage) byte order so it can be copied as a key prefix --
    TODO confirm where it is filled in.
  */
  uchar m_index_number_storage_form[INDEX_NUMBER_SIZE];

  /*
    Shared handle to a RocksDB column family; presumably the column family
    this index is stored in.
  */
  std::shared_ptr<rocksdb::ColumnFamilyHandle> m_cf_handle;
836 
  /*
    Helpers for the two variable-length encodings ("legacy" and current; see
    use_legacy_varbinary_format()). Definitions are not in this header.

    pack_*: take a source byte range (src, src_len) and a pointer-to-pointer
    destination (dst).
  */
  static void pack_legacy_variable_format(const uchar *src, size_t src_len,
                                          uchar **dst);

  static void pack_variable_format(const uchar *src, size_t src_len,
                                   uchar **dst);

  /*
    calc_unpack_*: take a flag byte, return a uint and report through *done.
    Presumably the return value is the number of payload bytes in the current
    segment and *done tells whether it is the last segment -- TODO confirm.
  */
  static uint calc_unpack_legacy_variable_format(uchar flag, bool *done);

  static uint calc_unpack_variable_format(uchar flag, bool *done);
846 
847  public:
  /*
    Presumably the "version" field of this index's INDEX_INFO data dictionary
    record -- TODO confirm.
  */
  uint16_t m_index_dict_version;
  /* Index type code (one byte); the set of valid values is defined elsewhere */
  uchar m_index_type;
  /* KV format version for the index id */
  uint16_t m_kv_format_version;
  /* If true, the column family stores data in the reverse order */
  bool m_is_reverse_cf;

  /* If true, then column family is created per partition. */
  bool m_is_per_partition_cf;

  /* Presumably the name of the index. */
  std::string m_name;
  /* Index statistics; `mutable` so they can be updated via a const object. */
  mutable Rdb_index_stats m_stats;

  /*
    Bitmap containing information about whether TTL or other special fields
    are enabled for the given index.
  */
  uint32 m_index_flags_bitmap;

  /*
    How much space in bytes the index flag fields occupy.
  */
  uint32 m_total_index_flags_length;

  /*
    Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL)
  */
  uint32 m_ttl_rec_offset;

  /* Default TTL duration */
  uint64 m_ttl_duration;

  /* TTL column (if defined by user, otherwise implicit TTL is used) */
  std::string m_ttl_column;
882 
883  private:
  /* Number of key parts in the primary key */
  uint m_pk_key_parts;

  /*
     pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the
     primary key.  Y==-1 means this column is not present in the primary key.
     NOTE(review): the array element type is unsigned, so a "-1" entry reads
     back as UINT_MAX; compare against the same sentinel the implementation
     stores.
  */
  uint *m_pk_part_no;

  /* Array of index-part descriptors. */
  Rdb_field_packing *m_pack_info;

  uint m_keyno; /* number of this index in the table */

  /*
    Number of key parts in the index (including "index extension"). This is how
    many elements are in the m_pack_info array.
  */
  uint m_key_parts;

  /*
    If TTL column is part of the PK, offset of the column within pk.
    Default is UINT_MAX to denote that TTL col is not part of PK.
  */
  uint m_ttl_pk_key_part_offset;

  /*
    Index of the TTL column in table->s->fields, if it exists.
    Default is UINT_MAX to denote that it does not exist.
  */
  uint m_ttl_field_index;

  /* Prefix extractor for the column family of the key definition */
  std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;

  /* Maximum length of the mem-comparable form. */
  uint m_maxlength;

  /* mutex to protect setup */
  mysql_mutex_t m_mutex;
924 };
925 
926 // "Simple" collations (those specified in strings/ctype-simple.c) are simple
927 // because their strnxfrm function maps one byte to one byte. However, the
928 // mapping is not injective, so the inverse function will take in an extra
929 // index parameter containing information to disambiguate what the original
930 // character was.
931 //
932 // The m_enc* members are for encoding. Generally, we want encoding to be:
933 //      src -> (dst, idx)
934 //
935 // Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us
936 // idx.
937 //
938 // For the inverse, we have:
939 //      (dst, idx) -> src
940 //
941 // We have m_dec_idx[idx][dst] = src to get our original character back.
942 //
struct Rdb_collation_codec {
  // Character set / collation descriptor; presumably the collation this codec
  // was built for.
  const my_core::CHARSET_INFO *m_cs;
  // The first element unpacks VARCHAR(n), the second one - CHAR(n).
  std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func;
  std::array<rdb_index_field_unpack_t, 2> m_unpack_func;

  // Encoding side, indexed by the source byte: m_enc_idx[src] is the
  // disambiguation index `idx`. m_enc_size[src] presumably tells how much
  // space that index needs in unpack_info -- TODO confirm units.
  std::array<uchar, 256> m_enc_idx;
  std::array<uchar, 256> m_enc_size;

  // Decoding side: m_dec_idx[idx][dst] is the original source byte.
  // m_dec_size is indexed by a byte value as well (256 entries); presumably
  // the decode-side counterpart of m_enc_size -- TODO confirm.
  std::array<uchar, 256> m_dec_size;
  std::vector<std::array<uchar, 256>> m_dec_idx;
};
955 
// Globals defined in another translation unit.
// Presumably rdb_collation_data_mutex guards rdb_collation_data and
// rdb_mem_cmp_space_mutex guards the cached mem-comparable space images --
// confirm against the definitions.
extern mysql_mutex_t rdb_collation_data_mutex;
extern mysql_mutex_t rdb_mem_cmp_space_mutex;
// Table of collation codecs with MY_ALL_CHARSETS_SIZE slots; presumably
// indexed by charset number, with nullptr for codecs not built yet.
extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
    rdb_collation_data;
960 
/*
  Descriptor of one key part of an index: cached properties of the underlying
  Field plus the function pointers used to pack, unpack and skip that key
  part. Rdb_key_def keeps an array of these (m_pack_info).

  Copy-constructible but not copy-assignable.
*/
class Rdb_field_packing {
 public:
  Rdb_field_packing(const Rdb_field_packing &);
  Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
  Rdb_field_packing();

  /* Length of mem-comparable image of the field, in bytes */
  int m_max_image_len;

  /* Length of image in the unpack data */
  int m_unpack_data_len;
  /* Presumably the offset of that image within the unpack data */
  int m_unpack_data_offset;

  /*
    Cached field information for faster access
  */
  bool m_field_maybe_null; /* TRUE <=> NULL-byte is stored */
  bool m_field_unsigned_flag;
  enum_field_types m_field_real_type;
  uchar m_field_null_bit_mask;
  uint m_field_pack_length;
  uint m_field_null_offset;
  my_ptrdiff_t m_field_offset;
  const CHARSET_INFO *m_field_charset;

  /*
    Valid only for VARCHAR fields.
  */
  uint m_varchar_length_bytes;
  uint m_varchar_char_length;
  bool m_use_legacy_varbinary_format;

  // (Valid when Variable Length Space Padded Encoding is used):
  uint m_segment_size;  // size of segment used

  // number of bytes used to store number of trimmed (or added)
  // spaces in the unpack_info
  bool m_unpack_info_uses_two_bytes;

  /*
    True implies that an index-only read is always possible for this field.
    False means an index-only read may be possible depending on the record and
    field type.
  */
  bool m_covered;

  /*
    Presumably the mem-comparable image of the space character for this
    field's collation, with its length (space_xfrm_len) and the length of the
    space character itself in bytes (space_mb_len) -- TODO confirm.
  */
  const std::vector<uchar> *space_xfrm;
  size_t space_xfrm_len;
  size_t space_mb_len;

  /* Collation codec used by this key part (see Rdb_collation_codec) */
  const Rdb_collation_codec *m_charset_codec;

  /*
    @return TRUE: this field makes use of unpack_info.
  */
  bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }

  /* TRUE means unpack_info stores the original field value */
  bool m_unpack_info_stores_value;

  /* Function that produces the mem-comparable form of the field */
  rdb_index_field_pack_t m_pack_func;
  /* Function that produces unpack_info; nullptr if unpack_info is not used */
  rdb_make_unpack_info_t m_make_unpack_info_func;

  /*
    This function takes
    - mem-comparable form
    - unpack_info data
    and restores the original value.
  */
  rdb_index_field_unpack_t m_unpack_func;

  /*
    This function skips over mem-comparable form.
  */
  rdb_index_field_skip_t m_skip_func;

 private:
  /*
    Location of the field in the table (key number and key part number).

    Note that this describes not the field, but rather a position of field in
    the index. Consider an example:

      col1 VARCHAR (100),
      INDEX idx1 (col1),
      INDEX idx2 (col1(10)),

    Here, idx2 has a special Field object that is set to describe a 10-char
    prefix of col1.

    We must also store the keynr. It is needed for implicit "extended keys".
    Every key in MyRocks needs to include PK columns.  Generally, SQL layer
    includes PK columns as part of its "Extended Keys" feature, but sometimes
    it does not (known examples are unique secondary indexes and partitioned
    tables).
    In that case, MyRocks's index descriptor has invisible suffix of PK
    columns (and the point is that these columns are parts of PK, not parts
    of the current index).
  */
  uint m_keynr;
  uint m_key_part;

 public:
  /*
    Fills in this descriptor for one key part. Defined elsewhere; the meaning
    of the bool result is not visible from this header.
  */
  bool setup(const Rdb_key_def *const key_descr, const Field *const field,
             const uint keynr_arg, const uint key_part_arg,
             const uint16 key_length);
  /* Returns the Field of `tbl` that this key part refers to */
  Field *get_field_in_table(const TABLE *const tbl) const;
  /* Writes hidden_pk_id through *dst; presumably in mem-comparable form */
  void fill_hidden_pk_val(uchar **dst, const longlong hidden_pk_id) const;
};
1070 
1071 /*
1072   Descriptor telling how to decode/encode a field to on-disk record storage
1073   format. Not all information is in the structure yet, but eventually we
1074   want to have as much as possible there to avoid virtual calls.
1075 
1076   For encoding/decoding of index tuples, see Rdb_key_def.
1077   */
1078 class Rdb_field_encoder {
1079  public:
1080   Rdb_field_encoder(const Rdb_field_encoder &) = delete;
1081   Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete;
1082   /*
1083     STORE_NONE is set when a column can be decoded solely from their
1084     mem-comparable form.
1085     STORE_SOME is set when a column can be decoded from their mem-comparable
1086     form plus unpack_info.
1087     STORE_ALL is set when a column cannot be decoded, so its original value
1088     must be stored in the PK records.
1089     */
1090   enum STORAGE_TYPE {
1091     STORE_NONE,
1092     STORE_SOME,
1093     STORE_ALL,
1094   };
1095   STORAGE_TYPE m_storage_type;
1096 
1097   uint m_null_offset;
1098   uchar m_null_mask;  // 0 means the field cannot be null
1099 
1100   /*
1101     Cached field information
1102   */
1103   my_core::enum_field_types m_field_type;
1104   uchar m_field_null_mask;
1105   uint16 m_field_index;
1106   uint m_field_pack_length;
1107   uint m_field_length_bytes;
1108   uint m_field_length;
1109   my_ptrdiff_t m_field_null_offset;
1110   my_ptrdiff_t m_field_offset;
1111 
maybe_null()1112   bool maybe_null() const { return m_null_mask != 0; }
1113 
uses_variable_len_encoding()1114   bool uses_variable_len_encoding() const {
1115     return (m_field_type == MYSQL_TYPE_BLOB ||
1116             m_field_type == MYSQL_TYPE_VARCHAR ||
1117             m_field_type == MYSQL_TYPE_JSON);
1118   }
1119 };
1120 
get_table_field_for_part_no(TABLE * table,uint part_no)1121 inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table,
1122                                                        uint part_no) const {
1123   assert(part_no < get_key_parts());
1124   return m_pack_info[part_no].get_field_in_table(table);
1125 }
1126 
can_unpack(const uint kp)1127 inline bool Rdb_key_def::can_unpack(const uint kp) const {
1128   assert(kp < m_key_parts);
1129   return (m_pack_info[kp].m_unpack_func != nullptr);
1130 }
1131 
has_unpack_info(const uint kp)1132 inline bool Rdb_key_def::has_unpack_info(const uint kp) const {
1133   assert(kp < m_key_parts);
1134   return m_pack_info[kp].uses_unpack_info();
1135 }
1136 
1137 /*
1138   A table definition. This is an entry in the mapping
1139 
1140     dbname.tablename -> {index_nr, index_nr, ... }
1141 
1142   There is only one Rdb_tbl_def object for a given table.
1143   That's why we keep auto_increment value here, too.
1144 */
1145 
1146 class Rdb_tbl_def {
1147  private:
1148   void check_if_is_mysql_system_table();
1149 
1150   /* Stores 'dbname.tablename' */
1151   std::string m_dbname_tablename;
1152 
1153   /* Store the db name, table name, and partition name */
1154   std::string m_dbname;
1155   std::string m_tablename;
1156   std::string m_partition;
1157 
1158   void set_name(const std::string &name);
1159 
1160  public:
1161   Rdb_tbl_def(const Rdb_tbl_def &) = delete;
1162   Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete;
1163 
Rdb_tbl_def(const std::string & name)1164   explicit Rdb_tbl_def(const std::string &name)
1165       : m_key_descr_arr(nullptr),
1166         m_hidden_pk_val(0),
1167         m_auto_incr_val(0),
1168         m_tbl_stats(),
1169         m_update_time(0),
1170         m_create_time(CREATE_TIME_UNKNOWN) {
1171     set_name(name);
1172   }
1173 
Rdb_tbl_def(const char * const name,const size_t len)1174   Rdb_tbl_def(const char *const name, const size_t len)
1175       : m_key_descr_arr(nullptr),
1176         m_hidden_pk_val(0),
1177         m_auto_incr_val(0),
1178         m_tbl_stats(),
1179         m_update_time(0),
1180         m_create_time(CREATE_TIME_UNKNOWN) {
1181     set_name(std::string(name, len));
1182   }
1183 
1184   explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t pos = 0)
m_key_descr_arr(nullptr)1185       : m_key_descr_arr(nullptr),
1186         m_hidden_pk_val(0),
1187         m_auto_incr_val(0),
1188         m_tbl_stats(),
1189         m_update_time(0),
1190         m_create_time(CREATE_TIME_UNKNOWN) {
1191     set_name(std::string(slice.data() + pos, slice.size() - pos));
1192   }
1193 
1194   ~Rdb_tbl_def();
1195 
1196   void check_and_set_read_free_rpl_table();
1197 
1198   /* Number of indexes */
1199   uint m_key_count;
1200 
1201   /* Array of index descriptors */
1202   std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
1203 
1204   std::atomic<longlong> m_hidden_pk_val;
1205   std::atomic<ulonglong> m_auto_incr_val;
1206 
1207   /* Is this a system table */
1208   bool m_is_mysql_system_table;
1209 
1210   /* Is this table read free repl enabled */
1211   std::atomic_bool m_is_read_free_rpl_table{false};
1212 
1213   Rdb_table_stats m_tbl_stats;
1214 
1215   bool put_dict(Rdb_dict_manager *const dict, Rdb_cf_manager *const cf_manager,
1216                 rocksdb::WriteBatch *const batch, const rocksdb::Slice &key);
1217 
full_tablename()1218   const std::string &full_tablename() const { return m_dbname_tablename; }
base_dbname()1219   const std::string &base_dbname() const { return m_dbname; }
base_tablename()1220   const std::string &base_tablename() const { return m_tablename; }
base_partition()1221   const std::string &base_partition() const { return m_partition; }
1222   GL_INDEX_ID get_autoincr_gl_index_id();
1223 
1224   time_t get_create_time();
1225   std::atomic<time_t> m_update_time;  // in-memory only value
1226  private:
1227   const time_t CREATE_TIME_UNKNOWN = 1;
1228   // CREATE_TIME_UNKNOWN means "didn't try to read, yet"
1229   // 0 means "no data available"
1230   std::atomic<time_t> m_create_time;
1231 };
1232 
1233 /*
1234   A thread-safe sequential number generator. Its performance is not a concern
1235   hence it is ok to protect it by a mutex.
1236 */
1237 
1238 class Rdb_seq_generator {
1239   uint m_next_number = 0;
1240 
1241   mysql_mutex_t m_mutex;
1242 
1243  public:
1244   Rdb_seq_generator(const Rdb_seq_generator &) = delete;
1245   Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete;
1246   Rdb_seq_generator() = default;
1247 
init(const uint initial_number)1248   void init(const uint initial_number) {
1249     mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
1250     m_next_number = initial_number;
1251   }
1252 
1253   uint get_and_update_next_number(Rdb_dict_manager *const dict);
1254 
cleanup()1255   void cleanup() { mysql_mutex_destroy(&m_mutex); }
1256 };
1257 
1258 interface Rdb_tables_scanner {
1259   virtual int add_table(Rdb_tbl_def * tdef) = 0;
1260 };
1261 
1262 /*
1263   This contains a mapping of
1264 
1265      dbname.table_name -> array{Rdb_key_def}.
1266 
1267   objects are shared among all threads.
1268 */
1269 
/*
  Keeps the in-memory table-definition map (table name -> Rdb_tbl_def) and
  the index-id lookup tables. Not copyable.
*/
class Rdb_ddl_manager {
  /* Set to nullptr here; presumably assigned by init(). */
  Rdb_dict_manager *m_dict = nullptr;
  Rdb_cf_manager *m_cf_manager = nullptr;

  // Contains Rdb_tbl_def elements
  std::unordered_map<std::string, Rdb_tbl_def *> m_ddl_map;

  // Maps index id to <table_name, index number>
  std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;

  // Maps index id to key definitions not yet committed to data dictionary.
  // This is mainly used to store key definitions during ALTER TABLE.
  std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
      m_index_num_to_uncommitted_keydef;
  // Presumably guards the maps above -- confirm against the definitions.
  mysql_rwlock_t m_rwlock;

  // Source of the numbers returned by get_and_update_next_number()
  Rdb_seq_generator m_sequence;
  // A queue of table stats to write into data dictionary
  // It is produced by event listener (ie compaction and flush threads)
  // and consumed by the rocksdb background thread
  std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;

  // Private lookup by index id. Returns a reference to a shared_ptr; the
  // lifetime of the referenced shared_ptr is decided by the definition.
  const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);

 public:
  Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
  Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
  Rdb_ddl_manager() {}

  /* Load the data dictionary from on-disk storage */
#if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
  bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
            const uint32_t validate_tables);
#else
  bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager);
#endif  // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
        // ROCKSDB_INCLUDE_VALIDATE_TABLES

  void cleanup();

  /*
    Lookups. find() takes a `lock` flag (default true); presumably false means
    the caller already holds the lock -- confirm against the definition.
  */
  Rdb_tbl_def *find(const std::string &table_name, const bool lock = true);
  int find_indexes(const std::string &table_name,
                   std::vector<GL_INDEX_ID> *indexes);
  int find_table_stats(const std::string &table_name,
                       Rdb_table_stats *tbl_stats);
  std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);

  /* Statistics maintenance */
  void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
  void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
                    const std::vector<Rdb_index_stats> &deleted_data =
                        std::vector<Rdb_index_stats>());
  void persist_stats(const bool sync = false);

  void set_table_stats(const std::string &tbl_name);

  /* Modify the mapping and write it to on-disk storage */
  int put_and_write(Rdb_tbl_def *const key_descr,
                    rocksdb::WriteBatch *const batch);
  void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
              const bool lock = true);
  bool rename(const std::string &from, const std::string &to,
              rocksdb::WriteBatch *const batch);

  /* Forwards to the internal sequence generator (m_sequence). */
  uint get_and_update_next_number(Rdb_dict_manager *const dict) {
    return m_sequence.get_and_update_next_number(dict);
  }

  const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);

  /* Walk the data dictionary */
  int scan_for_tables(Rdb_tables_scanner *tables_scanner);

  void erase_index_num(const GL_INDEX_ID &gl_index_id);
  void add_uncommitted_keydefs(
      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
  void remove_uncommitted_keydefs(
      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
  int find_in_uncommitted_keydef(const uint32_t &cf_id);

 private:
  /* Put the data into in-memory table (only) */
  int put(Rdb_tbl_def *const key_descr, const bool lock = true);

  /* Helper functions to be passed to my_core::HASH object */
  static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length,
                                   my_bool not_used MY_ATTRIBUTE((unused)));
  static void free_hash_elem(void *const data);

#if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
  /* Validation helpers, compiled only when table validation is enabled. */
  bool validate_schemas();

  bool validate_auto_incr();
#endif  // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
        // ROCKSDB_INCLUDE_VALIDATE_TABLES
};
1364 
1365 /*
   Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its
  internal data dictionary.
   MyRocks stores the data dictionary in a dedicated system column family
  named __system__. The system column family is used only by MyRocks
  internally and is not used by applications.
1371 
1372    Currently MyRocks has the following data dictionary data models.
1373 
1374   1. Table Name => internal index id mappings
1375   key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename
1376   value: version, {cf_id, index_id}*n_indexes_of_the_table
1377   version is 2 bytes. cf_id and index_id are 4 bytes.
1378 
1379   2. internal cf_id, index id => index information
1380   key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id
1381   value: version, index_type, kv_format_version, index_flags, ttl_duration
1382   index_type is 1 byte, version and kv_format_version are 2 bytes.
1383   index_flags is 4 bytes.
1384   ttl_duration is 8 bytes.
1385 
1386   3. CF id => CF flags
1387   key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id
1388   value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf}
1389   cf_flags is 4 bytes in total.
1390 
1391   4. Binlog entry (updated at commit)
1392   key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4)
1393   value: version, {binlog_name,binlog_pos,binlog_gtid}
1394 
1395   5. Ongoing drop index entry
1396   key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id
1397   value: version
1398 
1399   6. index stats
1400   key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id
1401   value: version, {materialized PropertiesCollector::IndexStats}
1402 
1403   7. maximum index id
1404   key: Rdb_key_def::MAX_INDEX_ID(0x7)
1405   value: index_id
1406   index_id is 4 bytes
1407 
1408   8. Ongoing create index entry
1409   key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id
1410   value: version
1411 
1412   9. auto_increment values
1413   key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id
1414   value: version, {max auto_increment so far}
1415   max auto_increment is 8 bytes
1416 
1417   10. dropped cfs
1418   key: Rdb_key_def::DROPPED_CF(0xa) + cf_id
1419   value: version
1420 
  Data dictionary operations are atomic inside RocksDB. For example,
  when creating a table with two indexes, it is necessary to call Put
  three times, and the three Puts have to be applied atomically.
  Rdb_dict_manager provides the wrapper functions begin() and commit() to
  make such atomic operations easier.
1425 
1426 */
1427 class Rdb_dict_manager {
1428  private:
  /* Mutex taken by lock(), released by unlock(), destroyed by cleanup() */
  mysql_mutex_t m_mutex;
  /* Underlying RocksDB database; nullptr until set (presumably by init()) */
  rocksdb::TransactionDB *m_db = nullptr;
  /* Handle returned by get_system_cf(); nullptr until set */
  rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr;
  /* Utility to put INDEX_INFO and CF_DEFINITION */

  /*
    Zero-initialized buffer of INDEX_NUMBER_SIZE bytes and a slice;
    presumably the pre-built key (and a slice over it) of the MAX_INDEX_ID
    dictionary record -- TODO confirm.
  */
  uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
  rocksdb::Slice m_key_slice_max_index_id;
1436 
  /*
    Raw-buffer variant of dump_index_id(): writes into netbuf. Presumably it
    emits the same three fields as the Rdb_buf_writer overload (dict_type,
    cf_id, index_id); the caller must supply a buffer large enough for them.
    Defined elsewhere.
  */
  static void dump_index_id(uchar *const netbuf,
                            Rdb_key_def::DATA_DICT_TYPE dict_type,
                            const GL_INDEX_ID &gl_index_id);
1440   template <size_t T>
dump_index_id(Rdb_buf_writer<T> * buf_writer,Rdb_key_def::DATA_DICT_TYPE dict_type,const GL_INDEX_ID & gl_index_id)1441   static void dump_index_id(Rdb_buf_writer<T> *buf_writer,
1442                             Rdb_key_def::DATA_DICT_TYPE dict_type,
1443                             const GL_INDEX_ID &gl_index_id) {
1444     buf_writer->write_uint32(dict_type);
1445     buf_writer->write_uint32(gl_index_id.cf_id);
1446     buf_writer->write_uint32(gl_index_id.index_id);
1447   }
1448 
  /*
    Adds to `batch` the deletion of the dictionary entry identified by
    (dict_type, gl_index_id). Defined elsewhere; the exact key layout is
    presumably the one produced by dump_index_id().
  */
  void delete_with_prefix(rocksdb::WriteBatch *const batch,
                          Rdb_key_def::DATA_DICT_TYPE dict_type,
                          const GL_INDEX_ID &gl_index_id) const;
  /* Functions for fast DROP TABLE/INDEX */
  void resume_drop_indexes() const;
  /* Logging helpers; log_action is a caller-supplied description string. */
  void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
                            const uint32 n_keys,
                            const char *const log_action) const;
  void log_start_drop_index(GL_INDEX_ID gl_index_id,
                            const char *log_action) const;
1459 
1460  public:
  /* Not copyable; default construction leaves m_db/m_system_cfh as nullptr. */
  Rdb_dict_manager(const Rdb_dict_manager &) = delete;
  Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
  Rdb_dict_manager() = default;

  /*
    Sets up the manager on top of rdb_dict. Defined elsewhere; the meaning of
    the bool result is not visible from this header. Presumably this is also
    where m_mutex is initialized, since cleanup() destroys it -- TODO confirm.
  */
  bool init(rocksdb::TransactionDB *const rdb_dict,
            Rdb_cf_manager *const cf_manager,
            const my_bool enable_remove_orphaned_cf_flags);
1468 
cleanup()1469   inline void cleanup() { mysql_mutex_destroy(&m_mutex); }
1470 
lock()1471   inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }
1472 
unlock()1473   inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }
1474 
assert_lock_held()1475   inline void assert_lock_held() { mysql_mutex_assert_owner(&m_mutex); }
1476 
get_system_cf()1477   inline rocksdb::ColumnFamilyHandle *get_system_cf() const {
1478     return m_system_cfh;
1479   }
1480 
  /* Raw RocksDB operations */
  /* begin() hands the caller ownership of a new write batch */
  std::unique_ptr<rocksdb::WriteBatch> begin() const;
  /* Writes `batch`; `sync` defaults to true. Return codes defined elsewhere */
  int commit(rocksdb::WriteBatch *const batch, const bool sync = true) const;
  rocksdb::Status get_value(const rocksdb::Slice &key,
                            std::string *const value) const;
  void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
               const rocksdb::Slice &value) const;
  void delete_key(rocksdb::WriteBatchBase *batch,
                  const rocksdb::Slice &key) const;
  /*
    Returns a raw iterator pointer. NOTE(review): presumably the caller owns
    the iterator and must delete it -- confirm against the definition.
  */
  rocksdb::Iterator *new_iterator() const;
1491 
  /* Internal Index id => CF */
  /* The two mutators below queue their change in `batch` */
  void add_or_update_index_cf_mapping(
      rocksdb::WriteBatch *batch,
      struct Rdb_index_info *const index_info) const;
  void delete_index_info(rocksdb::WriteBatch *batch,
                         const GL_INDEX_ID &index_id) const;
  /*
    Reads the index information for gl_index_id into *index_info. The bool
    result presumably reports whether a usable entry was found -- TODO
    confirm.
  */
  bool get_index_info(const GL_INDEX_ID &gl_index_id,
                      struct Rdb_index_info *const index_info) const;
1500 
  /* CF id => CF flags */
  void add_cf_flags(rocksdb::WriteBatch *const batch, const uint cf_id,
                    const uint cf_flags) const;
  /* Reads the flags of cf_id into *cf_flags; result presumably "found" */
  bool get_cf_flags(const uint cf_id, uint *const cf_flags) const;

  /* Dropped-CF bookkeeping (DROPPED_CF dictionary entries, keyed by cf_id) */
  void add_dropped_cf(rocksdb::WriteBatch *const batch,
                      const uint &cf_id) const;
  void delete_dropped_cf(rocksdb::WriteBatch *const batch,
                         const uint &cf_id) const;
  bool get_dropped_cf(const uint &cf_id) const;
  void get_all_dropped_cfs(std::unordered_set<uint32> *dropped_cf_ids) const;

  /* Maintenance helpers; the int return codes are defined elsewhere */
  int add_missing_cf_flags(Rdb_cf_manager *const cf_manager) const;

  int remove_orphaned_dropped_cfs(
      Rdb_cf_manager *const cf_manager,
      const my_bool &enable_remove_orphaned_dropped_cfs) const;

  void delete_dropped_cf_and_flags(rocksdb::WriteBatch *const batch,
                                   const uint &cf_id) const;
1521 
  /*
    Functions for fast CREATE/DROP TABLE/INDEX

    The *_index_operation functions are parameterized by a
    Rdb_key_def::DATA_DICT_TYPE; the inline wrappers further down in this
    class call them with DDL_DROP_INDEX_ONGOING or DDL_CREATE_INDEX_ONGOING.
  */

  /* gl_index_ids is an out-parameter receiving the ids for dd_type. */
  void get_ongoing_index_operation(
      std::unordered_set<GL_INDEX_ID> *gl_index_ids,
      Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  /* Queries a single index id for the given dd_type. */
  bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
                                  Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  /* Takes the batch, the index id and the dd_type of the operation. */
  void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
                                     const GL_INDEX_ID &gl_index_id,
                                     Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  /* Counterpart of start_ongoing_index_operation() with the same inputs. */
  void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
                                   const GL_INDEX_ID &gl_index_id,
                                   Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  /* NOTE(review): presumably true when no drop-index entry is pending. */
  bool is_drop_index_empty() const;
  /* key_descr points at n_keys shared_ptr<Rdb_key_def> entries (per name). */
  void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
                      const uint32 n_keys,
                      rocksdb::WriteBatch *const batch) const;
  /* Takes the set of index ids and the batch to operate on. */
  void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
                      rocksdb::WriteBatch *const batch) const;
  /* Takes the set of index ids and the batch to operate on. */
  void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
                        rocksdb::WriteBatch *const batch) const;
  /* Takes no batch argument, unlike the add_* functions above. */
  void finish_indexes_operation(
      const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
      Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  /*
    Two overloads: one without arguments and one restricted to the given
    set of index ids.
  */
  void rollback_ongoing_index_creation() const;
  void rollback_ongoing_index_creation(
      const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const;
1548 
get_ongoing_drop_indexes(std::unordered_set<GL_INDEX_ID> * gl_index_ids)1549   inline void get_ongoing_drop_indexes(
1550       std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
1551     get_ongoing_index_operation(gl_index_ids,
1552                                 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1553   }
get_ongoing_create_indexes(std::unordered_set<GL_INDEX_ID> * gl_index_ids)1554   inline void get_ongoing_create_indexes(
1555       std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
1556     get_ongoing_index_operation(gl_index_ids,
1557                                 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1558   }
start_drop_index(rocksdb::WriteBatch * wb,const GL_INDEX_ID & gl_index_id)1559   inline void start_drop_index(rocksdb::WriteBatch *wb,
1560                                const GL_INDEX_ID &gl_index_id) const {
1561     start_ongoing_index_operation(wb, gl_index_id,
1562                                   Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1563   }
start_create_index(rocksdb::WriteBatch * wb,const GL_INDEX_ID & gl_index_id)1564   inline void start_create_index(rocksdb::WriteBatch *wb,
1565                                  const GL_INDEX_ID &gl_index_id) const {
1566     start_ongoing_index_operation(wb, gl_index_id,
1567                                   Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1568   }
finish_drop_indexes(const std::unordered_set<GL_INDEX_ID> & gl_index_ids)1569   inline void finish_drop_indexes(
1570       const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1571     finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1572   }
finish_create_indexes(const std::unordered_set<GL_INDEX_ID> & gl_index_ids)1573   inline void finish_create_indexes(
1574       const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1575     finish_indexes_operation(gl_index_ids,
1576                              Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1577   }
is_drop_index_ongoing(const GL_INDEX_ID & gl_index_id)1578   inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1579     return is_index_operation_ongoing(gl_index_id,
1580                                       Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1581   }
is_create_index_ongoing(const GL_INDEX_ID & gl_index_id)1582   inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1583     return is_index_operation_ongoing(gl_index_id,
1584                                       Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1585   }
1586 
  /*
    Max index id bookkeeping. index_id is an out-parameter for the getter
    and an input for the updater.
    NOTE(review): the bool results presumably signal success/failure, but
    which value means which is not visible here -- confirm.
  */
  bool get_max_index_id(uint32_t *const index_id) const;
  bool update_max_index_id(rocksdb::WriteBatch *const batch,
                           const uint32_t index_id) const;
  /* Takes a batch and a vector of Rdb_index_stats entries. */
  void add_stats(rocksdb::WriteBatch *const batch,
                 const std::vector<Rdb_index_stats> &stats) const;
  /* Returns the Rdb_index_stats for gl_index_id by value. */
  Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;

  /*
    Auto-increment values, keyed by gl_index_id. The setter takes a
    WriteBatchBase and an overwrite flag that defaults to false.
    NOTE(review): presumably the non-overwrite path goes through the
    Rdb_system_merge_op merge operator declared later in this header --
    confirm.
  */
  rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
                                    GL_INDEX_ID gl_index_id, ulonglong val,
                                    bool overwrite = false) const;
  /* new_val is an out-parameter. */
  bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
                         ulonglong *new_val) const;
1599 
1600  private:
  /*
    dropped cf flags

    Private helper taking a batch and the id of the column family.
    NOTE(review): presumably used by delete_dropped_cf_and_flags() --
    confirm.
  */
  void delete_cf_flags(rocksdb::WriteBatch *const batch,
                       const uint &cf_id) const;
1604 };
1605 
1606 struct Rdb_index_info {
1607   GL_INDEX_ID m_gl_index_id;
1608   uint16_t m_index_dict_version = 0;
1609   uchar m_index_type = 0;
1610   uint16_t m_kv_version = 0;
1611   uint32 m_index_flags = 0;
1612   uint64 m_ttl_duration = 0;
1613 };
1614 
/*
  Declared here, defined elsewhere.
  NOTE(review): judging by the name, reports whether the collation cs is
  supported -- confirm against the definition.
*/
bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
1616 
1617 /*
1618   @brief
1619   Merge Operator for the auto_increment value in the system_cf
1620 
1621   @detail
1622   This class implements the rocksdb Merge Operator for auto_increment values
1623   that are stored to the data dictionary every transaction.
1624 
1625   The actual Merge function is triggered on compaction, memtable flushes, or
1626   when get() is called on the same key.
1627 
1628  */
1629 class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator {
1630  public:
1631   /*
1632     Updates the new value associated with a key to be the maximum of the
1633     passed in value and the existing value.
1634 
1635     @param[IN]  key
1636     @param[IN]  existing_value  existing value for a key; nullptr if nonexistent
1637     key
1638     @param[IN]  value
1639     @param[OUT] new_value       new value after Merge
1640     @param[IN]  logger
1641   */
Merge(const rocksdb::Slice & key,const rocksdb::Slice * existing_value,const rocksdb::Slice & value,std::string * new_value,rocksdb::Logger * logger)1642   bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value,
1643              const rocksdb::Slice &value, std::string *new_value,
1644              rocksdb::Logger *logger) const override {
1645     assert(new_value != nullptr);
1646 
1647     if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 ||
1648         GetKeyType(key) != Rdb_key_def::AUTO_INC ||
1649         value.size() !=
1650             RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE ||
1651         GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1652       abort();
1653     }
1654 
1655     uint64_t merged_value = Deserialize(value);
1656 
1657     if (existing_value != nullptr) {
1658       if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1659                                         ROCKSDB_SIZEOF_AUTOINC_VALUE ||
1660           GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1661         abort();
1662       }
1663 
1664       merged_value = std::max(merged_value, Deserialize(*existing_value));
1665     }
1666     Serialize(merged_value, new_value);
1667     return true;
1668   }
1669 
Name()1670   virtual const char *Name() const override { return "Rdb_system_merge_op"; }
1671 
1672  private:
1673   /*
1674     Serializes the integer data to the new_value buffer or the target buffer
1675     the merge operator will update to
1676    */
Serialize(const uint64_t data,std::string * new_value)1677   void Serialize(const uint64_t data, std::string *new_value) const {
1678     uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1679                     ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0};
1680     uchar *ptr = value_buf;
1681     /* fill in the auto increment version */
1682     rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION);
1683     ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION;
1684     /* fill in the auto increment value */
1685     rdb_netbuf_store_uint64(ptr, data);
1686     ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE;
1687     new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf);
1688   }
1689 
1690   /*
1691     Gets the value of auto_increment type in the data dictionary from the
1692     value slice
1693 
1694     @Note Only to be used on data dictionary keys for the auto_increment type
1695    */
Deserialize(const rocksdb::Slice & s)1696   uint64_t Deserialize(const rocksdb::Slice &s) const {
1697     return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) +
1698                                 RDB_SIZEOF_AUTO_INCREMENT_VERSION);
1699   }
1700 
1701   /*
1702     Gets the type of the key of the key in the data dictionary.
1703 
1704     @Note Only to be used on data dictionary keys for the auto_increment type
1705    */
GetKeyType(const rocksdb::Slice & s)1706   uint16_t GetKeyType(const rocksdb::Slice &s) const {
1707     return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data()));
1708   }
1709 
1710   /*
1711     Gets the version of the auto_increment value in the data dictionary.
1712 
1713     @Note Only to be used on data dictionary value for the auto_increment type
1714    */
GetVersion(const rocksdb::Slice & s)1715   uint16_t GetVersion(const rocksdb::Slice &s) const {
1716     return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data()));
1717   }
1718 };
1719 
1720 }  // namespace myrocks
1721