1 /*
2 Copyright (c) 2012,2013 Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16 #pragma once
17
18 #define ROCKSDB_INCLUDE_VALIDATE_TABLES 1
19
20 /* C++ standard header files */
21 #include <algorithm>
22 #include <array>
23 #include <atomic>
24 #include <map>
25 #include <mutex>
26 #include <string>
27 #include <unordered_map>
28 #include <unordered_set>
29 #include <utility>
30 #include <vector>
31
32 /* C standard header files */
33 #include <arpa/inet.h>
34
35 /* MyRocks header files */
36 #include "./ha_rocksdb.h"
37 #include "./properties_collector.h"
38 #include "./rdb_buff.h"
39 #include "./rdb_utils.h"
40
41 namespace myrocks {
42
43 class Rdb_dict_manager;
44 class Rdb_key_def;
45 class Rdb_field_packing;
46 class Rdb_cf_manager;
47 class Rdb_ddl_manager;
48
49 class Rdb_convert_to_record_key_decoder {
50 public:
51 Rdb_convert_to_record_key_decoder() = default;
52 Rdb_convert_to_record_key_decoder(
53 const Rdb_convert_to_record_key_decoder &decoder) = delete;
54 Rdb_convert_to_record_key_decoder &operator=(
55 const Rdb_convert_to_record_key_decoder &decoder) = delete;
56 static int decode(uchar *const buf, Rdb_field_packing *fpi, TABLE *table,
57 bool has_unpack_info, Rdb_string_reader *reader,
58 Rdb_string_reader *unpack_reader);
59 static int skip(const Rdb_field_packing *fpi, const Field *field,
60 Rdb_string_reader *reader, Rdb_string_reader *unpack_reader);
61
62 private:
63 static int decode_field(Rdb_field_packing *fpi, TABLE *table, uchar *buf,
64 Rdb_string_reader *reader,
65 Rdb_string_reader *unpack_reader);
66 };
67
68 /*
69 @brief
70 Field packing context.
71 The idea is to ensure that a call to rdb_index_field_pack_t function
72 is followed by a call to rdb_make_unpack_info_t.
73
74 @detail
75 For some datatypes, unpack_info is produced as a side effect of
76 rdb_index_field_pack_t function call.
77 For other datatypes, packing is just calling make_sort_key(), while
78 rdb_make_unpack_info_t is a custom function.
79 In order to accommodate both cases, we require both calls to be made and
80 unpack_info is passed as context data between the two.
81 */
82 class Rdb_pack_field_context {
83 public:
84 Rdb_pack_field_context(const Rdb_pack_field_context &) = delete;
85 Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete;
86
Rdb_pack_field_context(Rdb_string_writer * const writer_arg)87 explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg)
88 : writer(writer_arg) {}
89
90 // NULL means we're not producing unpack_info.
91 Rdb_string_writer *writer;
92 };
93
94 class Rdb_key_field_iterator {
95 private:
96 TABLE *m_table;
97 Rdb_string_reader *m_reader;
98 Rdb_string_reader *m_unp_reader;
99 uint m_curr_bitmap_pos;
100 const MY_BITMAP *m_covered_bitmap;
101 uchar *m_buf;
102 bool m_has_unpack_info;
103 const Rdb_key_def *m_key_def;
104 bool m_secondary_key;
105 bool m_hidden_pk_exists;
106 bool m_is_hidden_pk;
107 bool m_is_null;
108 Rdb_field_packing *m_fpi;
109 Rdb_field_packing *m_fpi_end;
110
111 public:
112 Rdb_key_field_iterator(const Rdb_key_field_iterator &) = delete;
113 Rdb_key_field_iterator &operator=(const Rdb_key_field_iterator &) = delete;
114 Rdb_key_field_iterator(const Rdb_key_def *key_def,
115 Rdb_field_packing *pack_info,
116 Rdb_string_reader *reader,
117 Rdb_string_reader *unp_reader, TABLE *table,
118 bool has_unpack_info, const MY_BITMAP *covered_bitmap,
119 uchar *buf);
120
121 int next();
122 bool has_next();
123 };
124
125 struct Rdb_collation_codec;
126 struct Rdb_index_info;
127
128 /*
129 C-style "virtual table" allowing different handling of packing logic based
130 on the field type. See Rdb_field_packing::setup() implementation.
131 */
132 using rdb_make_unpack_info_t = void (*)(const Rdb_collation_codec *codec,
133 const Field *field,
134 Rdb_pack_field_context *pack_ctx);
135 using rdb_index_field_unpack_t = int (*)(Rdb_field_packing *fpi,
136 uchar *field_ptr,
137 Rdb_string_reader *reader,
138 Rdb_string_reader *unpack_reader);
139 using rdb_index_field_skip_t = int (*)(const Rdb_field_packing *fpi,
140 Rdb_string_reader *reader);
141 using rdb_index_field_pack_t = void (*)(Rdb_field_packing *fpi, Field *field,
142 uchar *buf, uchar **dst,
143 Rdb_pack_field_context *pack_ctx);
144
/* Marker for an invalid key length: every bit of a uint is set. */
constexpr uint RDB_INVALID_KEY_LEN = static_cast<uint>(-1);

/* Number of bytes a single checksum takes up inside the record. */
constexpr size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);

/*
  Total number of bytes the checksum data takes up inside the record:
  one tag byte followed by two checksums.
*/
constexpr size_t RDB_CHECKSUM_CHUNK_SIZE = 1 + 2 * RDB_CHECKSUM_SIZE;

/*
  The checksum data begins with this tag byte; two CRC32 checksums come
  right after it.
*/
constexpr char RDB_CHECKSUM_DATA_TAG = 0x01;

/*
  Unpack data has a variable length. Its header is one tag byte plus a
  two-byte length field, and the stored length counts the header too.
*/
constexpr char RDB_UNPACK_DATA_TAG = 0x02;
constexpr size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
constexpr size_t RDB_UNPACK_HEADER_SIZE =
    sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;

/*
  Header layout for covered unpack data: one tag byte, a two-byte length
  field and a two-byte covered bitmap. The stored length counts the header
  too.
*/
constexpr char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
constexpr size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
constexpr size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
constexpr size_t RDB_UNPACK_COVERED_HEADER_SIZE =
    sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
    RDB_COVERED_BITMAP_SIZE;
181
182 /*
183 Data dictionary index info field sizes.
184 */
185 const constexpr size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16);
186 const constexpr size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar);
187 const constexpr size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16);
188 const constexpr size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32);
189 const constexpr size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16);
190
// Result codes that rdb_index_field_unpack_t functions may return.
enum {
  UNPACK_SUCCESS = 0,
  UNPACK_FAILURE,  // == 1
};
196
197 /*
198 An object of this class represents information about an index in an SQL
199 table. It provides services to encode and decode index tuples.
200
201 Note: a table (as in, on-disk table) has a single Rdb_key_def object which
202 is shared across multiple TABLE* objects and may be used simultaneously from
203 different threads.
204
205 There are several data encodings:
206
207 === SQL LAYER ===
208 SQL layer uses two encodings:
209
210 - "Table->record format". This is the format that is used for the data in
211 the record buffers, table->record[i]
212
213 - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index
214 lookup functions, like handler::index_read_map().
215
216 === Inside RocksDB ===
217 Primary Key is stored as a mapping:
218
219 index_tuple -> StoredRecord
220
221 StoredRecord is in Table->record format, except for blobs, which are stored
222 in-place. See ha_rocksdb::convert_record_to_storage_format for details.
223
224 Secondary indexes are stored as one of two variants:
225
226 index_tuple -> unpack_info
227 index_tuple -> empty_string
228
229 index_tuple here is the form of key that can be compared with memcmp(), aka
230 "mem-comparable form".
231
unpack_info is extra data that makes it possible to restore the original
value from its mem-comparable form. It is present only if the index supports
index-only reads.
235 */
236
237 class Rdb_key_def {
238 public:
239 /* Convert a key from KeyTupleFormat to mem-comparable form */
240 uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
241 uchar *const packed_tuple, const uchar *const key_tuple,
242 const key_part_map &keypart_map) const;
243
244 uchar *pack_field(Field *const field, Rdb_field_packing *pack_info,
245 uchar *tuple, uchar *const packed_tuple,
246 uchar *const pack_buffer,
247 Rdb_string_writer *const unpack_info,
248 uint *const n_null_fields) const;
249 /* Convert a key from Table->record format to mem-comparable form */
250 uint pack_record(const TABLE *const tbl, uchar *const pack_buffer,
251 const uchar *const record, uchar *const packed_tuple,
252 Rdb_string_writer *const unpack_info,
253 const bool should_store_row_debug_checksums,
254 const longlong hidden_pk_id = 0, uint n_key_parts = 0,
255 uint *const n_null_fields = nullptr,
256 const char *const ttl_bytes = nullptr) const;
257 /* Pack the hidden primary key into mem-comparable form. */
258 uint pack_hidden_pk(const longlong hidden_pk_id,
259 uchar *const packed_tuple) const;
260 int unpack_record(TABLE *const table, uchar *const buf,
261 const rocksdb::Slice *const packed_key,
262 const rocksdb::Slice *const unpack_info,
263 const bool verify_row_debug_checksums) const;
264
265 static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info);
266 int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2,
267 std::size_t *const column_index) const;
268
269 size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const;
270
271 /* Get the key that is the "infimum" for this index */
get_infimum_key(uchar * const key,uint * const size)272 inline void get_infimum_key(uchar *const key, uint *const size) const {
273 rdb_netbuf_store_index(key, m_index_number);
274 *size = INDEX_NUMBER_SIZE;
275 }
276
277 /* Get the key that is a "supremum" for this index */
get_supremum_key(uchar * const key,uint * const size)278 inline void get_supremum_key(uchar *const key, uint *const size) const {
279 rdb_netbuf_store_index(key, m_index_number + 1);
280 *size = INDEX_NUMBER_SIZE;
281 }
282
283 /*
284 Get the first key that you need to position at to start iterating.
285 Stores into *key a "supremum" or "infimum" key value for the index.
286 @parameters key OUT Big Endian, value is m_index_number or
287 m_index_number + 1
288 @parameters size OUT key size, value is INDEX_NUMBER_SIZE
289 @return Number of bytes in the key that are usable for bloom filter use.
290 */
get_first_key(uchar * const key,uint * const size)291 inline int get_first_key(uchar *const key, uint *const size) const {
292 if (m_is_reverse_cf) {
293 get_supremum_key(key, size);
294 /* Find out how many bytes of infimum are the same as m_index_number */
295 uchar unmodified_key[INDEX_NUMBER_SIZE];
296 rdb_netbuf_store_index(unmodified_key, m_index_number);
297 int i;
298 for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
299 if (key[i] != unmodified_key[i]) {
300 break;
301 }
302 }
303 return i;
304 } else {
305 get_infimum_key(key, size);
306 // For infimum key, its value will be m_index_number
307 // Thus return its own size instead.
308 return INDEX_NUMBER_SIZE;
309 }
310 }
311
312 /*
313 The same as get_first_key, but get the key for the last entry in the index
314 @parameters key OUT Big Endian, value is m_index_number or
315 m_index_number + 1
316 @parameters size OUT key size, value is INDEX_NUMBER_SIZE
317
318 @return Number of bytes in the key that are usable for bloom filter use.
319 */
get_last_key(uchar * const key,uint * const size)320 inline int get_last_key(uchar *const key, uint *const size) const {
321 if (m_is_reverse_cf) {
322 get_infimum_key(key, size);
323 // For infimum key, its value will be m_index_number
324 // Thus return its own size instead.
325 return INDEX_NUMBER_SIZE;
326 } else {
327 get_supremum_key(key, size);
328 /* Find out how many bytes are the same as m_index_number */
329 uchar unmodified_key[INDEX_NUMBER_SIZE];
330 rdb_netbuf_store_index(unmodified_key, m_index_number);
331 int i;
332 for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
333 if (key[i] != unmodified_key[i]) {
334 break;
335 }
336 }
337 return i;
338 }
339 }
340
341 /* Make a key that is right after the given key. */
342 static int successor(uchar *const packed_tuple, const uint len);
343
344 /* Make a key that is right before the given key. */
345 static int predecessor(uchar *const packed_tuple, const uint len);
346
347 /*
348 This can be used to compare prefixes.
349 if X is a prefix of Y, then we consider that X = Y.
350 */
351 // b describes the lookup key, which can be a prefix of a.
352 // b might be outside of the index_number range, if successor() is called.
cmp_full_keys(const rocksdb::Slice & a,const rocksdb::Slice & b)353 int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const {
354 assert(covers_key(a));
355
356 return memcmp(a.data(), b.data(), std::min(a.size(), b.size()));
357 }
358
359 /* Check if given mem-comparable key belongs to this index */
covers_key(const rocksdb::Slice & slice)360 bool covers_key(const rocksdb::Slice &slice) const {
361 if (slice.size() < INDEX_NUMBER_SIZE) return false;
362
363 if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE)) {
364 return false;
365 }
366
367 return true;
368 }
369
370 void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const;
371
372 bool covers_lookup(const rocksdb::Slice *const unpack_info,
373 const MY_BITMAP *const map) const;
374
use_covered_bitmap_format()375 inline bool use_covered_bitmap_format() const {
376 return m_index_type == INDEX_TYPE_SECONDARY &&
377 m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3;
378 }
379
is_primary_key()380 inline bool is_primary_key() const {
381 return m_index_type == INDEX_TYPE_PRIMARY ||
382 m_index_type == INDEX_TYPE_HIDDEN_PRIMARY;
383 }
384
385 /* Indicates that all key parts can be unpacked to cover a secondary lookup */
386 bool can_cover_lookup() const;
387
388 /*
389 Return true if the passed mem-comparable key
390 - is from this index, and
391 - it matches the passed key prefix (the prefix is also in mem-comparable
392 form)
393 */
value_matches_prefix(const rocksdb::Slice & value,const rocksdb::Slice & prefix)394 bool value_matches_prefix(const rocksdb::Slice &value,
395 const rocksdb::Slice &prefix) const {
396 return covers_key(value) && !cmp_full_keys(value, prefix);
397 }
398
get_keyno()399 uint32 get_keyno() const { return m_keyno; }
400
get_index_number()401 uint32 get_index_number() const { return m_index_number; }
402
get_gl_index_id()403 GL_INDEX_ID get_gl_index_id() const {
404 const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number};
405 return gl_index_id;
406 }
407
408 int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader,
409 const uint part_num) const;
410
411 /* Must only be called for secondary keys: */
412 uint get_primary_key_tuple(const TABLE *const tbl,
413 const Rdb_key_def &pk_descr,
414 const rocksdb::Slice *const key,
415 uchar *const pk_buffer) const;
416
417 uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key,
418 uchar *sk_buffer, uint *n_null_fields) const;
419
420 /* Return max length of mem-comparable form */
max_storage_fmt_length()421 uint max_storage_fmt_length() const { return m_maxlength; }
422
get_key_parts()423 uint get_key_parts() const { return m_key_parts; }
424
get_ttl_field_index()425 uint get_ttl_field_index() const { return m_ttl_field_index; }
426
427 /*
428 Get a field object for key part #part_no
429
430 @detail
431 SQL layer thinks unique secondary indexes and indexes in partitioned
432 tables are not "Extended" with Primary Key columns.
433
434 Internally, we always extend all indexes with PK columns. This function
435 uses our definition of how the index is Extended.
436 */
437 inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const;
438
get_name()439 const std::string &get_name() const { return m_name; }
440
get_extractor()441 const rocksdb::SliceTransform *get_extractor() const {
442 return m_prefix_extractor.get();
443 }
444
445 static size_t get_unpack_header_size(char tag);
446
447 Rdb_key_def &operator=(const Rdb_key_def &) = delete;
448 Rdb_key_def(const Rdb_key_def &k);
449 Rdb_key_def(uint indexnr_arg, uint keyno_arg,
450 std::shared_ptr<rocksdb::ColumnFamilyHandle> cf_handle_arg,
451 uint16_t index_dict_version_arg, uchar index_type_arg,
452 uint16_t kv_format_version_arg, bool is_reverse_cf_arg,
453 bool is_per_partition_cf, const char *name,
454 Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0,
455 uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0);
456 ~Rdb_key_def();
457
// Size constants. INDEX_NUMBER_SIZE is the length of the index number prefix
// that starts every key of an index.
enum {
  INDEX_NUMBER_SIZE = 4,
  VERSION_SIZE = 2,
  CF_NUMBER_SIZE = 4,
  CF_FLAG_SIZE = 4,
  PACKED_SIZE = 4,  // one int
};
465
// Bit flags used to combine several bools into one value when writing to
// disk.
enum {
  REVERSE_CF_FLAG = 1 << 0,
  AUTO_CF_FLAG = 1 << 1,  // Deprecated
  PER_PARTITION_CF_FLAG = 1 << 2,
};

// Bit flags denoting the myrocks specific fields stored in the record.
// TTL is the only one in use at the moment.
enum INDEX_FLAG {
  TTL_FLAG = 1 << 0,

  // MAX_FLAG marks where the actual record starts.
  // It must always be derived from the last index flag of this enum.
  MAX_FLAG = TTL_FLAG << 1,
};

// Flags that are ignored when two CF-s are compared to decide whether they
// are the same.
static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG;
486
// Identifiers of the data dictionary entry types. The numeric values are
// spelled out explicitly for every entry.
enum DATA_DICT_TYPE {
  DDL_ENTRY_INDEX_START_NUMBER = 1,
  INDEX_INFO = 2,
  CF_DEFINITION = 3,
  BINLOG_INFO_INDEX_NUMBER = 4,
  DDL_DROP_INDEX_ONGOING = 5,
  INDEX_STATISTICS = 6,
  MAX_INDEX_ID = 7,
  DDL_CREATE_INDEX_ONGOING = 8,
  AUTO_INC = 9,
  DROPPED_CF = 10,

  END_DICT_INDEX_ID = 255
};
501
// Schema versions of the data dictionary entries. Whenever the schema layout
// of an entry changes, introduce a newer version for it here.
// The version for index stats is not listed: it is stored in the IndexStats
// struct.
enum {
  DDL_ENTRY_INDEX_VERSION = 1,
  CF_DEFINITION_VERSION = 1,
  BINLOG_INFO_INDEX_NUMBER_VERSION = 1,
  DDL_DROP_INDEX_ONGOING_VERSION = 1,
  MAX_INDEX_ID_VERSION = 1,
  DDL_CREATE_INDEX_ONGOING_VERSION = 1,
  AUTO_INCREMENT_VERSION = 1,
  DROPPED_CF_VERSION = 1,
};
515
// Versions of the INDEX_INFO layout. Add a new version whenever the
// INDEX_INFO layout changes, and make INDEX_INFO_VERSION_LATEST refer to the
// newest one.
enum {
  INDEX_INFO_VERSION_INITIAL = 1,  // Obsolete
  INDEX_INFO_VERSION_KV_FORMAT = 2,
  INDEX_INFO_VERSION_GLOBAL_ID = 3,
  // The data format is the same as in the previous version. What differs is
  // that this version verifies the KV format version while earlier versions
  // do not. The bump keeps older binaries from inadvertently skipping the KV
  // version check.
  INDEX_INFO_VERSION_VERIFY_KV_FORMAT = 4,
  // The data format gains an 8 byte TTL duration for tables.
  INDEX_INFO_VERSION_TTL = 5,
  // The data format gains a bitmap in front of the TTL duration; in the
  // future it will indicate whether TTL or other special fields are turned
  // on or off.
  INDEX_INFO_VERSION_FIELD_FLAGS = 6,
  // This normally points to the latest version (currently it does).
  INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS,
};
537
// The index types MyRocks distinguishes.
enum {
  INDEX_TYPE_PRIMARY = 1,
  INDEX_TYPE_SECONDARY,       // == 2
  INDEX_TYPE_HIDDEN_PRIMARY,  // == 3
};
544
// Key/Value format versions, listed separately for primary and secondary
// indexes.
enum {
  PRIMARY_FORMAT_VERSION_INITIAL = 10,
  // Changes in this version:
  //  - For columns that can be unpacked with unpack_info, the PK stores the
  //    unpack_info.
  //  - The DECIMAL datatype is no longer stored in the row (because it can
  //    be decoded from its mem-comparable form).
  //  - VARCHAR columns use endspace-padding.
  PRIMARY_FORMAT_VERSION_UPDATE1 = 11,
  // Changes in this version:
  //  - Binary encoded variable length fields have a new format that avoids
  //    an inefficiency where data that was a multiple of 8 bytes in length
  //    had an extra 9 bytes of encoded data.
  PRIMARY_FORMAT_VERSION_UPDATE2 = 12,
  // This version adds support for TTL:
  //  - When TTL is specified for the table, an 8-byte TTL field is prepended
  //    in front of each value.
  PRIMARY_FORMAT_VERSION_TTL = 13,
  PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL,

  SECONDARY_FORMAT_VERSION_INITIAL = 10,
  // This version changes the SK format to include unpack_info.
  SECONDARY_FORMAT_VERSION_UPDATE1 = 11,
  // Changes in this version:
  //  - Binary encoded variable length fields have a new format that avoids
  //    an inefficiency where data that was a multiple of 8 bytes in length
  //    had an extra 9 bytes of encoded data.
  SECONDARY_FORMAT_VERSION_UPDATE2 = 12,
  // This version adds support for TTL:
  //  - When TTL is specified for the table, an 8-byte TTL field is prepended
  //    in front of each value.
  SECONDARY_FORMAT_VERSION_TTL = 13,
  SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL,
  // This version adds support for covering SK lookups for varchars. A 2-byte
  // bitmap is added after the tag-byte to unpack_info, only for records that
  // have covered varchar columns. Currently waiting before enabling in prod.
  SECONDARY_FORMAT_VERSION_UPDATE3 = 65535,
};
585
586 void setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def);
587
588 static uint extract_ttl_duration(const TABLE *const table_arg,
589 const Rdb_tbl_def *const tbl_def_arg,
590 uint64 *ttl_duration);
591 static uint extract_ttl_col(const TABLE *const table_arg,
592 const Rdb_tbl_def *const tbl_def_arg,
593 std::string *ttl_column, uint *ttl_field_index,
594 bool skip_checks = false);
has_ttl()595 inline bool has_ttl() const { return m_ttl_duration > 0; }
596
597 static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
598 static uint32 calculate_index_flag_offset(uint32 index_flags,
599 enum INDEX_FLAG flag,
600 uint *const field_length = nullptr);
601 void write_index_flag_field(Rdb_string_writer *const buf,
602 const uchar *const val,
603 enum INDEX_FLAG flag) const;
604
605 static const std::string gen_qualifier_for_table(
606 const char *const qualifier, const std::string &partition_name = "");
607 static const std::string gen_cf_name_qualifier_for_partition(
608 const std::string &s);
609 static const std::string gen_ttl_duration_qualifier_for_partition(
610 const std::string &s);
611 static const std::string gen_ttl_col_qualifier_for_partition(
612 const std::string &s);
613
614 static const std::string parse_comment_for_qualifier(
615 const std::string &comment, const TABLE *const table_arg,
616 const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
617 const char *const qualifier);
618
get_cf()619 rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle.get(); }
get_shared_cf()620 std::shared_ptr<rocksdb::ColumnFamilyHandle> get_shared_cf() const {
621 return m_cf_handle;
622 }
623
624 /* Check if keypart #kp can be unpacked from index tuple */
625 inline bool can_unpack(const uint kp) const;
626 /* Check if keypart #kp needs unpack info */
627 inline bool has_unpack_info(const uint kp) const;
628
/* Check if given table has a hidden primary key */
630 static bool table_has_hidden_pk(const TABLE *const table);
631
632 void report_checksum_mismatch(const bool is_key, const char *const data,
633 const size_t data_size) const;
634
635 /* Check if index is at least pk_min if it is a PK,
636 or at least sk_min if SK.*/
637 bool index_format_min_check(const int pk_min, const int sk_min) const;
638
639 static void pack_tiny(Rdb_field_packing *const fpi, Field *const field,
640 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
641 Rdb_pack_field_context *const pack_ctx
642 MY_ATTRIBUTE((__unused__)));
643
644 static void pack_short(Rdb_field_packing *const fpi, Field *const field,
645 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
646 Rdb_pack_field_context *const pack_ctx
647 MY_ATTRIBUTE((__unused__)));
648
649 static void pack_medium(Rdb_field_packing *const fpi, Field *const field,
650 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
651 Rdb_pack_field_context *const pack_ctx
652 MY_ATTRIBUTE((__unused__)));
653
654 static void pack_long(Rdb_field_packing *const fpi, Field *const field,
655 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
656 Rdb_pack_field_context *const pack_ctx
657 MY_ATTRIBUTE((__unused__)));
658
659 static void pack_longlong(Rdb_field_packing *const fpi, Field *const field,
660 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
661 Rdb_pack_field_context *const pack_ctx
662 MY_ATTRIBUTE((__unused__)));
663
664 static void pack_double(Rdb_field_packing *const fpi, Field *const field,
665 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
666 Rdb_pack_field_context *const pack_ctx
667 MY_ATTRIBUTE((__unused__)));
668
669 static void pack_float(Rdb_field_packing *const fpi, Field *const field,
670 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
671 Rdb_pack_field_context *const pack_ctx
672 MY_ATTRIBUTE((__unused__)));
673
674 static void pack_new_decimal(
675 Rdb_field_packing *const fpi, Field *const field,
676 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
677 Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
678
679 static void pack_datetime2(Rdb_field_packing *const fpi, Field *const field,
680 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
681 Rdb_pack_field_context *const pack_ctx
682 MY_ATTRIBUTE((__unused__)));
683
684 static void pack_timestamp2(
685 Rdb_field_packing *const fpi, Field *const field,
686 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
687 Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
688
689 static void pack_time2(Rdb_field_packing *const fpi, Field *const field,
690 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
691 Rdb_pack_field_context *const pack_ctx
692 MY_ATTRIBUTE((__unused__)));
693
694 static void pack_year(Rdb_field_packing *const fpi, Field *const field,
695 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
696 Rdb_pack_field_context *const pack_ctx
697 MY_ATTRIBUTE((__unused__)));
698
699 static void pack_newdate(Rdb_field_packing *const fpi, Field *const field,
700 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
701 Rdb_pack_field_context *const pack_ctx
702 MY_ATTRIBUTE((__unused__)));
703
704 static void pack_blob(Rdb_field_packing *const fpi, Field *const field,
705 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
706 Rdb_pack_field_context *const pack_ctx
707 MY_ATTRIBUTE((__unused__)));
708
709 static void pack_with_make_sort_key(
710 Rdb_field_packing *const fpi, Field *const field,
711 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
712 Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
713
714 static void pack_with_varchar_encoding(
715 Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
716 Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
717
718 static void pack_with_varchar_space_pad(
719 Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
720 Rdb_pack_field_context *const pack_ctx);
721
722 template <int length>
723 static int unpack_integer(Rdb_field_packing *const fpi, uchar *const to,
724 Rdb_string_reader *const reader,
725 Rdb_string_reader *const unp_reader
726 MY_ATTRIBUTE((__unused__)));
727
728 static int unpack_double(
729 Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
730 uchar *const field_ptr, Rdb_string_reader *const reader,
731 Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
732
733 static int unpack_float(Rdb_field_packing *const fpi, uchar *const field_ptr,
734 Rdb_string_reader *const reader,
735 Rdb_string_reader *const unp_reader
736 MY_ATTRIBUTE((__unused__)));
737
738 static int unpack_binary_str(Rdb_field_packing *const fpi, uchar *const to,
739 Rdb_string_reader *const reader,
740 Rdb_string_reader *const unp_reader
741 MY_ATTRIBUTE((__unused__)));
742
743 static int unpack_binary_or_utf8_varchar(
744 Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
745 Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
746
747 static int unpack_binary_or_utf8_varchar_space_pad(
748 Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
749 Rdb_string_reader *const unp_reader);
750
751 static int unpack_newdate(
752 Rdb_field_packing *const fpi, uchar *const field_ptr,
753 Rdb_string_reader *const reader,
754 Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
755
756 static int unpack_utf8_str(
757 Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
758 Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
759
760 static int unpack_unknown_varchar(Rdb_field_packing *const fpi, uchar *dst,
761 Rdb_string_reader *const reader,
762 Rdb_string_reader *const unp_reader);
763
764 static int unpack_simple_varchar_space_pad(
765 Rdb_field_packing *const fpi, uchar *dst, Rdb_string_reader *const reader,
766 Rdb_string_reader *const unp_reader);
767
768 static int unpack_simple(Rdb_field_packing *const fpi, uchar *const dst,
769 Rdb_string_reader *const reader,
770 Rdb_string_reader *const unp_reader);
771
772 static int unpack_unknown(Rdb_field_packing *const fpi, uchar *const dst,
773 Rdb_string_reader *const reader,
774 Rdb_string_reader *const unp_reader);
775
776 static int unpack_floating_point(uchar *const dst,
777 Rdb_string_reader *const reader,
778 const size_t size, const int exp_digit,
779 const uchar *const zero_pattern,
780 const uchar *const zero_val,
781 void (*swap_func)(uchar *, const uchar *));
782
783 static void make_unpack_simple_varchar(
784 const Rdb_collation_codec *const codec, const Field *const field,
785 Rdb_pack_field_context *const pack_ctx);
786
787 static void make_unpack_simple(const Rdb_collation_codec *const codec,
788 const Field *const field,
789 Rdb_pack_field_context *const pack_ctx);
790
791 static void make_unpack_unknown(
792 const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
793 const Field *const field, Rdb_pack_field_context *const pack_ctx);
794
795 static void make_unpack_unknown_varchar(
796 const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)),
797 const Field *const field, Rdb_pack_field_context *const pack_ctx);
798
799 static void dummy_make_unpack_info(
800 const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
801 const Field *field MY_ATTRIBUTE((__unused__)),
802 Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__)));
803
804 static int skip_max_length(const Rdb_field_packing *const fpi,
805 Rdb_string_reader *const reader);
806
807 static int skip_variable_length(const Rdb_field_packing *const fpi,
808 Rdb_string_reader *const reader);
809
810 static int skip_variable_space_pad(const Rdb_field_packing *const fpi,
811 Rdb_string_reader *const reader);
812
use_legacy_varbinary_format()813 inline bool use_legacy_varbinary_format() const {
814 return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2,
815 SECONDARY_FORMAT_VERSION_UPDATE2);
816 }
817
is_unpack_data_tag(char c)818 static inline bool is_unpack_data_tag(char c) {
819 return c == RDB_UNPACK_DATA_TAG || c == RDB_UNPACK_COVERED_DATA_TAG;
820 }
821
822 private:
823 #ifndef NDEBUG
is_storage_available(const int offset,const int needed)824 inline bool is_storage_available(const int offset, const int needed) const {
825 const int storage_length = static_cast<int>(max_storage_fmt_length());
826 return (storage_length - offset) >= needed;
827 }
828 #endif // NDEBUG
829
830 /* Global number of this index (used as prefix in StorageFormat) */
831 const uint32 m_index_number;
832
833 uchar m_index_number_storage_form[INDEX_NUMBER_SIZE];
834
835 std::shared_ptr<rocksdb::ColumnFamilyHandle> m_cf_handle;
836
837 static void pack_legacy_variable_format(const uchar *src, size_t src_len,
838 uchar **dst);
839
840 static void pack_variable_format(const uchar *src, size_t src_len,
841 uchar **dst);
842
843 static uint calc_unpack_legacy_variable_format(uchar flag, bool *done);
844
845 static uint calc_unpack_variable_format(uchar flag, bool *done);
846
847 public:
848 uint16_t m_index_dict_version;
849 uchar m_index_type;
850 /* KV format version for the index id */
851 uint16_t m_kv_format_version;
852 /* If true, the column family stores data in the reverse order */
853 bool m_is_reverse_cf;
854
855 /* If true, then column family is created per partition. */
856 bool m_is_per_partition_cf;
857
858 std::string m_name;
859 mutable Rdb_index_stats m_stats;
860
861 /*
862 Bitmap containing information about whether TTL or other special fields
863 are enabled for the given index.
864 */
865 uint32 m_index_flags_bitmap;
866
867 /*
868 How much space in bytes the index flag fields occupy.
869 */
870 uint32 m_total_index_flags_length;
871
872 /*
873 Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL)
874 */
875 uint32 m_ttl_rec_offset;
876
877 /* Default TTL duration */
878 uint64 m_ttl_duration;
879
880 /* TTL column (if defined by user, otherwise implicit TTL is used) */
881 std::string m_ttl_column;
882
883 private:
884 /* Number of key parts in the primary key*/
885 uint m_pk_key_parts;
886
887 /*
888 pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the
889 primary key. Y==-1 means this column is not present in the primary key.
890 */
891 uint *m_pk_part_no;
892
893 /* Array of index-part descriptors. */
894 Rdb_field_packing *m_pack_info;
895
896 uint m_keyno; /* number of this index in the table */
897
898 /*
899 Number of key parts in the index (including "index extension"). This is how
900 many elements are in the m_pack_info array.
901 */
902 uint m_key_parts;
903
904 /*
905 If TTL column is part of the PK, offset of the column within pk.
906 Default is UINT_MAX to denote that TTL col is not part of PK.
907 */
908 uint m_ttl_pk_key_part_offset;
909
910 /*
911 Index of the TTL column in table->s->fields, if it exists.
912 Default is UINT_MAX to denote that it does not exist.
913 */
914 uint m_ttl_field_index;
915
/* Prefix extractor for the column family of the key definition */
917 std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
918
919 /* Maximum length of the mem-comparable form. */
920 uint m_maxlength;
921
922 /* mutex to protect setup */
923 mysql_mutex_t m_mutex;
924 };
925
926 // "Simple" collations (those specified in strings/ctype-simple.c) are simple
927 // because their strnxfrm function maps one byte to one byte. However, the
928 // mapping is not injective, so the inverse function will take in an extra
929 // index parameter containing information to disambiguate what the original
930 // character was.
931 //
932 // The m_enc* members are for encoding. Generally, we want encoding to be:
933 // src -> (dst, idx)
934 //
935 // Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us
936 // idx.
937 //
938 // For the inverse, we have:
939 // (dst, idx) -> src
940 //
941 // We have m_dec_idx[idx][dst] = src to get our original character back.
942 //
943 struct Rdb_collation_codec {
944 const my_core::CHARSET_INFO *m_cs;
945 // The first element unpacks VARCHAR(n), the second one - CHAR(n).
946 std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func;
947 std::array<rdb_index_field_unpack_t, 2> m_unpack_func;
948
949 std::array<uchar, 256> m_enc_idx;
950 std::array<uchar, 256> m_enc_size;
951
952 std::array<uchar, 256> m_dec_size;
953 std::vector<std::array<uchar, 256>> m_dec_idx;
954 };
955
956 extern mysql_mutex_t rdb_collation_data_mutex;
957 extern mysql_mutex_t rdb_mem_cmp_space_mutex;
958 extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
959 rdb_collation_data;
960
961 class Rdb_field_packing {
962 public:
963 Rdb_field_packing(const Rdb_field_packing &);
964 Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
965 Rdb_field_packing();
966
967 /* Length of mem-comparable image of the field, in bytes */
968 int m_max_image_len;
969
970 /* Length of image in the unpack data */
971 int m_unpack_data_len;
972 int m_unpack_data_offset;
973
974 /*
975 Cached field information for faster access
976 */
977 bool m_field_maybe_null; /* TRUE <=> NULL-byte is stored */
978 bool m_field_unsigned_flag;
979 enum_field_types m_field_real_type;
980 uchar m_field_null_bit_mask;
981 uint m_field_pack_length;
982 uint m_field_null_offset;
983 my_ptrdiff_t m_field_offset;
984 const CHARSET_INFO *m_field_charset;
985
986 /*
987 Valid only for VARCHAR fields.
988 */
989 uint m_varchar_length_bytes;
990 uint m_varchar_char_length;
991 bool m_use_legacy_varbinary_format;
992
993 // (Valid when Variable Length Space Padded Encoding is used):
994 uint m_segment_size; // size of segment used
995
996 // number of bytes used to store number of trimmed (or added)
997 // spaces in the upack_info
998 bool m_unpack_info_uses_two_bytes;
999
1000 /*
1001 True implies that an index-only read is always possible for this field.
1002 False means an index-only read may be possible depending on the record and
1003 field type.
1004 */
1005 bool m_covered;
1006
1007 const std::vector<uchar> *space_xfrm;
1008 size_t space_xfrm_len;
1009 size_t space_mb_len;
1010
1011 const Rdb_collation_codec *m_charset_codec;
1012
1013 /*
1014 @return TRUE: this field makes use of unpack_info.
1015 */
uses_unpack_info()1016 bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }
1017
1018 /* TRUE means unpack_info stores the original field value */
1019 bool m_unpack_info_stores_value;
1020
1021 rdb_index_field_pack_t m_pack_func;
1022 rdb_make_unpack_info_t m_make_unpack_info_func;
1023
1024 /*
1025 This function takes
1026 - mem-comparable form
1027 - unpack_info data
1028 and restores the original value.
1029 */
1030 rdb_index_field_unpack_t m_unpack_func;
1031
1032 /*
1033 This function skips over mem-comparable form.
1034 */
1035 rdb_index_field_skip_t m_skip_func;
1036
1037 private:
1038 /*
1039 Location of the field in the table (key number and key part number).
1040
1041 Note that this describes not the field, but rather a position of field in
1042 the index. Consider an example:
1043
1044 col1 VARCHAR (100),
1045 INDEX idx1 (col1)),
1046 INDEX idx2 (col1(10)),
1047
1048 Here, idx2 has a special Field object that is set to describe a 10-char
1049 prefix of col1.
1050
1051 We must also store the keynr. It is needed for implicit "extended keys".
1052 Every key in MyRocks needs to include PK columns. Generally, SQL layer
1053 includes PK columns as part of its "Extended Keys" feature, but sometimes
1054 it does not (known examples are unique secondary indexes and partitioned
1055 tables).
1056 In that case, MyRocks's index descriptor has invisible suffix of PK
1057 columns (and the point is that these columns are parts of PK, not parts
1058 of the current index).
1059 */
1060 uint m_keynr;
1061 uint m_key_part;
1062
1063 public:
1064 bool setup(const Rdb_key_def *const key_descr, const Field *const field,
1065 const uint keynr_arg, const uint key_part_arg,
1066 const uint16 key_length);
1067 Field *get_field_in_table(const TABLE *const tbl) const;
1068 void fill_hidden_pk_val(uchar **dst, const longlong hidden_pk_id) const;
1069 };
1070
1071 /*
1072 Descriptor telling how to decode/encode a field to on-disk record storage
1073 format. Not all information is in the structure yet, but eventually we
1074 want to have as much as possible there to avoid virtual calls.
1075
1076 For encoding/decoding of index tuples, see Rdb_key_def.
1077 */
1078 class Rdb_field_encoder {
1079 public:
1080 Rdb_field_encoder(const Rdb_field_encoder &) = delete;
1081 Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete;
1082 /*
1083 STORE_NONE is set when a column can be decoded solely from their
1084 mem-comparable form.
1085 STORE_SOME is set when a column can be decoded from their mem-comparable
1086 form plus unpack_info.
1087 STORE_ALL is set when a column cannot be decoded, so its original value
1088 must be stored in the PK records.
1089 */
1090 enum STORAGE_TYPE {
1091 STORE_NONE,
1092 STORE_SOME,
1093 STORE_ALL,
1094 };
1095 STORAGE_TYPE m_storage_type;
1096
1097 uint m_null_offset;
1098 uchar m_null_mask; // 0 means the field cannot be null
1099
1100 /*
1101 Cached field information
1102 */
1103 my_core::enum_field_types m_field_type;
1104 uchar m_field_null_mask;
1105 uint16 m_field_index;
1106 uint m_field_pack_length;
1107 uint m_field_length_bytes;
1108 uint m_field_length;
1109 my_ptrdiff_t m_field_null_offset;
1110 my_ptrdiff_t m_field_offset;
1111
maybe_null()1112 bool maybe_null() const { return m_null_mask != 0; }
1113
uses_variable_len_encoding()1114 bool uses_variable_len_encoding() const {
1115 return (m_field_type == MYSQL_TYPE_BLOB ||
1116 m_field_type == MYSQL_TYPE_VARCHAR ||
1117 m_field_type == MYSQL_TYPE_JSON);
1118 }
1119 };
1120
get_table_field_for_part_no(TABLE * table,uint part_no)1121 inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table,
1122 uint part_no) const {
1123 assert(part_no < get_key_parts());
1124 return m_pack_info[part_no].get_field_in_table(table);
1125 }
1126
can_unpack(const uint kp)1127 inline bool Rdb_key_def::can_unpack(const uint kp) const {
1128 assert(kp < m_key_parts);
1129 return (m_pack_info[kp].m_unpack_func != nullptr);
1130 }
1131
has_unpack_info(const uint kp)1132 inline bool Rdb_key_def::has_unpack_info(const uint kp) const {
1133 assert(kp < m_key_parts);
1134 return m_pack_info[kp].uses_unpack_info();
1135 }
1136
1137 /*
1138 A table definition. This is an entry in the mapping
1139
1140 dbname.tablename -> {index_nr, index_nr, ... }
1141
1142 There is only one Rdb_tbl_def object for a given table.
1143 That's why we keep auto_increment value here, too.
1144 */
1145
1146 class Rdb_tbl_def {
1147 private:
1148 void check_if_is_mysql_system_table();
1149
1150 /* Stores 'dbname.tablename' */
1151 std::string m_dbname_tablename;
1152
1153 /* Store the db name, table name, and partition name */
1154 std::string m_dbname;
1155 std::string m_tablename;
1156 std::string m_partition;
1157
1158 void set_name(const std::string &name);
1159
1160 public:
1161 Rdb_tbl_def(const Rdb_tbl_def &) = delete;
1162 Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete;
1163
Rdb_tbl_def(const std::string & name)1164 explicit Rdb_tbl_def(const std::string &name)
1165 : m_key_descr_arr(nullptr),
1166 m_hidden_pk_val(0),
1167 m_auto_incr_val(0),
1168 m_tbl_stats(),
1169 m_update_time(0),
1170 m_create_time(CREATE_TIME_UNKNOWN) {
1171 set_name(name);
1172 }
1173
Rdb_tbl_def(const char * const name,const size_t len)1174 Rdb_tbl_def(const char *const name, const size_t len)
1175 : m_key_descr_arr(nullptr),
1176 m_hidden_pk_val(0),
1177 m_auto_incr_val(0),
1178 m_tbl_stats(),
1179 m_update_time(0),
1180 m_create_time(CREATE_TIME_UNKNOWN) {
1181 set_name(std::string(name, len));
1182 }
1183
1184 explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t pos = 0)
m_key_descr_arr(nullptr)1185 : m_key_descr_arr(nullptr),
1186 m_hidden_pk_val(0),
1187 m_auto_incr_val(0),
1188 m_tbl_stats(),
1189 m_update_time(0),
1190 m_create_time(CREATE_TIME_UNKNOWN) {
1191 set_name(std::string(slice.data() + pos, slice.size() - pos));
1192 }
1193
1194 ~Rdb_tbl_def();
1195
1196 void check_and_set_read_free_rpl_table();
1197
1198 /* Number of indexes */
1199 uint m_key_count;
1200
1201 /* Array of index descriptors */
1202 std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
1203
1204 std::atomic<longlong> m_hidden_pk_val;
1205 std::atomic<ulonglong> m_auto_incr_val;
1206
1207 /* Is this a system table */
1208 bool m_is_mysql_system_table;
1209
1210 /* Is this table read free repl enabled */
1211 std::atomic_bool m_is_read_free_rpl_table{false};
1212
1213 Rdb_table_stats m_tbl_stats;
1214
1215 bool put_dict(Rdb_dict_manager *const dict, Rdb_cf_manager *const cf_manager,
1216 rocksdb::WriteBatch *const batch, const rocksdb::Slice &key);
1217
full_tablename()1218 const std::string &full_tablename() const { return m_dbname_tablename; }
base_dbname()1219 const std::string &base_dbname() const { return m_dbname; }
base_tablename()1220 const std::string &base_tablename() const { return m_tablename; }
base_partition()1221 const std::string &base_partition() const { return m_partition; }
1222 GL_INDEX_ID get_autoincr_gl_index_id();
1223
1224 time_t get_create_time();
1225 std::atomic<time_t> m_update_time; // in-memory only value
1226 private:
1227 const time_t CREATE_TIME_UNKNOWN = 1;
1228 // CREATE_TIME_UNKNOWN means "didn't try to read, yet"
1229 // 0 means "no data available"
1230 std::atomic<time_t> m_create_time;
1231 };
1232
1233 /*
1234 A thread-safe sequential number generator. Its performance is not a concern
1235 hence it is ok to protect it by a mutex.
1236 */
1237
1238 class Rdb_seq_generator {
1239 uint m_next_number = 0;
1240
1241 mysql_mutex_t m_mutex;
1242
1243 public:
1244 Rdb_seq_generator(const Rdb_seq_generator &) = delete;
1245 Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete;
1246 Rdb_seq_generator() = default;
1247
init(const uint initial_number)1248 void init(const uint initial_number) {
1249 mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
1250 m_next_number = initial_number;
1251 }
1252
1253 uint get_and_update_next_number(Rdb_dict_manager *const dict);
1254
cleanup()1255 void cleanup() { mysql_mutex_destroy(&m_mutex); }
1256 };
1257
1258 interface Rdb_tables_scanner {
1259 virtual int add_table(Rdb_tbl_def * tdef) = 0;
1260 };
1261
1262 /*
1263 This contains a mapping of
1264
1265 dbname.table_name -> array{Rdb_key_def}.
1266
1267 objects are shared among all threads.
1268 */
1269
1270 class Rdb_ddl_manager {
1271 Rdb_dict_manager *m_dict = nullptr;
1272 Rdb_cf_manager *m_cf_manager = nullptr;
1273
1274 // Contains Rdb_tbl_def elements
1275 std::unordered_map<std::string, Rdb_tbl_def *> m_ddl_map;
1276
1277 // Maps index id to <table_name, index number>
1278 std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;
1279
1280 // Maps index id to key definitons not yet committed to data dictionary.
1281 // This is mainly used to store key definitions during ALTER TABLE.
1282 std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
1283 m_index_num_to_uncommitted_keydef;
1284 mysql_rwlock_t m_rwlock;
1285
1286 Rdb_seq_generator m_sequence;
1287 // A queue of table stats to write into data dictionary
1288 // It is produced by event listener (ie compaction and flush threads)
1289 // and consumed by the rocksdb background thread
1290 std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;
1291
1292 const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);
1293
1294 public:
1295 Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
1296 Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
Rdb_ddl_manager()1297 Rdb_ddl_manager() {}
1298
1299 /* Load the data dictionary from on-disk storage */
1300 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
1301 bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
1302 const uint32_t validate_tables);
1303 #else
1304 bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager);
1305 #endif // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
1306 // ROCKSDB_INCLUDE_VALIDATE_TABLES
1307
1308 void cleanup();
1309
1310 Rdb_tbl_def *find(const std::string &table_name, const bool lock = true);
1311 int find_indexes(const std::string &table_name,
1312 std::vector<GL_INDEX_ID> *indexes);
1313 int find_table_stats(const std::string &table_name,
1314 Rdb_table_stats *tbl_stats);
1315 std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);
1316 void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
1317 void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
1318 const std::vector<Rdb_index_stats> &deleted_data =
1319 std::vector<Rdb_index_stats>());
1320 void persist_stats(const bool sync = false);
1321
1322 void set_table_stats(const std::string &tbl_name);
1323
1324 /* Modify the mapping and write it to on-disk storage */
1325 int put_and_write(Rdb_tbl_def *const key_descr,
1326 rocksdb::WriteBatch *const batch);
1327 void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
1328 const bool lock = true);
1329 bool rename(const std::string &from, const std::string &to,
1330 rocksdb::WriteBatch *const batch);
1331
get_and_update_next_number(Rdb_dict_manager * const dict)1332 uint get_and_update_next_number(Rdb_dict_manager *const dict) {
1333 return m_sequence.get_and_update_next_number(dict);
1334 }
1335
1336 const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);
1337
1338 /* Walk the data dictionary */
1339 int scan_for_tables(Rdb_tables_scanner *tables_scanner);
1340
1341 void erase_index_num(const GL_INDEX_ID &gl_index_id);
1342 void add_uncommitted_keydefs(
1343 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
1344 void remove_uncommitted_keydefs(
1345 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
1346 int find_in_uncommitted_keydef(const uint32_t &cf_id);
1347
1348 private:
1349 /* Put the data into in-memory table (only) */
1350 int put(Rdb_tbl_def *const key_descr, const bool lock = true);
1351
1352 /* Helper functions to be passed to my_core::HASH object */
1353 static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length,
1354 my_bool not_used MY_ATTRIBUTE((unused)));
1355 static void free_hash_elem(void *const data);
1356
1357 #if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
1358 bool validate_schemas();
1359
1360 bool validate_auto_incr();
1361 #endif // defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) &&
1362 // ROCKSDB_INCLUDE_VALIDATE_TABLES
1363 };
1364
1365 /*
1366 Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its
1367 internal data dictionary.
1368 MyRocks stores data dictionary on dedicated system column family
1369 named __system__. The system column family is used by MyRocks
1370 internally only, and not used by applications.
1371
1372 Currently MyRocks has the following data dictionary data models.
1373
1374 1. Table Name => internal index id mappings
1375 key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename
1376 value: version, {cf_id, index_id}*n_indexes_of_the_table
1377 version is 2 bytes. cf_id and index_id are 4 bytes.
1378
1379 2. internal cf_id, index id => index information
1380 key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id
1381 value: version, index_type, kv_format_version, index_flags, ttl_duration
1382 index_type is 1 byte, version and kv_format_version are 2 bytes.
1383 index_flags is 4 bytes.
1384 ttl_duration is 8 bytes.
1385
1386 3. CF id => CF flags
1387 key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id
1388 value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf}
1389 cf_flags is 4 bytes in total.
1390
1391 4. Binlog entry (updated at commit)
1392 key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4)
1393 value: version, {binlog_name,binlog_pos,binlog_gtid}
1394
1395 5. Ongoing drop index entry
1396 key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id
1397 value: version
1398
1399 6. index stats
1400 key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id
1401 value: version, {materialized PropertiesCollector::IndexStats}
1402
1403 7. maximum index id
1404 key: Rdb_key_def::MAX_INDEX_ID(0x7)
1405 value: index_id
1406 index_id is 4 bytes
1407
1408 8. Ongoing create index entry
1409 key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id
1410 value: version
1411
1412 9. auto_increment values
1413 key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id
1414 value: version, {max auto_increment so far}
1415 max auto_increment is 8 bytes
1416
1417 10. dropped cfs
1418 key: Rdb_key_def::DROPPED_CF(0xa) + cf_id
1419 value: version
1420
  Data dictionary operations are atomic inside RocksDB. For example,
  when creating a table with two indexes, it is necessary to call Put
  three times, and those three writes have to be applied atomically.
  Rdb_dict_manager provides the wrapper functions begin() and commit() to
  make such atomic operations easier.
1425
1426 */
1427 class Rdb_dict_manager {
1428 private:
1429 mysql_mutex_t m_mutex;
1430 rocksdb::TransactionDB *m_db = nullptr;
1431 rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr;
1432 /* Utility to put INDEX_INFO and CF_DEFINITION */
1433
1434 uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
1435 rocksdb::Slice m_key_slice_max_index_id;
1436
1437 static void dump_index_id(uchar *const netbuf,
1438 Rdb_key_def::DATA_DICT_TYPE dict_type,
1439 const GL_INDEX_ID &gl_index_id);
1440 template <size_t T>
dump_index_id(Rdb_buf_writer<T> * buf_writer,Rdb_key_def::DATA_DICT_TYPE dict_type,const GL_INDEX_ID & gl_index_id)1441 static void dump_index_id(Rdb_buf_writer<T> *buf_writer,
1442 Rdb_key_def::DATA_DICT_TYPE dict_type,
1443 const GL_INDEX_ID &gl_index_id) {
1444 buf_writer->write_uint32(dict_type);
1445 buf_writer->write_uint32(gl_index_id.cf_id);
1446 buf_writer->write_uint32(gl_index_id.index_id);
1447 }
1448
1449 void delete_with_prefix(rocksdb::WriteBatch *const batch,
1450 Rdb_key_def::DATA_DICT_TYPE dict_type,
1451 const GL_INDEX_ID &gl_index_id) const;
1452 /* Functions for fast DROP TABLE/INDEX */
1453 void resume_drop_indexes() const;
1454 void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
1455 const uint32 n_keys,
1456 const char *const log_action) const;
1457 void log_start_drop_index(GL_INDEX_ID gl_index_id,
1458 const char *log_action) const;
1459
1460 public:
1461 Rdb_dict_manager(const Rdb_dict_manager &) = delete;
1462 Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
1463 Rdb_dict_manager() = default;
1464
1465 bool init(rocksdb::TransactionDB *const rdb_dict,
1466 Rdb_cf_manager *const cf_manager,
1467 const my_bool enable_remove_orphaned_cf_flags);
1468
cleanup()1469 inline void cleanup() { mysql_mutex_destroy(&m_mutex); }
1470
lock()1471 inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }
1472
unlock()1473 inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }
1474
assert_lock_held()1475 inline void assert_lock_held() { mysql_mutex_assert_owner(&m_mutex); }
1476
get_system_cf()1477 inline rocksdb::ColumnFamilyHandle *get_system_cf() const {
1478 return m_system_cfh;
1479 }
1480
1481 /* Raw RocksDB operations */
1482 std::unique_ptr<rocksdb::WriteBatch> begin() const;
1483 int commit(rocksdb::WriteBatch *const batch, const bool sync = true) const;
1484 rocksdb::Status get_value(const rocksdb::Slice &key,
1485 std::string *const value) const;
1486 void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
1487 const rocksdb::Slice &value) const;
1488 void delete_key(rocksdb::WriteBatchBase *batch,
1489 const rocksdb::Slice &key) const;
1490 rocksdb::Iterator *new_iterator() const;
1491
1492 /* Internal Index id => CF */
1493 void add_or_update_index_cf_mapping(
1494 rocksdb::WriteBatch *batch,
1495 struct Rdb_index_info *const index_info) const;
1496 void delete_index_info(rocksdb::WriteBatch *batch,
1497 const GL_INDEX_ID &index_id) const;
1498 bool get_index_info(const GL_INDEX_ID &gl_index_id,
1499 struct Rdb_index_info *const index_info) const;
1500
1501 /* CF id => CF flags */
1502 void add_cf_flags(rocksdb::WriteBatch *const batch, const uint cf_id,
1503 const uint cf_flags) const;
1504 bool get_cf_flags(const uint cf_id, uint *const cf_flags) const;
1505
1506 void add_dropped_cf(rocksdb::WriteBatch *const batch,
1507 const uint &cf_id) const;
1508 void delete_dropped_cf(rocksdb::WriteBatch *const batch,
1509 const uint &cf_id) const;
1510 bool get_dropped_cf(const uint &cf_id) const;
1511 void get_all_dropped_cfs(std::unordered_set<uint32> *dropped_cf_ids) const;
1512
1513 int add_missing_cf_flags(Rdb_cf_manager *const cf_manager) const;
1514
1515 int remove_orphaned_dropped_cfs(
1516 Rdb_cf_manager *const cf_manager,
1517 const my_bool &enable_remove_orphaned_dropped_cfs) const;
1518
1519 void delete_dropped_cf_and_flags(rocksdb::WriteBatch *const batch,
1520 const uint &cf_id) const;
1521
1522 /* Functions for fast CREATE/DROP TABLE/INDEX */
1523 void get_ongoing_index_operation(
1524 std::unordered_set<GL_INDEX_ID> *gl_index_ids,
1525 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1526 bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
1527 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1528 void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
1529 const GL_INDEX_ID &gl_index_id,
1530 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1531 void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
1532 const GL_INDEX_ID &gl_index_id,
1533 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1534 bool is_drop_index_empty() const;
1535 void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
1536 const uint32 n_keys,
1537 rocksdb::WriteBatch *const batch) const;
1538 void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1539 rocksdb::WriteBatch *const batch) const;
1540 void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1541 rocksdb::WriteBatch *const batch) const;
1542 void finish_indexes_operation(
1543 const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1544 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1545 void rollback_ongoing_index_creation() const;
1546 void rollback_ongoing_index_creation(
1547 const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const;
1548
get_ongoing_drop_indexes(std::unordered_set<GL_INDEX_ID> * gl_index_ids)1549 inline void get_ongoing_drop_indexes(
1550 std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
1551 get_ongoing_index_operation(gl_index_ids,
1552 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1553 }
get_ongoing_create_indexes(std::unordered_set<GL_INDEX_ID> * gl_index_ids)1554 inline void get_ongoing_create_indexes(
1555 std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
1556 get_ongoing_index_operation(gl_index_ids,
1557 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1558 }
start_drop_index(rocksdb::WriteBatch * wb,const GL_INDEX_ID & gl_index_id)1559 inline void start_drop_index(rocksdb::WriteBatch *wb,
1560 const GL_INDEX_ID &gl_index_id) const {
1561 start_ongoing_index_operation(wb, gl_index_id,
1562 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1563 }
start_create_index(rocksdb::WriteBatch * wb,const GL_INDEX_ID & gl_index_id)1564 inline void start_create_index(rocksdb::WriteBatch *wb,
1565 const GL_INDEX_ID &gl_index_id) const {
1566 start_ongoing_index_operation(wb, gl_index_id,
1567 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1568 }
finish_drop_indexes(const std::unordered_set<GL_INDEX_ID> & gl_index_ids)1569 inline void finish_drop_indexes(
1570 const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1571 finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1572 }
finish_create_indexes(const std::unordered_set<GL_INDEX_ID> & gl_index_ids)1573 inline void finish_create_indexes(
1574 const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1575 finish_indexes_operation(gl_index_ids,
1576 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1577 }
is_drop_index_ongoing(const GL_INDEX_ID & gl_index_id)1578 inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1579 return is_index_operation_ongoing(gl_index_id,
1580 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1581 }
is_create_index_ongoing(const GL_INDEX_ID & gl_index_id)1582 inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1583 return is_index_operation_ongoing(gl_index_id,
1584 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1585 }
1586
1587 bool get_max_index_id(uint32_t *const index_id) const;
1588 bool update_max_index_id(rocksdb::WriteBatch *const batch,
1589 const uint32_t index_id) const;
1590 void add_stats(rocksdb::WriteBatch *const batch,
1591 const std::vector<Rdb_index_stats> &stats) const;
1592 Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;
1593
1594 rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
1595 GL_INDEX_ID gl_index_id, ulonglong val,
1596 bool overwrite = false) const;
1597 bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
1598 ulonglong *new_val) const;
1599
1600 private:
1601 /* dropped cf flags */
1602 void delete_cf_flags(rocksdb::WriteBatch *const batch,
1603 const uint &cf_id) const;
1604 };
1605
1606 struct Rdb_index_info {
1607 GL_INDEX_ID m_gl_index_id;
1608 uint16_t m_index_dict_version = 0;
1609 uchar m_index_type = 0;
1610 uint16_t m_kv_version = 0;
1611 uint32 m_index_flags = 0;
1612 uint64 m_ttl_duration = 0;
1613 };
1614
1615 bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
1616
1617 /*
1618 @brief
1619 Merge Operator for the auto_increment value in the system_cf
1620
1621 @detail
1622 This class implements the rocksdb Merge Operator for auto_increment values
1623 that are stored to the data dictionary every transaction.
1624
1625 The actual Merge function is triggered on compaction, memtable flushes, or
1626 when get() is called on the same key.
1627
1628 */
1629 class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator {
1630 public:
1631 /*
1632 Updates the new value associated with a key to be the maximum of the
1633 passed in value and the existing value.
1634
1635 @param[IN] key
1636 @param[IN] existing_value existing value for a key; nullptr if nonexistent
1637 key
1638 @param[IN] value
1639 @param[OUT] new_value new value after Merge
1640 @param[IN] logger
1641 */
Merge(const rocksdb::Slice & key,const rocksdb::Slice * existing_value,const rocksdb::Slice & value,std::string * new_value,rocksdb::Logger * logger)1642 bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value,
1643 const rocksdb::Slice &value, std::string *new_value,
1644 rocksdb::Logger *logger) const override {
1645 assert(new_value != nullptr);
1646
1647 if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 ||
1648 GetKeyType(key) != Rdb_key_def::AUTO_INC ||
1649 value.size() !=
1650 RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE ||
1651 GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1652 abort();
1653 }
1654
1655 uint64_t merged_value = Deserialize(value);
1656
1657 if (existing_value != nullptr) {
1658 if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1659 ROCKSDB_SIZEOF_AUTOINC_VALUE ||
1660 GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1661 abort();
1662 }
1663
1664 merged_value = std::max(merged_value, Deserialize(*existing_value));
1665 }
1666 Serialize(merged_value, new_value);
1667 return true;
1668 }
1669
Name()1670 virtual const char *Name() const override { return "Rdb_system_merge_op"; }
1671
1672 private:
1673 /*
1674 Serializes the integer data to the new_value buffer or the target buffer
1675 the merge operator will update to
1676 */
Serialize(const uint64_t data,std::string * new_value)1677 void Serialize(const uint64_t data, std::string *new_value) const {
1678 uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1679 ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0};
1680 uchar *ptr = value_buf;
1681 /* fill in the auto increment version */
1682 rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION);
1683 ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION;
1684 /* fill in the auto increment value */
1685 rdb_netbuf_store_uint64(ptr, data);
1686 ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE;
1687 new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf);
1688 }
1689
1690 /*
1691 Gets the value of auto_increment type in the data dictionary from the
1692 value slice
1693
1694 @Note Only to be used on data dictionary keys for the auto_increment type
1695 */
Deserialize(const rocksdb::Slice & s)1696 uint64_t Deserialize(const rocksdb::Slice &s) const {
1697 return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) +
1698 RDB_SIZEOF_AUTO_INCREMENT_VERSION);
1699 }
1700
1701 /*
1702 Gets the type of the key of the key in the data dictionary.
1703
1704 @Note Only to be used on data dictionary keys for the auto_increment type
1705 */
GetKeyType(const rocksdb::Slice & s)1706 uint16_t GetKeyType(const rocksdb::Slice &s) const {
1707 return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data()));
1708 }
1709
1710 /*
1711 Gets the version of the auto_increment value in the data dictionary.
1712
1713 @Note Only to be used on data dictionary value for the auto_increment type
1714 */
GetVersion(const rocksdb::Slice & s)1715 uint16_t GetVersion(const rocksdb::Slice &s) const {
1716 return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data()));
1717 }
1718 };
1719
1720 } // namespace myrocks
1721