1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "table/block_based/block_based_table_factory.h"
11
12 #include <stdint.h>
13
14 #include <cinttypes>
15 #include <memory>
16 #include <string>
17
18 #include "logging/logging.h"
19 #include "port/port.h"
20 #include "rocksdb/cache.h"
21 #include "rocksdb/convenience.h"
22 #include "rocksdb/filter_policy.h"
23 #include "rocksdb/flush_block_policy.h"
24 #include "rocksdb/utilities/options_type.h"
25 #include "table/block_based/block_based_table_builder.h"
26 #include "table/block_based/block_based_table_reader.h"
27 #include "table/format.h"
28 #include "util/mutexlock.h"
29 #include "util/string_util.h"
30
31 namespace ROCKSDB_NAMESPACE {
32
RecordEffectiveSize(size_t len)33 void TailPrefetchStats::RecordEffectiveSize(size_t len) {
34 MutexLock l(&mutex_);
35 if (num_records_ < kNumTracked) {
36 num_records_++;
37 }
38 records_[next_++] = len;
39 if (next_ == kNumTracked) {
40 next_ = 0;
41 }
42 }
43
GetSuggestedPrefetchSize()44 size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
45 std::vector<size_t> sorted;
46 {
47 MutexLock l(&mutex_);
48
49 if (num_records_ == 0) {
50 return 0;
51 }
52 sorted.assign(records_, records_ + num_records_);
53 }
54
55 // Of the historic size, we find the maximum one that satisifis the condtiion
56 // that if prefetching all, less than 1/8 will be wasted.
57 std::sort(sorted.begin(), sorted.end());
58
59 // Assuming we have 5 data points, and after sorting it looks like this:
60 //
61 // +---+
62 // +---+ | |
63 // | | | |
64 // | | | |
65 // | | | |
66 // | | | |
67 // +---+ | | | |
68 // | | | | | |
69 // +---+ | | | | | |
70 // | | | | | | | |
71 // +---+ | | | | | | | |
72 // | | | | | | | | | |
73 // | | | | | | | | | |
74 // | | | | | | | | | |
75 // | | | | | | | | | |
76 // | | | | | | | | | |
77 // +---+ +---+ +---+ +---+ +---+
78 //
79 // and we use every of the value as a candidate, and estimate how much we
80 // wasted, compared to read. For example, when we use the 3rd record
81 // as candiate. This area is what we read:
82 // +---+
83 // +---+ | |
84 // | | | |
85 // | | | |
86 // | | | |
87 // | | | |
88 // *** *** *** ***+ *** *** *** *** **
89 // * | | | | | |
90 // +---+ | | | | | *
91 // * | | | | | | | |
92 // +---+ | | | | | | | *
93 // * | | | | X | | | | |
94 // | | | | | | | | | *
95 // * | | | | | | | | |
96 // | | | | | | | | | *
97 // * | | | | | | | | |
98 // *** *** ***-*** ***--*** ***--*** +****
99 // which is (size of the record) X (number of records).
100 //
101 // While wasted is this area:
102 // +---+
103 // +---+ | |
104 // | | | |
105 // | | | |
106 // | | | |
107 // | | | |
108 // *** *** *** ****---+ | | | |
109 // * * | | | | |
110 // * *-*** *** | | | | |
111 // * * | | | | | | |
112 // *--** *** | | | | | | |
113 // | | | | | X | | | | |
114 // | | | | | | | | | |
115 // | | | | | | | | | |
116 // | | | | | | | | | |
117 // | | | | | | | | | |
118 // +---+ +---+ +---+ +---+ +---+
119 //
120 // Which can be calculated iteratively.
121 // The difference between wasted using 4st and 3rd record, will
122 // be following area:
123 // +---+
124 // +--+ +-+ ++ +-+ +-+ +---+ | |
125 // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
126 // xxxxxxxxxxxxxxxxxxxxxxxx | | | |
127 // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
128 // | xxxxxxxxxxxxxxxxxxxxxxxx | | | |
129 // +-+ +-+ +-+ ++ +---+ +--+ | | |
130 // | | | | | | |
131 // +---+ ++ | | | | | |
132 // | | | | | | X | | |
133 // +---+ ++ | | | | | | | |
134 // | | | | | | | | | |
135 // | | | | | | | | | |
136 // | | | | | | | | | |
137 // | | | | | | | | | |
138 // | | | | | | | | | |
139 // +---+ +---+ +---+ +---+ +---+
140 //
141 // which will be the size difference between 4st and 3rd record,
142 // times 3, which is number of records before the 4st.
143 // Here we assume that all data within the prefetch range will be useful. In
144 // reality, it may not be the case when a partial block is inside the range,
145 // or there are data in the middle that is not read. We ignore those cases
146 // for simplicity.
147 assert(!sorted.empty());
148 size_t prev_size = sorted[0];
149 size_t max_qualified_size = sorted[0];
150 size_t wasted = 0;
151 for (size_t i = 1; i < sorted.size(); i++) {
152 size_t read = sorted[i] * sorted.size();
153 wasted += (sorted[i] - prev_size) * i;
154 if (wasted <= read / 8) {
155 max_qualified_size = sorted[i];
156 }
157 prev_size = sorted[i];
158 }
159 const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB
160 return std::min(kMaxPrefetchSize, max_qualified_size);
161 }
162
163 #ifndef ROCKSDB_LITE
164
// Option name under which the nested metadata cache options are registered
// in the block-based table option map.
const std::string kOptNameMetadataCacheOpts = "metadata_cache_options";
166
167 static std::unordered_map<std::string, PinningTier>
168 pinning_tier_type_string_map = {
169 {"kFallback", PinningTier::kFallback},
170 {"kNone", PinningTier::kNone},
171 {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar},
172 {"kAll", PinningTier::kAll}};
173
174 static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
175 block_base_table_index_type_string_map = {
176 {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch},
177 {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch},
178 {"kTwoLevelIndexSearch",
179 BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch},
180 {"kBinarySearchWithFirstKey",
181 BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
182
183 static std::unordered_map<std::string,
184 BlockBasedTableOptions::DataBlockIndexType>
185 block_base_table_data_block_index_type_string_map = {
186 {"kDataBlockBinarySearch",
187 BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch},
188 {"kDataBlockBinaryAndHash",
189 BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};
190
191 static std::unordered_map<std::string,
192 BlockBasedTableOptions::IndexShorteningMode>
193 block_base_table_index_shortening_mode_string_map = {
194 {"kNoShortening",
195 BlockBasedTableOptions::IndexShorteningMode::kNoShortening},
196 {"kShortenSeparators",
197 BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators},
198 {"kShortenSeparatorsAndSuccessor",
199 BlockBasedTableOptions::IndexShorteningMode::
200 kShortenSeparatorsAndSuccessor}};
201
202 static std::unordered_map<std::string, OptionTypeInfo>
203 metadata_cache_options_type_info = {
204 {"top_level_index_pinning",
205 OptionTypeInfo::Enum<PinningTier>(
206 offsetof(struct MetadataCacheOptions, top_level_index_pinning),
207 &pinning_tier_type_string_map)},
208 {"partition_pinning",
209 OptionTypeInfo::Enum<PinningTier>(
210 offsetof(struct MetadataCacheOptions, partition_pinning),
211 &pinning_tier_type_string_map)},
212 {"unpartitioned_pinning",
213 OptionTypeInfo::Enum<PinningTier>(
214 offsetof(struct MetadataCacheOptions, unpartitioned_pinning),
215 &pinning_tier_type_string_map)}};
216
217 static std::unordered_map<std::string,
218 BlockBasedTableOptions::PrepopulateBlockCache>
219 block_base_table_prepopulate_block_cache_string_map = {
220 {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable},
221 {"kFlushOnly",
222 BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}};
223
224 #endif // ROCKSDB_LITE
225
226 static std::unordered_map<std::string, OptionTypeInfo>
227 block_based_table_type_info = {
228 #ifndef ROCKSDB_LITE
229 /* currently not supported
230 std::shared_ptr<Cache> block_cache = nullptr;
231 std::shared_ptr<Cache> block_cache_compressed = nullptr;
232 */
233 {"flush_block_policy_factory",
234 OptionTypeInfo::AsCustomSharedPtr<FlushBlockPolicyFactory>(
235 offsetof(struct BlockBasedTableOptions,
236 flush_block_policy_factory),
237 OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)},
238 {"cache_index_and_filter_blocks",
239 {offsetof(struct BlockBasedTableOptions,
240 cache_index_and_filter_blocks),
241 OptionType::kBoolean, OptionVerificationType::kNormal,
242 OptionTypeFlags::kNone}},
243 {"cache_index_and_filter_blocks_with_high_priority",
244 {offsetof(struct BlockBasedTableOptions,
245 cache_index_and_filter_blocks_with_high_priority),
246 OptionType::kBoolean, OptionVerificationType::kNormal,
247 OptionTypeFlags::kNone}},
248 {"pin_l0_filter_and_index_blocks_in_cache",
249 {offsetof(struct BlockBasedTableOptions,
250 pin_l0_filter_and_index_blocks_in_cache),
251 OptionType::kBoolean, OptionVerificationType::kNormal,
252 OptionTypeFlags::kNone}},
253 {"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
254 offsetof(struct BlockBasedTableOptions, index_type),
255 &block_base_table_index_type_string_map)},
256 {"hash_index_allow_collision",
257 {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision),
258 OptionType::kBoolean, OptionVerificationType::kNormal,
259 OptionTypeFlags::kNone}},
260 {"data_block_index_type",
261 OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>(
262 offsetof(struct BlockBasedTableOptions, data_block_index_type),
263 &block_base_table_data_block_index_type_string_map)},
264 {"index_shortening",
265 OptionTypeInfo::Enum<BlockBasedTableOptions::IndexShorteningMode>(
266 offsetof(struct BlockBasedTableOptions, index_shortening),
267 &block_base_table_index_shortening_mode_string_map)},
268 {"data_block_hash_table_util_ratio",
269 {offsetof(struct BlockBasedTableOptions,
270 data_block_hash_table_util_ratio),
271 OptionType::kDouble, OptionVerificationType::kNormal,
272 OptionTypeFlags::kNone}},
273 {"checksum",
274 {offsetof(struct BlockBasedTableOptions, checksum),
275 OptionType::kChecksumType, OptionVerificationType::kNormal,
276 OptionTypeFlags::kNone}},
277 {"no_block_cache",
278 {offsetof(struct BlockBasedTableOptions, no_block_cache),
279 OptionType::kBoolean, OptionVerificationType::kNormal,
280 OptionTypeFlags::kNone}},
281 {"block_size",
282 {offsetof(struct BlockBasedTableOptions, block_size),
283 OptionType::kSizeT, OptionVerificationType::kNormal,
284 OptionTypeFlags::kMutable}},
285 {"block_size_deviation",
286 {offsetof(struct BlockBasedTableOptions, block_size_deviation),
287 OptionType::kInt, OptionVerificationType::kNormal,
288 OptionTypeFlags::kNone}},
289 {"block_restart_interval",
290 {offsetof(struct BlockBasedTableOptions, block_restart_interval),
291 OptionType::kInt, OptionVerificationType::kNormal,
292 OptionTypeFlags::kMutable}},
293 {"index_block_restart_interval",
294 {offsetof(struct BlockBasedTableOptions, index_block_restart_interval),
295 OptionType::kInt, OptionVerificationType::kNormal,
296 OptionTypeFlags::kNone}},
297 {"index_per_partition",
298 {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
299 OptionTypeFlags::kNone}},
300 {"metadata_block_size",
301 {offsetof(struct BlockBasedTableOptions, metadata_block_size),
302 OptionType::kUInt64T, OptionVerificationType::kNormal,
303 OptionTypeFlags::kNone}},
304 {"partition_filters",
305 {offsetof(struct BlockBasedTableOptions, partition_filters),
306 OptionType::kBoolean, OptionVerificationType::kNormal,
307 OptionTypeFlags::kNone}},
308 {"optimize_filters_for_memory",
309 {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
310 OptionType::kBoolean, OptionVerificationType::kNormal,
311 OptionTypeFlags::kNone}},
312 {"filter_policy",
313 {offsetof(struct BlockBasedTableOptions, filter_policy),
314 OptionType::kUnknown, OptionVerificationType::kByNameAllowFromNull,
315 OptionTypeFlags::kNone,
316 // Parses the Filter policy
317 [](const ConfigOptions& opts, const std::string&,
__anon0ea3814c0102() 318 const std::string& value, void* addr) {
319 auto* policy =
320 static_cast<std::shared_ptr<const FilterPolicy>*>(addr);
321 return FilterPolicy::CreateFromString(opts, value, policy);
322 },
323 // Converts the FilterPolicy to its string representation
324 [](const ConfigOptions&, const std::string&, const void* addr,
__anon0ea3814c0202() 325 std::string* value) {
326 const auto* policy =
327 static_cast<const std::shared_ptr<const FilterPolicy>*>(addr);
328 if (policy->get()) {
329 *value = (*policy)->Name();
330 } else {
331 *value = kNullptrString;
332 }
333 return Status::OK();
334 },
335 // Compares two FilterPolicy objects for equality
336 [](const ConfigOptions&, const std::string&, const void* addr1,
__anon0ea3814c0302() 337 const void* addr2, std::string*) {
338 const auto* policy1 =
339 static_cast<const std::shared_ptr<const FilterPolicy>*>(addr1)
340 ->get();
341 const auto* policy2 =
342 static_cast<const std::shared_ptr<FilterPolicy>*>(addr2)->get();
343 if (policy1 == policy2) {
344 return true;
345 } else if (policy1 != nullptr && policy2 != nullptr) {
346 return (strcmp(policy1->Name(), policy2->Name()) == 0);
347 } else {
348 return false;
349 }
350 }}},
351 {"whole_key_filtering",
352 {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
353 OptionType::kBoolean, OptionVerificationType::kNormal,
354 OptionTypeFlags::kNone}},
355 {"skip_table_builder_flush",
356 {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
357 OptionTypeFlags::kNone}},
358 {"format_version",
359 {offsetof(struct BlockBasedTableOptions, format_version),
360 OptionType::kUInt32T, OptionVerificationType::kNormal,
361 OptionTypeFlags::kNone}},
362 {"verify_compression",
363 {offsetof(struct BlockBasedTableOptions, verify_compression),
364 OptionType::kBoolean, OptionVerificationType::kNormal,
365 OptionTypeFlags::kNone}},
366 {"read_amp_bytes_per_bit",
367 {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
368 OptionType::kUInt32T, OptionVerificationType::kNormal,
369 OptionTypeFlags::kNone,
370 [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
__anon0ea3814c0402() 371 const std::string& value, void* addr) {
372 // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13
373 // and 6.14. The bug will write out 8 bytes to OPTIONS file from the
374 // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit
375 // which is actually a uint32. Consequently, the value of
376 // read_amp_bytes_per_bit written in the OPTIONS file is wrong.
377 // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit
378 // from OPTIONS file as a uint32. To be able to load OPTIONS file
379 // generated by affected releases before the fix, we need to
380 // manually parse read_amp_bytes_per_bit with this special hack.
381 uint64_t read_amp_bytes_per_bit = ParseUint64(value);
382 *(static_cast<uint32_t*>(addr)) =
383 static_cast<uint32_t>(read_amp_bytes_per_bit);
384 return Status::OK();
385 }}},
386 {"enable_index_compression",
387 {offsetof(struct BlockBasedTableOptions, enable_index_compression),
388 OptionType::kBoolean, OptionVerificationType::kNormal,
389 OptionTypeFlags::kNone}},
390 {"block_align",
391 {offsetof(struct BlockBasedTableOptions, block_align),
392 OptionType::kBoolean, OptionVerificationType::kNormal,
393 OptionTypeFlags::kNone}},
394 {"pin_top_level_index_and_filter",
395 {offsetof(struct BlockBasedTableOptions,
396 pin_top_level_index_and_filter),
397 OptionType::kBoolean, OptionVerificationType::kNormal,
398 OptionTypeFlags::kNone}},
399 {kOptNameMetadataCacheOpts,
400 OptionTypeInfo::Struct(
401 kOptNameMetadataCacheOpts, &metadata_cache_options_type_info,
402 offsetof(struct BlockBasedTableOptions, metadata_cache_options),
403 OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
404 {"block_cache",
405 {offsetof(struct BlockBasedTableOptions, block_cache),
406 OptionType::kUnknown, OptionVerificationType::kNormal,
407 (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
408 // Parses the input vsalue as a Cache
409 [](const ConfigOptions& opts, const std::string&,
__anon0ea3814c0502() 410 const std::string& value, void* addr) {
411 auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
412 return Cache::CreateFromString(opts, value, cache);
413 }}},
414 {"block_cache_compressed",
415 {offsetof(struct BlockBasedTableOptions, block_cache_compressed),
416 OptionType::kUnknown, OptionVerificationType::kNormal,
417 (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
418 // Parses the input vsalue as a Cache
419 [](const ConfigOptions& opts, const std::string&,
__anon0ea3814c0602() 420 const std::string& value, void* addr) {
421 auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
422 return Cache::CreateFromString(opts, value, cache);
423 }}},
424 {"max_auto_readahead_size",
425 {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
426 OptionType::kSizeT, OptionVerificationType::kNormal,
427 OptionTypeFlags::kMutable}},
428 {"prepopulate_block_cache",
429 OptionTypeInfo::Enum<BlockBasedTableOptions::PrepopulateBlockCache>(
430 offsetof(struct BlockBasedTableOptions, prepopulate_block_cache),
431 &block_base_table_prepopulate_block_cache_string_map,
432 OptionTypeFlags::kMutable)},
433
434 #endif // ROCKSDB_LITE
435 };
436
437 // TODO(myabandeh): We should return an error instead of silently changing the
438 // options
BlockBasedTableFactory(const BlockBasedTableOptions & _table_options)439 BlockBasedTableFactory::BlockBasedTableFactory(
440 const BlockBasedTableOptions& _table_options)
441 : table_options_(_table_options) {
442 InitializeOptions();
443 RegisterOptions(&table_options_, &block_based_table_type_info);
444 }
445
InitializeOptions()446 void BlockBasedTableFactory::InitializeOptions() {
447 if (table_options_.flush_block_policy_factory == nullptr) {
448 table_options_.flush_block_policy_factory.reset(
449 new FlushBlockBySizePolicyFactory());
450 }
451 if (table_options_.no_block_cache) {
452 table_options_.block_cache.reset();
453 } else if (table_options_.block_cache == nullptr) {
454 LRUCacheOptions co;
455 co.capacity = 8 << 20;
456 // It makes little sense to pay overhead for mid-point insertion while the
457 // block size is only 8MB.
458 co.high_pri_pool_ratio = 0.0;
459 table_options_.block_cache = NewLRUCache(co);
460 }
461 if (table_options_.block_size_deviation < 0 ||
462 table_options_.block_size_deviation > 100) {
463 table_options_.block_size_deviation = 0;
464 }
465 if (table_options_.block_restart_interval < 1) {
466 table_options_.block_restart_interval = 1;
467 }
468 if (table_options_.index_block_restart_interval < 1) {
469 table_options_.index_block_restart_interval = 1;
470 }
471 if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
472 table_options_.index_block_restart_interval != 1) {
473 // Currently kHashSearch is incompatible with index_block_restart_interval > 1
474 table_options_.index_block_restart_interval = 1;
475 }
476 if (table_options_.partition_filters &&
477 table_options_.index_type !=
478 BlockBasedTableOptions::kTwoLevelIndexSearch) {
479 // We do not support partitioned filters without partitioning indexes
480 table_options_.partition_filters = false;
481 }
482 }
483
PrepareOptions(const ConfigOptions & opts)484 Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
485 InitializeOptions();
486 return TableFactory::PrepareOptions(opts);
487 }
488
NewTableReader(const ReadOptions & ro,const TableReaderOptions & table_reader_options,std::unique_ptr<RandomAccessFileReader> && file,uint64_t file_size,std::unique_ptr<TableReader> * table_reader,bool prefetch_index_and_filter_in_cache) const489 Status BlockBasedTableFactory::NewTableReader(
490 const ReadOptions& ro, const TableReaderOptions& table_reader_options,
491 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
492 std::unique_ptr<TableReader>* table_reader,
493 bool prefetch_index_and_filter_in_cache) const {
494 return BlockBasedTable::Open(
495 ro, table_reader_options.ioptions, table_reader_options.env_options,
496 table_options_, table_reader_options.internal_comparator, std::move(file),
497 file_size, table_reader, table_reader_options.prefix_extractor,
498 prefetch_index_and_filter_in_cache, table_reader_options.skip_filters,
499 table_reader_options.level, table_reader_options.immortal,
500 table_reader_options.largest_seqno,
501 table_reader_options.force_direct_prefetch, &tail_prefetch_stats_,
502 table_reader_options.block_cache_tracer,
503 table_reader_options.max_file_size_for_l0_meta_pin,
504 table_reader_options.cur_db_session_id,
505 table_reader_options.cur_file_num);
506 }
507
NewTableBuilder(const TableBuilderOptions & table_builder_options,WritableFileWriter * file) const508 TableBuilder* BlockBasedTableFactory::NewTableBuilder(
509 const TableBuilderOptions& table_builder_options,
510 WritableFileWriter* file) const {
511 return new BlockBasedTableBuilder(table_options_, table_builder_options,
512 file);
513 }
514
ValidateOptions(const DBOptions & db_opts,const ColumnFamilyOptions & cf_opts) const515 Status BlockBasedTableFactory::ValidateOptions(
516 const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
517 if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
518 cf_opts.prefix_extractor == nullptr) {
519 return Status::InvalidArgument(
520 "Hash index is specified for block-based "
521 "table, but prefix_extractor is not given");
522 }
523 if (table_options_.cache_index_and_filter_blocks &&
524 table_options_.no_block_cache) {
525 return Status::InvalidArgument(
526 "Enable cache_index_and_filter_blocks, "
527 ", but block cache is disabled");
528 }
529 if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
530 table_options_.no_block_cache) {
531 return Status::InvalidArgument(
532 "Enable pin_l0_filter_and_index_blocks_in_cache, "
533 ", but block cache is disabled");
534 }
535 if (!BlockBasedTableSupportedVersion(table_options_.format_version)) {
536 return Status::InvalidArgument(
537 "Unsupported BlockBasedTable format_version. Please check "
538 "include/rocksdb/table.h for more info");
539 }
540 if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
541 return Status::InvalidArgument(
542 "Enable block_align, but compression "
543 "enabled");
544 }
545 if (table_options_.block_align &&
546 (table_options_.block_size & (table_options_.block_size - 1))) {
547 return Status::InvalidArgument(
548 "Block alignment requested but block size is not a power of 2");
549 }
550 if (table_options_.block_size > port::kMaxUint32) {
551 return Status::InvalidArgument(
552 "block size exceeds maximum number (4GiB) allowed");
553 }
554 if (table_options_.data_block_index_type ==
555 BlockBasedTableOptions::kDataBlockBinaryAndHash &&
556 table_options_.data_block_hash_table_util_ratio <= 0) {
557 return Status::InvalidArgument(
558 "data_block_hash_table_util_ratio should be greater than 0 when "
559 "data_block_index_type is set to kDataBlockBinaryAndHash");
560 }
561 if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
562 // TODO(myabandeh): support it
563 return Status::InvalidArgument(
564 "max_successive_merges larger than 0 is currently inconsistent with "
565 "unordered_write");
566 }
567 return TableFactory::ValidateOptions(db_opts, cf_opts);
568 }
569
GetPrintableOptions() const570 std::string BlockBasedTableFactory::GetPrintableOptions() const {
571 std::string ret;
572 ret.reserve(20000);
573 const int kBufferSize = 200;
574 char buffer[kBufferSize];
575
576 snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n",
577 table_options_.flush_block_policy_factory->Name(),
578 static_cast<void*>(table_options_.flush_block_policy_factory.get()));
579 ret.append(buffer);
580 snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n",
581 table_options_.cache_index_and_filter_blocks);
582 ret.append(buffer);
583 snprintf(buffer, kBufferSize,
584 " cache_index_and_filter_blocks_with_high_priority: %d\n",
585 table_options_.cache_index_and_filter_blocks_with_high_priority);
586 ret.append(buffer);
587 snprintf(buffer, kBufferSize,
588 " pin_l0_filter_and_index_blocks_in_cache: %d\n",
589 table_options_.pin_l0_filter_and_index_blocks_in_cache);
590 ret.append(buffer);
591 snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n",
592 table_options_.pin_top_level_index_and_filter);
593 ret.append(buffer);
594 snprintf(buffer, kBufferSize, " index_type: %d\n",
595 table_options_.index_type);
596 ret.append(buffer);
597 snprintf(buffer, kBufferSize, " data_block_index_type: %d\n",
598 table_options_.data_block_index_type);
599 ret.append(buffer);
600 snprintf(buffer, kBufferSize, " index_shortening: %d\n",
601 static_cast<int>(table_options_.index_shortening));
602 ret.append(buffer);
603 snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n",
604 table_options_.data_block_hash_table_util_ratio);
605 ret.append(buffer);
606 snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n",
607 table_options_.hash_index_allow_collision);
608 ret.append(buffer);
609 snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum);
610 ret.append(buffer);
611 snprintf(buffer, kBufferSize, " no_block_cache: %d\n",
612 table_options_.no_block_cache);
613 ret.append(buffer);
614 snprintf(buffer, kBufferSize, " block_cache: %p\n",
615 static_cast<void*>(table_options_.block_cache.get()));
616 ret.append(buffer);
617 if (table_options_.block_cache) {
618 const char* block_cache_name = table_options_.block_cache->Name();
619 if (block_cache_name != nullptr) {
620 snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
621 block_cache_name);
622 ret.append(buffer);
623 }
624 ret.append(" block_cache_options:\n");
625 ret.append(table_options_.block_cache->GetPrintableOptions());
626 }
627 snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n",
628 static_cast<void*>(table_options_.block_cache_compressed.get()));
629 ret.append(buffer);
630 if (table_options_.block_cache_compressed) {
631 const char* block_cache_compressed_name =
632 table_options_.block_cache_compressed->Name();
633 if (block_cache_compressed_name != nullptr) {
634 snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
635 block_cache_compressed_name);
636 ret.append(buffer);
637 }
638 ret.append(" block_cache_compressed_options:\n");
639 ret.append(table_options_.block_cache_compressed->GetPrintableOptions());
640 }
641 snprintf(buffer, kBufferSize, " persistent_cache: %p\n",
642 static_cast<void*>(table_options_.persistent_cache.get()));
643 ret.append(buffer);
644 if (table_options_.persistent_cache) {
645 snprintf(buffer, kBufferSize, " persistent_cache_options:\n");
646 ret.append(buffer);
647 ret.append(table_options_.persistent_cache->GetPrintableOptions());
648 }
649 snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n",
650 table_options_.block_size);
651 ret.append(buffer);
652 snprintf(buffer, kBufferSize, " block_size_deviation: %d\n",
653 table_options_.block_size_deviation);
654 ret.append(buffer);
655 snprintf(buffer, kBufferSize, " block_restart_interval: %d\n",
656 table_options_.block_restart_interval);
657 ret.append(buffer);
658 snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n",
659 table_options_.index_block_restart_interval);
660 ret.append(buffer);
661 snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n",
662 table_options_.metadata_block_size);
663 ret.append(buffer);
664 snprintf(buffer, kBufferSize, " partition_filters: %d\n",
665 table_options_.partition_filters);
666 ret.append(buffer);
667 snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n",
668 table_options_.use_delta_encoding);
669 ret.append(buffer);
670 snprintf(buffer, kBufferSize, " filter_policy: %s\n",
671 table_options_.filter_policy == nullptr
672 ? "nullptr"
673 : table_options_.filter_policy->Name());
674 ret.append(buffer);
675 snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n",
676 table_options_.whole_key_filtering);
677 ret.append(buffer);
678 snprintf(buffer, kBufferSize, " verify_compression: %d\n",
679 table_options_.verify_compression);
680 ret.append(buffer);
681 snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n",
682 table_options_.read_amp_bytes_per_bit);
683 ret.append(buffer);
684 snprintf(buffer, kBufferSize, " format_version: %d\n",
685 table_options_.format_version);
686 ret.append(buffer);
687 snprintf(buffer, kBufferSize, " enable_index_compression: %d\n",
688 table_options_.enable_index_compression);
689 ret.append(buffer);
690 snprintf(buffer, kBufferSize, " block_align: %d\n",
691 table_options_.block_align);
692 ret.append(buffer);
693 snprintf(buffer, kBufferSize,
694 " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
695 table_options_.max_auto_readahead_size);
696 ret.append(buffer);
697 snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n",
698 static_cast<int>(table_options_.prepopulate_block_cache));
699 ret.append(buffer);
700 return ret;
701 }
702
GetOptionsPtr(const std::string & name) const703 const void* BlockBasedTableFactory::GetOptionsPtr(
704 const std::string& name) const {
705 if (name == kBlockCacheOpts()) {
706 if (table_options_.no_block_cache) {
707 return nullptr;
708 } else {
709 return table_options_.block_cache.get();
710 }
711 } else {
712 return TableFactory::GetOptionsPtr(name);
713 }
714 }
715
716 #ifndef ROCKSDB_LITE
717 // Take a default BlockBasedTableOptions "table_options" in addition to a
718 // map "opts_map" of option name to option value to construct the new
719 // BlockBasedTableOptions "new_table_options".
720 //
// Below are instructions on how to configure some non-primitive-typed
// options in BlockBasedTableOptions:
723 //
724 // * filter_policy:
725 // We currently only support the following FilterPolicy in the convenience
726 // functions:
727 // - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
728 // to specify BloomFilter. The above string is equivalent to calling
729 // NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
730 // [Example]:
731 // - Pass {"filter_policy", "bloomfilter:4:true"} in
732 // GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
733 // per key and use_block_based_builder enabled.
734 //
735 // * block_cache / block_cache_compressed:
736 // We currently only support LRU cache in the GetOptions API. The LRU
737 // cache can be set by directly specifying its size.
738 // [Example]:
739 // - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
740 // equivalent to setting block_cache using NewLRUCache(1024 * 1024).
741 //
742 // @param table_options the default options of the output "new_table_options".
743 // @param opts_map an option name to value map for specifying how
744 // "new_table_options" should be set.
745 // @param new_table_options the resulting options based on "table_options"
746 // with the change specified in "opts_map".
// @param input_strings_escaped when set to true, each escaped character
//     prefixed by '\' in the values of the opts_map will be further converted
//     back to the raw string before assigning to the associated options.
750 // @param ignore_unknown_options when set to true, unknown options are ignored
751 // instead of resulting in an unknown-option error.
752 // @return Status::OK() on success. Otherwise, a non-ok status indicating
753 // error will be returned, and "new_table_options" will be set to
754 // "table_options".
ParseOption(const ConfigOptions & config_options,const OptionTypeInfo & opt_info,const std::string & opt_name,const std::string & opt_value,void * opt_ptr)755 Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
756 const OptionTypeInfo& opt_info,
757 const std::string& opt_name,
758 const std::string& opt_value,
759 void* opt_ptr) {
760 Status status = TableFactory::ParseOption(config_options, opt_info, opt_name,
761 opt_value, opt_ptr);
762 if (config_options.input_strings_escaped && !status.ok()) { // Got an error
763 // !input_strings_escaped indicates the old API, where everything is
764 // parsable.
765 if (opt_info.IsByName()) {
766 status = Status::OK();
767 }
768 }
769 return status;
770 }
771
GetBlockBasedTableOptionsFromString(const BlockBasedTableOptions & table_options,const std::string & opts_str,BlockBasedTableOptions * new_table_options)772 Status GetBlockBasedTableOptionsFromString(
773 const BlockBasedTableOptions& table_options, const std::string& opts_str,
774 BlockBasedTableOptions* new_table_options) {
775 ConfigOptions config_options;
776 config_options.input_strings_escaped = false;
777 config_options.ignore_unknown_options = false;
778 config_options.invoke_prepare_options = false;
779 return GetBlockBasedTableOptionsFromString(config_options, table_options,
780 opts_str, new_table_options);
781 }
GetBlockBasedTableOptionsFromString(const ConfigOptions & config_options,const BlockBasedTableOptions & table_options,const std::string & opts_str,BlockBasedTableOptions * new_table_options)782 Status GetBlockBasedTableOptionsFromString(
783 const ConfigOptions& config_options,
784 const BlockBasedTableOptions& table_options, const std::string& opts_str,
785 BlockBasedTableOptions* new_table_options) {
786 std::unordered_map<std::string, std::string> opts_map;
787 Status s = StringToMap(opts_str, &opts_map);
788 if (!s.ok()) {
789 return s;
790 }
791 s = GetBlockBasedTableOptionsFromMap(config_options, table_options, opts_map,
792 new_table_options);
793 // Translate any errors (NotFound, NotSupported, to InvalidArgument
794 if (s.ok() || s.IsInvalidArgument()) {
795 return s;
796 } else {
797 return Status::InvalidArgument(s.getState());
798 }
799 }
800
GetBlockBasedTableOptionsFromMap(const BlockBasedTableOptions & table_options,const std::unordered_map<std::string,std::string> & opts_map,BlockBasedTableOptions * new_table_options,bool input_strings_escaped,bool ignore_unknown_options)801 Status GetBlockBasedTableOptionsFromMap(
802 const BlockBasedTableOptions& table_options,
803 const std::unordered_map<std::string, std::string>& opts_map,
804 BlockBasedTableOptions* new_table_options, bool input_strings_escaped,
805 bool ignore_unknown_options) {
806 ConfigOptions config_options;
807 config_options.input_strings_escaped = input_strings_escaped;
808 config_options.ignore_unknown_options = ignore_unknown_options;
809 config_options.invoke_prepare_options = false;
810
811 return GetBlockBasedTableOptionsFromMap(config_options, table_options,
812 opts_map, new_table_options);
813 }
814
GetBlockBasedTableOptionsFromMap(const ConfigOptions & config_options,const BlockBasedTableOptions & table_options,const std::unordered_map<std::string,std::string> & opts_map,BlockBasedTableOptions * new_table_options)815 Status GetBlockBasedTableOptionsFromMap(
816 const ConfigOptions& config_options,
817 const BlockBasedTableOptions& table_options,
818 const std::unordered_map<std::string, std::string>& opts_map,
819 BlockBasedTableOptions* new_table_options) {
820 assert(new_table_options);
821 BlockBasedTableFactory bbtf(table_options);
822 Status s = bbtf.ConfigureFromMap(config_options, opts_map);
823 if (s.ok()) {
824 *new_table_options = *(bbtf.GetOptions<BlockBasedTableOptions>());
825 } else {
826 *new_table_options = table_options;
827 }
828 return s;
829 }
830 #endif // !ROCKSDB_LITE
831
NewBlockBasedTableFactory(const BlockBasedTableOptions & _table_options)832 TableFactory* NewBlockBasedTableFactory(
833 const BlockBasedTableOptions& _table_options) {
834 return new BlockBasedTableFactory(_table_options);
835 }
836
837 const std::string BlockBasedTablePropertyNames::kIndexType =
838 "rocksdb.block.based.table.index.type";
839 const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
840 "rocksdb.block.based.table.whole.key.filtering";
841 const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
842 "rocksdb.block.based.table.prefix.filtering";
843 const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
844 const std::string kHashIndexPrefixesMetadataBlock =
845 "rocksdb.hashindex.metadata";
846 const std::string kPropTrue = "1";
847 const std::string kPropFalse = "0";
848
849 } // namespace ROCKSDB_NAMESPACE
850