1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #include "table/block_based/block_based_table_factory.h"
11 
12 #include <stdint.h>
13 
14 #include <cinttypes>
15 #include <memory>
16 #include <string>
17 
18 #include "logging/logging.h"
19 #include "port/port.h"
20 #include "rocksdb/cache.h"
21 #include "rocksdb/convenience.h"
22 #include "rocksdb/filter_policy.h"
23 #include "rocksdb/flush_block_policy.h"
24 #include "rocksdb/utilities/options_type.h"
25 #include "table/block_based/block_based_table_builder.h"
26 #include "table/block_based/block_based_table_reader.h"
27 #include "table/format.h"
28 #include "util/mutexlock.h"
29 #include "util/string_util.h"
30 
31 namespace ROCKSDB_NAMESPACE {
32 
RecordEffectiveSize(size_t len)33 void TailPrefetchStats::RecordEffectiveSize(size_t len) {
34   MutexLock l(&mutex_);
35   if (num_records_ < kNumTracked) {
36     num_records_++;
37   }
38   records_[next_++] = len;
39   if (next_ == kNumTracked) {
40     next_ = 0;
41   }
42 }
43 
GetSuggestedPrefetchSize()44 size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
45   std::vector<size_t> sorted;
46   {
47     MutexLock l(&mutex_);
48 
49     if (num_records_ == 0) {
50       return 0;
51     }
52     sorted.assign(records_, records_ + num_records_);
53   }
54 
55   // Of the historic size, we find the maximum one that satisifis the condtiion
56   // that if prefetching all, less than 1/8 will be wasted.
57   std::sort(sorted.begin(), sorted.end());
58 
59   // Assuming we have 5 data points, and after sorting it looks like this:
60   //
61   //                                     +---+
62   //                             +---+   |   |
63   //                             |   |   |   |
64   //                             |   |   |   |
65   //                             |   |   |   |
66   //                             |   |   |   |
67   //                    +---+    |   |   |   |
68   //                    |   |    |   |   |   |
69   //           +---+    |   |    |   |   |   |
70   //           |   |    |   |    |   |   |   |
71   //  +---+    |   |    |   |    |   |   |   |
72   //  |   |    |   |    |   |    |   |   |   |
73   //  |   |    |   |    |   |    |   |   |   |
74   //  |   |    |   |    |   |    |   |   |   |
75   //  |   |    |   |    |   |    |   |   |   |
76   //  |   |    |   |    |   |    |   |   |   |
77   //  +---+    +---+    +---+    +---+   +---+
78   //
79   // and we use every of the value as a candidate, and estimate how much we
80   // wasted, compared to read. For example, when we use the 3rd record
81   // as candiate. This area is what we read:
82   //                                     +---+
83   //                             +---+   |   |
84   //                             |   |   |   |
85   //                             |   |   |   |
86   //                             |   |   |   |
87   //                             |   |   |   |
88   //  ***  ***  ***  ***+ ***  ***  *** *** **
89   //  *                 |   |    |   |   |   |
90   //           +---+    |   |    |   |   |   *
91   //  *        |   |    |   |    |   |   |   |
92   //  +---+    |   |    |   |    |   |   |   *
93   //  *   |    |   |    | X |    |   |   |   |
94   //  |   |    |   |    |   |    |   |   |   *
95   //  *   |    |   |    |   |    |   |   |   |
96   //  |   |    |   |    |   |    |   |   |   *
97   //  *   |    |   |    |   |    |   |   |   |
98   //  *** *** ***-***  ***--*** ***--*** +****
99   // which is (size of the record) X (number of records).
100   //
101   // While wasted is this area:
102   //                                     +---+
103   //                             +---+   |   |
104   //                             |   |   |   |
105   //                             |   |   |   |
106   //                             |   |   |   |
107   //                             |   |   |   |
108   //  ***  ***  ***  ****---+    |   |   |   |
109   //  *                 *   |    |   |   |   |
110   //  *        *-***  ***   |    |   |   |   |
111   //  *        *   |    |   |    |   |   |   |
112   //  *--**  ***   |    |   |    |   |   |   |
113   //  |   |    |   |    | X |    |   |   |   |
114   //  |   |    |   |    |   |    |   |   |   |
115   //  |   |    |   |    |   |    |   |   |   |
116   //  |   |    |   |    |   |    |   |   |   |
117   //  |   |    |   |    |   |    |   |   |   |
118   //  +---+    +---+    +---+    +---+   +---+
119   //
120   // Which can be calculated iteratively.
121   // The difference between wasted using 4st and 3rd record, will
122   // be following area:
123   //                                     +---+
124   //  +--+  +-+   ++  +-+  +-+   +---+   |   |
125   //  + xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
126   //    xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
127   //  + xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
128   //  | xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
129   //  +-+ +-+  +-+  ++  +---+ +--+   |   |   |
130   //  |                 |   |    |   |   |   |
131   //           +---+ ++ |   |    |   |   |   |
132   //  |        |   |    |   |    | X |   |   |
133   //  +---+ ++ |   |    |   |    |   |   |   |
134   //  |   |    |   |    |   |    |   |   |   |
135   //  |   |    |   |    |   |    |   |   |   |
136   //  |   |    |   |    |   |    |   |   |   |
137   //  |   |    |   |    |   |    |   |   |   |
138   //  |   |    |   |    |   |    |   |   |   |
139   //  +---+    +---+    +---+    +---+   +---+
140   //
141   // which will be the size difference between 4st and 3rd record,
142   // times 3, which is number of records before the 4st.
143   // Here we assume that all data within the prefetch range will be useful. In
144   // reality, it may not be the case when a partial block is inside the range,
145   // or there are data in the middle that is not read. We ignore those cases
146   // for simplicity.
147   assert(!sorted.empty());
148   size_t prev_size = sorted[0];
149   size_t max_qualified_size = sorted[0];
150   size_t wasted = 0;
151   for (size_t i = 1; i < sorted.size(); i++) {
152     size_t read = sorted[i] * sorted.size();
153     wasted += (sorted[i] - prev_size) * i;
154     if (wasted <= read / 8) {
155       max_qualified_size = sorted[i];
156     }
157     prev_size = sorted[i];
158   }
159   const size_t kMaxPrefetchSize = 512 * 1024;  // Never exceed 512KB
160   return std::min(kMaxPrefetchSize, max_qualified_size);
161 }
162 
163 #ifndef ROCKSDB_LITE
164 
// Option name under which the nested MetadataCacheOptions struct is registered
// in block_based_table_type_info.
const std::string kOptNameMetadataCacheOpts("metadata_cache_options");
166 
167 static std::unordered_map<std::string, PinningTier>
168     pinning_tier_type_string_map = {
169         {"kFallback", PinningTier::kFallback},
170         {"kNone", PinningTier::kNone},
171         {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar},
172         {"kAll", PinningTier::kAll}};
173 
174 static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
175     block_base_table_index_type_string_map = {
176         {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch},
177         {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch},
178         {"kTwoLevelIndexSearch",
179          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch},
180         {"kBinarySearchWithFirstKey",
181          BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
182 
183 static std::unordered_map<std::string,
184                           BlockBasedTableOptions::DataBlockIndexType>
185     block_base_table_data_block_index_type_string_map = {
186         {"kDataBlockBinarySearch",
187          BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch},
188         {"kDataBlockBinaryAndHash",
189          BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};
190 
191 static std::unordered_map<std::string,
192                           BlockBasedTableOptions::IndexShorteningMode>
193     block_base_table_index_shortening_mode_string_map = {
194         {"kNoShortening",
195          BlockBasedTableOptions::IndexShorteningMode::kNoShortening},
196         {"kShortenSeparators",
197          BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators},
198         {"kShortenSeparatorsAndSuccessor",
199          BlockBasedTableOptions::IndexShorteningMode::
200              kShortenSeparatorsAndSuccessor}};
201 
202 static std::unordered_map<std::string, OptionTypeInfo>
203     metadata_cache_options_type_info = {
204         {"top_level_index_pinning",
205          OptionTypeInfo::Enum<PinningTier>(
206              offsetof(struct MetadataCacheOptions, top_level_index_pinning),
207              &pinning_tier_type_string_map)},
208         {"partition_pinning",
209          OptionTypeInfo::Enum<PinningTier>(
210              offsetof(struct MetadataCacheOptions, partition_pinning),
211              &pinning_tier_type_string_map)},
212         {"unpartitioned_pinning",
213          OptionTypeInfo::Enum<PinningTier>(
214              offsetof(struct MetadataCacheOptions, unpartitioned_pinning),
215              &pinning_tier_type_string_map)}};
216 
217 static std::unordered_map<std::string,
218                           BlockBasedTableOptions::PrepopulateBlockCache>
219     block_base_table_prepopulate_block_cache_string_map = {
220         {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable},
221         {"kFlushOnly",
222          BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}};
223 
224 #endif  // ROCKSDB_LITE
225 
226 static std::unordered_map<std::string, OptionTypeInfo>
227     block_based_table_type_info = {
228 #ifndef ROCKSDB_LITE
229         /* currently not supported
230           std::shared_ptr<Cache> block_cache = nullptr;
231           std::shared_ptr<Cache> block_cache_compressed = nullptr;
232          */
233         {"flush_block_policy_factory",
234          OptionTypeInfo::AsCustomSharedPtr<FlushBlockPolicyFactory>(
235              offsetof(struct BlockBasedTableOptions,
236                       flush_block_policy_factory),
237              OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)},
238         {"cache_index_and_filter_blocks",
239          {offsetof(struct BlockBasedTableOptions,
240                    cache_index_and_filter_blocks),
241           OptionType::kBoolean, OptionVerificationType::kNormal,
242           OptionTypeFlags::kNone}},
243         {"cache_index_and_filter_blocks_with_high_priority",
244          {offsetof(struct BlockBasedTableOptions,
245                    cache_index_and_filter_blocks_with_high_priority),
246           OptionType::kBoolean, OptionVerificationType::kNormal,
247           OptionTypeFlags::kNone}},
248         {"pin_l0_filter_and_index_blocks_in_cache",
249          {offsetof(struct BlockBasedTableOptions,
250                    pin_l0_filter_and_index_blocks_in_cache),
251           OptionType::kBoolean, OptionVerificationType::kNormal,
252           OptionTypeFlags::kNone}},
253         {"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
254                            offsetof(struct BlockBasedTableOptions, index_type),
255                            &block_base_table_index_type_string_map)},
256         {"hash_index_allow_collision",
257          {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision),
258           OptionType::kBoolean, OptionVerificationType::kNormal,
259           OptionTypeFlags::kNone}},
260         {"data_block_index_type",
261          OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>(
262              offsetof(struct BlockBasedTableOptions, data_block_index_type),
263              &block_base_table_data_block_index_type_string_map)},
264         {"index_shortening",
265          OptionTypeInfo::Enum<BlockBasedTableOptions::IndexShorteningMode>(
266              offsetof(struct BlockBasedTableOptions, index_shortening),
267              &block_base_table_index_shortening_mode_string_map)},
268         {"data_block_hash_table_util_ratio",
269          {offsetof(struct BlockBasedTableOptions,
270                    data_block_hash_table_util_ratio),
271           OptionType::kDouble, OptionVerificationType::kNormal,
272           OptionTypeFlags::kNone}},
273         {"checksum",
274          {offsetof(struct BlockBasedTableOptions, checksum),
275           OptionType::kChecksumType, OptionVerificationType::kNormal,
276           OptionTypeFlags::kNone}},
277         {"no_block_cache",
278          {offsetof(struct BlockBasedTableOptions, no_block_cache),
279           OptionType::kBoolean, OptionVerificationType::kNormal,
280           OptionTypeFlags::kNone}},
281         {"block_size",
282          {offsetof(struct BlockBasedTableOptions, block_size),
283           OptionType::kSizeT, OptionVerificationType::kNormal,
284           OptionTypeFlags::kMutable}},
285         {"block_size_deviation",
286          {offsetof(struct BlockBasedTableOptions, block_size_deviation),
287           OptionType::kInt, OptionVerificationType::kNormal,
288           OptionTypeFlags::kNone}},
289         {"block_restart_interval",
290          {offsetof(struct BlockBasedTableOptions, block_restart_interval),
291           OptionType::kInt, OptionVerificationType::kNormal,
292           OptionTypeFlags::kMutable}},
293         {"index_block_restart_interval",
294          {offsetof(struct BlockBasedTableOptions, index_block_restart_interval),
295           OptionType::kInt, OptionVerificationType::kNormal,
296           OptionTypeFlags::kNone}},
297         {"index_per_partition",
298          {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
299           OptionTypeFlags::kNone}},
300         {"metadata_block_size",
301          {offsetof(struct BlockBasedTableOptions, metadata_block_size),
302           OptionType::kUInt64T, OptionVerificationType::kNormal,
303           OptionTypeFlags::kNone}},
304         {"partition_filters",
305          {offsetof(struct BlockBasedTableOptions, partition_filters),
306           OptionType::kBoolean, OptionVerificationType::kNormal,
307           OptionTypeFlags::kNone}},
308         {"optimize_filters_for_memory",
309          {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
310           OptionType::kBoolean, OptionVerificationType::kNormal,
311           OptionTypeFlags::kNone}},
312         {"filter_policy",
313          {offsetof(struct BlockBasedTableOptions, filter_policy),
314           OptionType::kUnknown, OptionVerificationType::kByNameAllowFromNull,
315           OptionTypeFlags::kNone,
316           // Parses the Filter policy
317           [](const ConfigOptions& opts, const std::string&,
__anon0ea3814c0102() 318              const std::string& value, void* addr) {
319             auto* policy =
320                 static_cast<std::shared_ptr<const FilterPolicy>*>(addr);
321             return FilterPolicy::CreateFromString(opts, value, policy);
322           },
323           // Converts the FilterPolicy to its string representation
324           [](const ConfigOptions&, const std::string&, const void* addr,
__anon0ea3814c0202() 325              std::string* value) {
326             const auto* policy =
327                 static_cast<const std::shared_ptr<const FilterPolicy>*>(addr);
328             if (policy->get()) {
329               *value = (*policy)->Name();
330             } else {
331               *value = kNullptrString;
332             }
333             return Status::OK();
334           },
335           // Compares two FilterPolicy objects for equality
336           [](const ConfigOptions&, const std::string&, const void* addr1,
__anon0ea3814c0302() 337              const void* addr2, std::string*) {
338             const auto* policy1 =
339                 static_cast<const std::shared_ptr<const FilterPolicy>*>(addr1)
340                     ->get();
341             const auto* policy2 =
342                 static_cast<const std::shared_ptr<FilterPolicy>*>(addr2)->get();
343             if (policy1 == policy2) {
344               return true;
345             } else if (policy1 != nullptr && policy2 != nullptr) {
346               return (strcmp(policy1->Name(), policy2->Name()) == 0);
347             } else {
348               return false;
349             }
350           }}},
351         {"whole_key_filtering",
352          {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
353           OptionType::kBoolean, OptionVerificationType::kNormal,
354           OptionTypeFlags::kNone}},
355         {"skip_table_builder_flush",
356          {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
357           OptionTypeFlags::kNone}},
358         {"format_version",
359          {offsetof(struct BlockBasedTableOptions, format_version),
360           OptionType::kUInt32T, OptionVerificationType::kNormal,
361           OptionTypeFlags::kNone}},
362         {"verify_compression",
363          {offsetof(struct BlockBasedTableOptions, verify_compression),
364           OptionType::kBoolean, OptionVerificationType::kNormal,
365           OptionTypeFlags::kNone}},
366         {"read_amp_bytes_per_bit",
367          {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
368           OptionType::kUInt32T, OptionVerificationType::kNormal,
369           OptionTypeFlags::kNone,
370           [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
__anon0ea3814c0402() 371              const std::string& value, void* addr) {
372             // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13
373             // and 6.14. The bug will write out 8 bytes to OPTIONS file from the
374             // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit
375             // which is actually a uint32. Consequently, the value of
376             // read_amp_bytes_per_bit written in the OPTIONS file is wrong.
377             // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit
378             // from OPTIONS file as a uint32. To be able to load OPTIONS file
379             // generated by affected releases before the fix, we need to
380             // manually parse read_amp_bytes_per_bit with this special hack.
381             uint64_t read_amp_bytes_per_bit = ParseUint64(value);
382             *(static_cast<uint32_t*>(addr)) =
383                 static_cast<uint32_t>(read_amp_bytes_per_bit);
384             return Status::OK();
385           }}},
386         {"enable_index_compression",
387          {offsetof(struct BlockBasedTableOptions, enable_index_compression),
388           OptionType::kBoolean, OptionVerificationType::kNormal,
389           OptionTypeFlags::kNone}},
390         {"block_align",
391          {offsetof(struct BlockBasedTableOptions, block_align),
392           OptionType::kBoolean, OptionVerificationType::kNormal,
393           OptionTypeFlags::kNone}},
394         {"pin_top_level_index_and_filter",
395          {offsetof(struct BlockBasedTableOptions,
396                    pin_top_level_index_and_filter),
397           OptionType::kBoolean, OptionVerificationType::kNormal,
398           OptionTypeFlags::kNone}},
399         {kOptNameMetadataCacheOpts,
400          OptionTypeInfo::Struct(
401              kOptNameMetadataCacheOpts, &metadata_cache_options_type_info,
402              offsetof(struct BlockBasedTableOptions, metadata_cache_options),
403              OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
404         {"block_cache",
405          {offsetof(struct BlockBasedTableOptions, block_cache),
406           OptionType::kUnknown, OptionVerificationType::kNormal,
407           (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
408           // Parses the input vsalue as a Cache
409           [](const ConfigOptions& opts, const std::string&,
__anon0ea3814c0502() 410              const std::string& value, void* addr) {
411             auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
412             return Cache::CreateFromString(opts, value, cache);
413           }}},
414         {"block_cache_compressed",
415          {offsetof(struct BlockBasedTableOptions, block_cache_compressed),
416           OptionType::kUnknown, OptionVerificationType::kNormal,
417           (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
418           // Parses the input vsalue as a Cache
419           [](const ConfigOptions& opts, const std::string&,
__anon0ea3814c0602() 420              const std::string& value, void* addr) {
421             auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
422             return Cache::CreateFromString(opts, value, cache);
423           }}},
424         {"max_auto_readahead_size",
425          {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
426           OptionType::kSizeT, OptionVerificationType::kNormal,
427           OptionTypeFlags::kMutable}},
428         {"prepopulate_block_cache",
429          OptionTypeInfo::Enum<BlockBasedTableOptions::PrepopulateBlockCache>(
430              offsetof(struct BlockBasedTableOptions, prepopulate_block_cache),
431              &block_base_table_prepopulate_block_cache_string_map,
432              OptionTypeFlags::kMutable)},
433 
434 #endif  // ROCKSDB_LITE
435 };
436 
437 // TODO(myabandeh): We should return an error instead of silently changing the
438 // options
BlockBasedTableFactory(const BlockBasedTableOptions & _table_options)439 BlockBasedTableFactory::BlockBasedTableFactory(
440     const BlockBasedTableOptions& _table_options)
441     : table_options_(_table_options) {
442   InitializeOptions();
443   RegisterOptions(&table_options_, &block_based_table_type_info);
444 }
445 
InitializeOptions()446 void BlockBasedTableFactory::InitializeOptions() {
447   if (table_options_.flush_block_policy_factory == nullptr) {
448     table_options_.flush_block_policy_factory.reset(
449         new FlushBlockBySizePolicyFactory());
450   }
451   if (table_options_.no_block_cache) {
452     table_options_.block_cache.reset();
453   } else if (table_options_.block_cache == nullptr) {
454     LRUCacheOptions co;
455     co.capacity = 8 << 20;
456     // It makes little sense to pay overhead for mid-point insertion while the
457     // block size is only 8MB.
458     co.high_pri_pool_ratio = 0.0;
459     table_options_.block_cache = NewLRUCache(co);
460   }
461   if (table_options_.block_size_deviation < 0 ||
462       table_options_.block_size_deviation > 100) {
463     table_options_.block_size_deviation = 0;
464   }
465   if (table_options_.block_restart_interval < 1) {
466     table_options_.block_restart_interval = 1;
467   }
468   if (table_options_.index_block_restart_interval < 1) {
469     table_options_.index_block_restart_interval = 1;
470   }
471   if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
472       table_options_.index_block_restart_interval != 1) {
473     // Currently kHashSearch is incompatible with index_block_restart_interval > 1
474     table_options_.index_block_restart_interval = 1;
475   }
476   if (table_options_.partition_filters &&
477       table_options_.index_type !=
478           BlockBasedTableOptions::kTwoLevelIndexSearch) {
479     // We do not support partitioned filters without partitioning indexes
480     table_options_.partition_filters = false;
481   }
482 }
483 
PrepareOptions(const ConfigOptions & opts)484 Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
485   InitializeOptions();
486   return TableFactory::PrepareOptions(opts);
487 }
488 
NewTableReader(const ReadOptions & ro,const TableReaderOptions & table_reader_options,std::unique_ptr<RandomAccessFileReader> && file,uint64_t file_size,std::unique_ptr<TableReader> * table_reader,bool prefetch_index_and_filter_in_cache) const489 Status BlockBasedTableFactory::NewTableReader(
490     const ReadOptions& ro, const TableReaderOptions& table_reader_options,
491     std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
492     std::unique_ptr<TableReader>* table_reader,
493     bool prefetch_index_and_filter_in_cache) const {
494   return BlockBasedTable::Open(
495       ro, table_reader_options.ioptions, table_reader_options.env_options,
496       table_options_, table_reader_options.internal_comparator, std::move(file),
497       file_size, table_reader, table_reader_options.prefix_extractor,
498       prefetch_index_and_filter_in_cache, table_reader_options.skip_filters,
499       table_reader_options.level, table_reader_options.immortal,
500       table_reader_options.largest_seqno,
501       table_reader_options.force_direct_prefetch, &tail_prefetch_stats_,
502       table_reader_options.block_cache_tracer,
503       table_reader_options.max_file_size_for_l0_meta_pin,
504       table_reader_options.cur_db_session_id,
505       table_reader_options.cur_file_num);
506 }
507 
NewTableBuilder(const TableBuilderOptions & table_builder_options,WritableFileWriter * file) const508 TableBuilder* BlockBasedTableFactory::NewTableBuilder(
509     const TableBuilderOptions& table_builder_options,
510     WritableFileWriter* file) const {
511   return new BlockBasedTableBuilder(table_options_, table_builder_options,
512                                     file);
513 }
514 
ValidateOptions(const DBOptions & db_opts,const ColumnFamilyOptions & cf_opts) const515 Status BlockBasedTableFactory::ValidateOptions(
516     const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
517   if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
518       cf_opts.prefix_extractor == nullptr) {
519     return Status::InvalidArgument(
520         "Hash index is specified for block-based "
521         "table, but prefix_extractor is not given");
522   }
523   if (table_options_.cache_index_and_filter_blocks &&
524       table_options_.no_block_cache) {
525     return Status::InvalidArgument(
526         "Enable cache_index_and_filter_blocks, "
527         ", but block cache is disabled");
528   }
529   if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
530       table_options_.no_block_cache) {
531     return Status::InvalidArgument(
532         "Enable pin_l0_filter_and_index_blocks_in_cache, "
533         ", but block cache is disabled");
534   }
535   if (!BlockBasedTableSupportedVersion(table_options_.format_version)) {
536     return Status::InvalidArgument(
537         "Unsupported BlockBasedTable format_version. Please check "
538         "include/rocksdb/table.h for more info");
539   }
540   if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
541     return Status::InvalidArgument(
542         "Enable block_align, but compression "
543         "enabled");
544   }
545   if (table_options_.block_align &&
546       (table_options_.block_size & (table_options_.block_size - 1))) {
547     return Status::InvalidArgument(
548         "Block alignment requested but block size is not a power of 2");
549   }
550   if (table_options_.block_size > port::kMaxUint32) {
551     return Status::InvalidArgument(
552         "block size exceeds maximum number (4GiB) allowed");
553   }
554   if (table_options_.data_block_index_type ==
555           BlockBasedTableOptions::kDataBlockBinaryAndHash &&
556       table_options_.data_block_hash_table_util_ratio <= 0) {
557     return Status::InvalidArgument(
558         "data_block_hash_table_util_ratio should be greater than 0 when "
559         "data_block_index_type is set to kDataBlockBinaryAndHash");
560   }
561   if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
562     // TODO(myabandeh): support it
563     return Status::InvalidArgument(
564         "max_successive_merges larger than 0 is currently inconsistent with "
565         "unordered_write");
566   }
567   return TableFactory::ValidateOptions(db_opts, cf_opts);
568 }
569 
GetPrintableOptions() const570 std::string BlockBasedTableFactory::GetPrintableOptions() const {
571   std::string ret;
572   ret.reserve(20000);
573   const int kBufferSize = 200;
574   char buffer[kBufferSize];
575 
576   snprintf(buffer, kBufferSize, "  flush_block_policy_factory: %s (%p)\n",
577            table_options_.flush_block_policy_factory->Name(),
578            static_cast<void*>(table_options_.flush_block_policy_factory.get()));
579   ret.append(buffer);
580   snprintf(buffer, kBufferSize, "  cache_index_and_filter_blocks: %d\n",
581            table_options_.cache_index_and_filter_blocks);
582   ret.append(buffer);
583   snprintf(buffer, kBufferSize,
584            "  cache_index_and_filter_blocks_with_high_priority: %d\n",
585            table_options_.cache_index_and_filter_blocks_with_high_priority);
586   ret.append(buffer);
587   snprintf(buffer, kBufferSize,
588            "  pin_l0_filter_and_index_blocks_in_cache: %d\n",
589            table_options_.pin_l0_filter_and_index_blocks_in_cache);
590   ret.append(buffer);
591   snprintf(buffer, kBufferSize, "  pin_top_level_index_and_filter: %d\n",
592            table_options_.pin_top_level_index_and_filter);
593   ret.append(buffer);
594   snprintf(buffer, kBufferSize, "  index_type: %d\n",
595            table_options_.index_type);
596   ret.append(buffer);
597   snprintf(buffer, kBufferSize, "  data_block_index_type: %d\n",
598            table_options_.data_block_index_type);
599   ret.append(buffer);
600   snprintf(buffer, kBufferSize, "  index_shortening: %d\n",
601            static_cast<int>(table_options_.index_shortening));
602   ret.append(buffer);
603   snprintf(buffer, kBufferSize, "  data_block_hash_table_util_ratio: %lf\n",
604            table_options_.data_block_hash_table_util_ratio);
605   ret.append(buffer);
606   snprintf(buffer, kBufferSize, "  hash_index_allow_collision: %d\n",
607            table_options_.hash_index_allow_collision);
608   ret.append(buffer);
609   snprintf(buffer, kBufferSize, "  checksum: %d\n", table_options_.checksum);
610   ret.append(buffer);
611   snprintf(buffer, kBufferSize, "  no_block_cache: %d\n",
612            table_options_.no_block_cache);
613   ret.append(buffer);
614   snprintf(buffer, kBufferSize, "  block_cache: %p\n",
615            static_cast<void*>(table_options_.block_cache.get()));
616   ret.append(buffer);
617   if (table_options_.block_cache) {
618     const char* block_cache_name = table_options_.block_cache->Name();
619     if (block_cache_name != nullptr) {
620       snprintf(buffer, kBufferSize, "  block_cache_name: %s\n",
621                block_cache_name);
622       ret.append(buffer);
623     }
624     ret.append("  block_cache_options:\n");
625     ret.append(table_options_.block_cache->GetPrintableOptions());
626   }
627   snprintf(buffer, kBufferSize, "  block_cache_compressed: %p\n",
628            static_cast<void*>(table_options_.block_cache_compressed.get()));
629   ret.append(buffer);
630   if (table_options_.block_cache_compressed) {
631     const char* block_cache_compressed_name =
632         table_options_.block_cache_compressed->Name();
633     if (block_cache_compressed_name != nullptr) {
634       snprintf(buffer, kBufferSize, "  block_cache_name: %s\n",
635                block_cache_compressed_name);
636       ret.append(buffer);
637     }
638     ret.append("  block_cache_compressed_options:\n");
639     ret.append(table_options_.block_cache_compressed->GetPrintableOptions());
640   }
641   snprintf(buffer, kBufferSize, "  persistent_cache: %p\n",
642            static_cast<void*>(table_options_.persistent_cache.get()));
643   ret.append(buffer);
644   if (table_options_.persistent_cache) {
645     snprintf(buffer, kBufferSize, "  persistent_cache_options:\n");
646     ret.append(buffer);
647     ret.append(table_options_.persistent_cache->GetPrintableOptions());
648   }
649   snprintf(buffer, kBufferSize, "  block_size: %" ROCKSDB_PRIszt "\n",
650            table_options_.block_size);
651   ret.append(buffer);
652   snprintf(buffer, kBufferSize, "  block_size_deviation: %d\n",
653            table_options_.block_size_deviation);
654   ret.append(buffer);
655   snprintf(buffer, kBufferSize, "  block_restart_interval: %d\n",
656            table_options_.block_restart_interval);
657   ret.append(buffer);
658   snprintf(buffer, kBufferSize, "  index_block_restart_interval: %d\n",
659            table_options_.index_block_restart_interval);
660   ret.append(buffer);
661   snprintf(buffer, kBufferSize, "  metadata_block_size: %" PRIu64 "\n",
662            table_options_.metadata_block_size);
663   ret.append(buffer);
664   snprintf(buffer, kBufferSize, "  partition_filters: %d\n",
665            table_options_.partition_filters);
666   ret.append(buffer);
667   snprintf(buffer, kBufferSize, "  use_delta_encoding: %d\n",
668            table_options_.use_delta_encoding);
669   ret.append(buffer);
670   snprintf(buffer, kBufferSize, "  filter_policy: %s\n",
671            table_options_.filter_policy == nullptr
672                ? "nullptr"
673                : table_options_.filter_policy->Name());
674   ret.append(buffer);
675   snprintf(buffer, kBufferSize, "  whole_key_filtering: %d\n",
676            table_options_.whole_key_filtering);
677   ret.append(buffer);
678   snprintf(buffer, kBufferSize, "  verify_compression: %d\n",
679            table_options_.verify_compression);
680   ret.append(buffer);
681   snprintf(buffer, kBufferSize, "  read_amp_bytes_per_bit: %d\n",
682            table_options_.read_amp_bytes_per_bit);
683   ret.append(buffer);
684   snprintf(buffer, kBufferSize, "  format_version: %d\n",
685            table_options_.format_version);
686   ret.append(buffer);
687   snprintf(buffer, kBufferSize, "  enable_index_compression: %d\n",
688            table_options_.enable_index_compression);
689   ret.append(buffer);
690   snprintf(buffer, kBufferSize, "  block_align: %d\n",
691            table_options_.block_align);
692   ret.append(buffer);
693   snprintf(buffer, kBufferSize,
694            "  max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
695            table_options_.max_auto_readahead_size);
696   ret.append(buffer);
697   snprintf(buffer, kBufferSize, "  prepopulate_block_cache: %d\n",
698            static_cast<int>(table_options_.prepopulate_block_cache));
699   ret.append(buffer);
700   return ret;
701 }
702 
GetOptionsPtr(const std::string & name) const703 const void* BlockBasedTableFactory::GetOptionsPtr(
704     const std::string& name) const {
705   if (name == kBlockCacheOpts()) {
706     if (table_options_.no_block_cache) {
707       return nullptr;
708     } else {
709       return table_options_.block_cache.get();
710     }
711   } else {
712     return TableFactory::GetOptionsPtr(name);
713   }
714 }
715 
716 #ifndef ROCKSDB_LITE
717 // Take a default BlockBasedTableOptions "table_options" in addition to a
718 // map "opts_map" of option name to option value to construct the new
719 // BlockBasedTableOptions "new_table_options".
720 //
721 // Below are the instructions of how to config some non-primitive-typed
722 // options in BlockBasedTableOptions:
723 //
724 // * filter_policy:
725 //   We currently only support the following FilterPolicy in the convenience
726 //   functions:
727 //   - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
728 //     to specify BloomFilter.  The above string is equivalent to calling
729 //     NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
730 //     [Example]:
731 //     - Pass {"filter_policy", "bloomfilter:4:true"} in
732 //       GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
733 //       per key and use_block_based_builder enabled.
734 //
735 // * block_cache / block_cache_compressed:
736 //   We currently only support LRU cache in the GetOptions API.  The LRU
737 //   cache can be set by directly specifying its size.
738 //   [Example]:
739 //   - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
740 //     equivalent to setting block_cache using NewLRUCache(1024 * 1024).
741 //
742 // @param table_options the default options of the output "new_table_options".
743 // @param opts_map an option name to value map for specifying how
744 //     "new_table_options" should be set.
745 // @param new_table_options the resulting options based on "table_options"
746 //     with the change specified in "opts_map".
// @param input_strings_escaped when set to true, each escaped character
//     prefixed by '\' in the values of the opts_map will be further converted
//     back to the raw string before assigning to the associated options.
750 // @param ignore_unknown_options when set to true, unknown options are ignored
751 //     instead of resulting in an unknown-option error.
752 // @return Status::OK() on success.  Otherwise, a non-ok status indicating
753 //     error will be returned, and "new_table_options" will be set to
754 //     "table_options".
ParseOption(const ConfigOptions & config_options,const OptionTypeInfo & opt_info,const std::string & opt_name,const std::string & opt_value,void * opt_ptr)755 Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
756                                            const OptionTypeInfo& opt_info,
757                                            const std::string& opt_name,
758                                            const std::string& opt_value,
759                                            void* opt_ptr) {
760   Status status = TableFactory::ParseOption(config_options, opt_info, opt_name,
761                                             opt_value, opt_ptr);
762   if (config_options.input_strings_escaped && !status.ok()) {  // Got an error
763     // !input_strings_escaped indicates the old API, where everything is
764     // parsable.
765     if (opt_info.IsByName()) {
766       status = Status::OK();
767     }
768   }
769   return status;
770 }
771 
GetBlockBasedTableOptionsFromString(const BlockBasedTableOptions & table_options,const std::string & opts_str,BlockBasedTableOptions * new_table_options)772 Status GetBlockBasedTableOptionsFromString(
773     const BlockBasedTableOptions& table_options, const std::string& opts_str,
774     BlockBasedTableOptions* new_table_options) {
775   ConfigOptions config_options;
776   config_options.input_strings_escaped = false;
777   config_options.ignore_unknown_options = false;
778   config_options.invoke_prepare_options = false;
779   return GetBlockBasedTableOptionsFromString(config_options, table_options,
780                                              opts_str, new_table_options);
781 }
GetBlockBasedTableOptionsFromString(const ConfigOptions & config_options,const BlockBasedTableOptions & table_options,const std::string & opts_str,BlockBasedTableOptions * new_table_options)782 Status GetBlockBasedTableOptionsFromString(
783     const ConfigOptions& config_options,
784     const BlockBasedTableOptions& table_options, const std::string& opts_str,
785     BlockBasedTableOptions* new_table_options) {
786   std::unordered_map<std::string, std::string> opts_map;
787   Status s = StringToMap(opts_str, &opts_map);
788   if (!s.ok()) {
789     return s;
790   }
791   s = GetBlockBasedTableOptionsFromMap(config_options, table_options, opts_map,
792                                        new_table_options);
793   // Translate any errors (NotFound, NotSupported, to InvalidArgument
794   if (s.ok() || s.IsInvalidArgument()) {
795     return s;
796   } else {
797     return Status::InvalidArgument(s.getState());
798   }
799 }
800 
GetBlockBasedTableOptionsFromMap(const BlockBasedTableOptions & table_options,const std::unordered_map<std::string,std::string> & opts_map,BlockBasedTableOptions * new_table_options,bool input_strings_escaped,bool ignore_unknown_options)801 Status GetBlockBasedTableOptionsFromMap(
802     const BlockBasedTableOptions& table_options,
803     const std::unordered_map<std::string, std::string>& opts_map,
804     BlockBasedTableOptions* new_table_options, bool input_strings_escaped,
805     bool ignore_unknown_options) {
806   ConfigOptions config_options;
807   config_options.input_strings_escaped = input_strings_escaped;
808   config_options.ignore_unknown_options = ignore_unknown_options;
809   config_options.invoke_prepare_options = false;
810 
811   return GetBlockBasedTableOptionsFromMap(config_options, table_options,
812                                           opts_map, new_table_options);
813 }
814 
GetBlockBasedTableOptionsFromMap(const ConfigOptions & config_options,const BlockBasedTableOptions & table_options,const std::unordered_map<std::string,std::string> & opts_map,BlockBasedTableOptions * new_table_options)815 Status GetBlockBasedTableOptionsFromMap(
816     const ConfigOptions& config_options,
817     const BlockBasedTableOptions& table_options,
818     const std::unordered_map<std::string, std::string>& opts_map,
819     BlockBasedTableOptions* new_table_options) {
820   assert(new_table_options);
821   BlockBasedTableFactory bbtf(table_options);
822   Status s = bbtf.ConfigureFromMap(config_options, opts_map);
823   if (s.ok()) {
824     *new_table_options = *(bbtf.GetOptions<BlockBasedTableOptions>());
825   } else {
826     *new_table_options = table_options;
827   }
828   return s;
829 }
830 #endif  // !ROCKSDB_LITE
831 
NewBlockBasedTableFactory(const BlockBasedTableOptions & _table_options)832 TableFactory* NewBlockBasedTableFactory(
833     const BlockBasedTableOptions& _table_options) {
834   return new BlockBasedTableFactory(_table_options);
835 }
836 
// Definitions of the BlockBasedTablePropertyNames string constants.  Each
// value lives under the "rocksdb.block.based.table." prefix.
const std::string BlockBasedTablePropertyNames::kIndexType =
    "rocksdb.block.based.table.index.type";
const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
    "rocksdb.block.based.table.whole.key.filtering";
const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
    "rocksdb.block.based.table.prefix.filtering";
// Names under the "rocksdb.hashindex." prefix.
// NOTE(review): presumably the names of the hash index prefixes block and its
// metadata block -- confirm against the table builder/reader.
const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
const std::string kHashIndexPrefixesMetadataBlock =
    "rocksdb.hashindex.metadata";
// String forms "1" and "0".
// NOTE(review): presumably the values stored for boolean-valued properties --
// confirm at the use sites.
const std::string kPropTrue = "1";
const std::string kPropFalse = "0";
848 
849 }  // namespace ROCKSDB_NAMESPACE
850