1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 //
10 // BlockBuilder generates blocks where keys are prefix-compressed:
11 //
12 // When we store a key, we drop the prefix shared with the previous
13 // string.  This helps reduce the space requirement significantly.
14 // Furthermore, once every K keys, we do not apply the prefix
15 // compression and store the entire key.  We call this a "restart
16 // point".  The tail end of the block stores the offsets of all of the
17 // restart points, and can be used to do a binary search when looking
18 // for a particular key.  Values are stored as-is (without compression)
19 // immediately following the corresponding key.
20 //
21 // An entry for a particular key-value pair has the form:
22 //     shared_bytes: varint32
23 //     unshared_bytes: varint32
24 //     value_length: varint32
25 //     key_delta: char[unshared_bytes]
26 //     value: char[value_length]
27 // shared_bytes == 0 for restart points.
28 //
29 // The trailer of the block has the form:
30 //     restarts: uint32[num_restarts]
31 //     num_restarts: uint32
32 // restarts[i] contains the offset within the block of the ith restart point.
33 
34 #include "table/block_based/block_builder.h"
35 
36 #include <assert.h>
37 #include <algorithm>
38 #include "db/dbformat.h"
39 #include "rocksdb/comparator.h"
40 #include "table/block_based/data_block_footer.h"
41 #include "util/coding.h"
42 
43 namespace ROCKSDB_NAMESPACE {
44 
BlockBuilder(int block_restart_interval,bool use_delta_encoding,bool use_value_delta_encoding,BlockBasedTableOptions::DataBlockIndexType index_type,double data_block_hash_table_util_ratio)45 BlockBuilder::BlockBuilder(
46     int block_restart_interval, bool use_delta_encoding,
47     bool use_value_delta_encoding,
48     BlockBasedTableOptions::DataBlockIndexType index_type,
49     double data_block_hash_table_util_ratio)
50     : block_restart_interval_(block_restart_interval),
51       use_delta_encoding_(use_delta_encoding),
52       use_value_delta_encoding_(use_value_delta_encoding),
53       restarts_(),
54       counter_(0),
55       finished_(false) {
56   switch (index_type) {
57     case BlockBasedTableOptions::kDataBlockBinarySearch:
58       break;
59     case BlockBasedTableOptions::kDataBlockBinaryAndHash:
60       data_block_hash_index_builder_.Initialize(
61           data_block_hash_table_util_ratio);
62       break;
63     default:
64       assert(0);
65   }
66   assert(block_restart_interval_ >= 1);
67   restarts_.push_back(0);  // First restart point is at offset 0
68   estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
69 }
70 
Reset()71 void BlockBuilder::Reset() {
72   buffer_.clear();
73   restarts_.clear();
74   restarts_.push_back(0);  // First restart point is at offset 0
75   estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
76   counter_ = 0;
77   finished_ = false;
78   last_key_.clear();
79   if (data_block_hash_index_builder_.Valid()) {
80     data_block_hash_index_builder_.Reset();
81   }
82 }
83 
EstimateSizeAfterKV(const Slice & key,const Slice & value) const84 size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
85                                          const Slice& value) const {
86   size_t estimate = CurrentSizeEstimate();
87   // Note: this is an imprecise estimate as it accounts for the whole key size
88   // instead of non-shared key size.
89   estimate += key.size();
90   // In value delta encoding we estimate the value delta size as half the full
91   // value size since only the size field of block handle is encoded.
92   estimate +=
93       !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
94           ? value.size()
95           : value.size() / 2;
96 
97   if (counter_ >= block_restart_interval_) {
98     estimate += sizeof(uint32_t);  // a new restart entry.
99   }
100 
101   estimate += sizeof(int32_t);  // varint for shared prefix length.
102   // Note: this is an imprecise estimate as we will have to encoded size, one
103   // for shared key and one for non-shared key.
104   estimate += VarintLength(key.size());  // varint for key length.
105   if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
106     estimate += VarintLength(value.size());  // varint for value length.
107   }
108 
109   return estimate;
110 }
111 
Finish()112 Slice BlockBuilder::Finish() {
113   // Append restart array
114   for (size_t i = 0; i < restarts_.size(); i++) {
115     PutFixed32(&buffer_, restarts_[i]);
116   }
117 
118   uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
119   BlockBasedTableOptions::DataBlockIndexType index_type =
120       BlockBasedTableOptions::kDataBlockBinarySearch;
121   if (data_block_hash_index_builder_.Valid() &&
122       CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
123     data_block_hash_index_builder_.Finish(buffer_);
124     index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
125   }
126 
127   // footer is a packed format of data_block_index_type and num_restarts
128   uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
129 
130   PutFixed32(&buffer_, block_footer);
131   finished_ = true;
132   return Slice(buffer_);
133 }
134 
Add(const Slice & key,const Slice & value,const Slice * const delta_value)135 void BlockBuilder::Add(const Slice& key, const Slice& value,
136                        const Slice* const delta_value) {
137   assert(!finished_);
138   assert(counter_ <= block_restart_interval_);
139   assert(!use_value_delta_encoding_ || delta_value);
140   size_t shared = 0;  // number of bytes shared with prev key
141   if (counter_ >= block_restart_interval_) {
142     // Restart compression
143     restarts_.push_back(static_cast<uint32_t>(buffer_.size()));
144     estimate_ += sizeof(uint32_t);
145     counter_ = 0;
146 
147     if (use_delta_encoding_) {
148       // Update state
149       last_key_.assign(key.data(), key.size());
150     }
151   } else if (use_delta_encoding_) {
152     Slice last_key_piece(last_key_);
153     // See how much sharing to do with previous string
154     shared = key.difference_offset(last_key_piece);
155 
156     // Update state
157     // We used to just copy the changed data here, but it appears to be
158     // faster to just copy the whole thing.
159     last_key_.assign(key.data(), key.size());
160   }
161 
162   const size_t non_shared = key.size() - shared;
163   const size_t curr_size = buffer_.size();
164 
165   if (use_value_delta_encoding_) {
166     // Add "<shared><non_shared>" to buffer_
167     PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
168                         static_cast<uint32_t>(non_shared));
169   } else {
170     // Add "<shared><non_shared><value_size>" to buffer_
171     PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
172                                 static_cast<uint32_t>(non_shared),
173                                 static_cast<uint32_t>(value.size()));
174   }
175 
176   // Add string delta to buffer_ followed by value
177   buffer_.append(key.data() + shared, non_shared);
178   // Use value delta encoding only when the key has shared bytes. This would
179   // simplify the decoding, where it can figure which decoding to use simply by
180   // looking at the shared bytes size.
181   if (shared != 0 && use_value_delta_encoding_) {
182     buffer_.append(delta_value->data(), delta_value->size());
183   } else {
184     buffer_.append(value.data(), value.size());
185   }
186 
187   if (data_block_hash_index_builder_.Valid()) {
188     data_block_hash_index_builder_.Add(ExtractUserKey(key),
189                                        restarts_.size() - 1);
190   }
191 
192   counter_++;
193   estimate_ += buffer_.size() - curr_size;
194 }
195 
196 }  // namespace ROCKSDB_NAMESPACE
197