1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #pragma once
11 #include <stdint.h>
12 #include <string>
13 #include "file/file_prefetch_buffer.h"
14 #include "file/random_access_file_reader.h"
15 
16 #include "rocksdb/options.h"
17 #include "rocksdb/slice.h"
18 #include "rocksdb/status.h"
19 #include "rocksdb/table.h"
20 
21 #include "memory/memory_allocator.h"
22 #include "options/cf_options.h"
23 #include "port/malloc.h"
24 #include "port/port.h"  // noexcept
25 #include "table/persistent_cache_options.h"
26 
27 namespace ROCKSDB_NAMESPACE {
28 
29 class RandomAccessFile;
30 struct ReadOptions;
31 
32 extern bool ShouldReportDetailedTime(Env* env, Statistics* stats);
33 
// Number of bytes occupied by the table magic number. The magic number is
// handled as a 64-bit integer in this file, hence 8 bytes.
const int kMagicNumberLengthByte = static_cast<int>(sizeof(uint64_t));
36 
37 // BlockHandle is a pointer to the extent of a file that stores a data
38 // block or a meta block.
39 class BlockHandle {
40  public:
41   BlockHandle();
42   BlockHandle(uint64_t offset, uint64_t size);
43 
44   // The offset of the block in the file.
offset()45   uint64_t offset() const { return offset_; }
set_offset(uint64_t _offset)46   void set_offset(uint64_t _offset) { offset_ = _offset; }
47 
48   // The size of the stored block
size()49   uint64_t size() const { return size_; }
set_size(uint64_t _size)50   void set_size(uint64_t _size) { size_ = _size; }
51 
52   void EncodeTo(std::string* dst) const;
53   Status DecodeFrom(Slice* input);
54   Status DecodeSizeFrom(uint64_t offset, Slice* input);
55 
56   // Return a string that contains the copy of handle.
57   std::string ToString(bool hex = true) const;
58 
59   // if the block handle's offset and size are both "0", we will view it
60   // as a null block handle that points to no where.
IsNull()61   bool IsNull() const { return offset_ == 0 && size_ == 0; }
62 
NullBlockHandle()63   static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
64 
65   // Maximum encoding length of a BlockHandle
66   enum { kMaxEncodedLength = 10 + 10 };
67 
68  private:
69   uint64_t offset_;
70   uint64_t size_;
71 
72   static const BlockHandle kNullBlockHandle;
73 };
74 
75 // Value in block-based table file index.
76 //
77 // The index entry for block n is: y -> h, [x],
78 // where: y is some key between the last key of block n (inclusive) and the
79 // first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
80 // x, if present, is the first key of block n (unshortened).
81 // This struct represents the "h, [x]" part.
82 struct IndexValue {
83   BlockHandle handle;
84   // Empty means unknown.
85   Slice first_internal_key;
86 
87   IndexValue() = default;
IndexValueIndexValue88   IndexValue(BlockHandle _handle, Slice _first_internal_key)
89       : handle(_handle), first_internal_key(_first_internal_key) {}
90 
91   // have_first_key indicates whether the `first_internal_key` is used.
92   // If previous_handle is not null, delta encoding is used;
93   // in this case, the two handles must point to consecutive blocks:
94   // handle.offset() ==
95   //     previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
96   void EncodeTo(std::string* dst, bool have_first_key,
97                 const BlockHandle* previous_handle) const;
98   Status DecodeFrom(Slice* input, bool have_first_key,
99                     const BlockHandle* previous_handle);
100 
101   std::string ToString(bool hex, bool have_first_key) const;
102 };
103 
GetCompressFormatForVersion(CompressionType compression_type,uint32_t version)104 inline uint32_t GetCompressFormatForVersion(CompressionType compression_type,
105                                             uint32_t version) {
106 #ifdef NDEBUG
107   (void)compression_type;
108 #endif
109   // snappy is not versioned
110   assert(compression_type != kSnappyCompression &&
111          compression_type != kXpressCompression &&
112          compression_type != kNoCompression);
113   // As of version 2, we encode compressed block with
114   // compress_format_version == 2. Before that, the version is 1.
115   // DO NOT CHANGE THIS FUNCTION, it affects disk format
116   return version >= 2 ? 2 : 1;
117 }
118 
// Returns true when `version` is a block-based table format version this code
// accepts, i.e. any version up to and including 5.
inline bool BlockBasedTableSupportedVersion(uint32_t version) {
  const uint32_t newest_supported_version = 5;
  return !(version > newest_supported_version);
}
122 
123 // Footer encapsulates the fixed information stored at the tail
124 // end of every table file.
125 class Footer {
126  public:
127   // Constructs a footer without specifying its table magic number.
128   // In such case, the table magic number of such footer should be
129   // initialized via @ReadFooterFromFile().
130   // Use this when you plan to load Footer with DecodeFrom(). Never use this
131   // when you plan to EncodeTo.
Footer()132   Footer() : Footer(kInvalidTableMagicNumber, 0) {}
133 
134   // Use this constructor when you plan to write out the footer using
135   // EncodeTo(). Never use this constructor with DecodeFrom().
136   Footer(uint64_t table_magic_number, uint32_t version);
137 
138   // The version of the footer in this file
version()139   uint32_t version() const { return version_; }
140 
141   // The checksum type used in this file
checksum()142   ChecksumType checksum() const { return checksum_; }
set_checksum(const ChecksumType c)143   void set_checksum(const ChecksumType c) { checksum_ = c; }
144 
145   // The block handle for the metaindex block of the table
metaindex_handle()146   const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
set_metaindex_handle(const BlockHandle & h)147   void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
148 
149   // The block handle for the index block of the table
index_handle()150   const BlockHandle& index_handle() const { return index_handle_; }
151 
set_index_handle(const BlockHandle & h)152   void set_index_handle(const BlockHandle& h) { index_handle_ = h; }
153 
table_magic_number()154   uint64_t table_magic_number() const { return table_magic_number_; }
155 
156   void EncodeTo(std::string* dst) const;
157 
158   // Set the current footer based on the input slice.
159   //
160   // REQUIRES: table_magic_number_ is not set (i.e.,
161   // HasInitializedTableMagicNumber() is true). The function will initialize the
162   // magic number
163   Status DecodeFrom(Slice* input);
164 
165   // Encoded length of a Footer.  Note that the serialization of a Footer will
166   // always occupy at least kMinEncodedLength bytes.  If fields are changed
167   // the version number should be incremented and kMaxEncodedLength should be
168   // increased accordingly.
169   enum {
170     // Footer version 0 (legacy) will always occupy exactly this many bytes.
171     // It consists of two block handles, padding, and a magic number.
172     kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
173     // Footer of versions 1 and higher will always occupy exactly this many
174     // bytes. It consists of the checksum type, two block handles, padding,
175     // a version number (bigger than 1), and a magic number
176     kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
177     kMinEncodedLength = kVersion0EncodedLength,
178     kMaxEncodedLength = kNewVersionsEncodedLength,
179   };
180 
181   static const uint64_t kInvalidTableMagicNumber = 0;
182 
183   // convert this object to a human readable form
184   std::string ToString() const;
185 
186  private:
187   // REQUIRES: magic number wasn't initialized.
set_table_magic_number(uint64_t magic_number)188   void set_table_magic_number(uint64_t magic_number) {
189     assert(!HasInitializedTableMagicNumber());
190     table_magic_number_ = magic_number;
191   }
192 
193   // return true if @table_magic_number_ is set to a value different
194   // from @kInvalidTableMagicNumber.
HasInitializedTableMagicNumber()195   bool HasInitializedTableMagicNumber() const {
196     return (table_magic_number_ != kInvalidTableMagicNumber);
197   }
198 
199   uint32_t version_;
200   ChecksumType checksum_;
201   BlockHandle metaindex_handle_;
202   BlockHandle index_handle_;
203   uint64_t table_magic_number_ = 0;
204 };
205 
206 // Read the footer from file
207 // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
208 // corruption if table_magic number is not equal to enforce_table_magic_number
209 Status ReadFooterFromFile(RandomAccessFileReader* file,
210                           FilePrefetchBuffer* prefetch_buffer,
211                           uint64_t file_size, Footer* footer,
212                           uint64_t enforce_table_magic_number = 0);
213 
// Size of the trailer that follows every block: a 1-byte type followed by a
// 32-bit crc, i.e. 5 bytes in total.
static const size_t kBlockTrailerSize = 1 + sizeof(uint32_t);
216 
217 // Make block size calculation for IO less error prone
block_size(const BlockHandle & handle)218 inline uint64_t block_size(const BlockHandle& handle) {
219   return handle.size() + kBlockTrailerSize;
220 }
221 
get_block_compression_type(const char * block_data,size_t block_size)222 inline CompressionType get_block_compression_type(const char* block_data,
223                                                   size_t block_size) {
224   return static_cast<CompressionType>(block_data[block_size]);
225 }
226 
227 // Represents the contents of a block read from an SST file. Depending on how
228 // it's created, it may or may not own the actual block bytes. As an example,
229 // BlockContents objects representing data read from mmapped files only point
230 // into the mmapped region.
231 struct BlockContents {
232   Slice data;  // Actual contents of data
233   CacheAllocationPtr allocation;
234 
235 #ifndef NDEBUG
236   // Whether the block is a raw block, which contains compression type
237   // byte. It is only used for assertion.
238   bool is_raw_block = false;
239 #endif  // NDEBUG
240 
BlockContentsBlockContents241   BlockContents() {}
242 
243   // Does not take ownership of the underlying data bytes.
BlockContentsBlockContents244   BlockContents(const Slice& _data) : data(_data) {}
245 
246   // Takes ownership of the underlying data bytes.
BlockContentsBlockContents247   BlockContents(CacheAllocationPtr&& _data, size_t _size)
248       : data(_data.get(), _size), allocation(std::move(_data)) {}
249 
250   // Takes ownership of the underlying data bytes.
BlockContentsBlockContents251   BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
252       : data(_data.get(), _size) {
253     allocation.reset(_data.release());
254   }
255 
256   // Returns whether the object has ownership of the underlying data bytes.
own_bytesBlockContents257   bool own_bytes() const { return allocation.get() != nullptr; }
258 
259   // It's the caller's responsibility to make sure that this is
260   // for raw block contents, which contains the compression
261   // byte in the end.
get_compression_typeBlockContents262   CompressionType get_compression_type() const {
263     assert(is_raw_block);
264     return get_block_compression_type(data.data(), data.size());
265   }
266 
267   // The additional memory space taken by the block data.
usable_sizeBlockContents268   size_t usable_size() const {
269     if (allocation.get() != nullptr) {
270       auto allocator = allocation.get_deleter().allocator;
271       if (allocator) {
272         return allocator->UsableSize(allocation.get(), data.size());
273       }
274 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
275       return malloc_usable_size(allocation.get());
276 #else
277       return data.size();
278 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
279     } else {
280       return 0;  // no extra memory is occupied by the data
281     }
282   }
283 
ApproximateMemoryUsageBlockContents284   size_t ApproximateMemoryUsage() const {
285     return usable_size() + sizeof(*this);
286   }
287 
BlockContentsBlockContents288   BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT {
289     *this = std::move(other);
290   }
291 
292   BlockContents& operator=(BlockContents&& other) {
293     data = std::move(other.data);
294     allocation = std::move(other.allocation);
295 #ifndef NDEBUG
296     is_raw_block = other.is_raw_block;
297 #endif  // NDEBUG
298     return *this;
299   }
300 };
301 
302 // Read the block identified by "handle" from "file".  On failure
303 // return non-OK.  On success fill *result and return OK.
304 extern Status ReadBlockContents(
305     RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
306     const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
307     BlockContents* contents, const ImmutableCFOptions& ioptions,
308     bool do_uncompress = true, const Slice& compression_dict = Slice(),
309     const PersistentCacheOptions& cache_options = PersistentCacheOptions());
310 
311 // The 'data' points to the raw block contents read in from file.
312 // This method allocates a new heap buffer and the raw block
313 // contents are uncompresed into this buffer. This buffer is
314 // returned via 'result' and it is upto the caller to
315 // free this buffer.
316 // For description of compress_format_version and possible values, see
317 // util/compression.h
318 extern Status UncompressBlockContents(const UncompressionInfo& info,
319                                       const char* data, size_t n,
320                                       BlockContents* contents,
321                                       uint32_t compress_format_version,
322                                       const ImmutableCFOptions& ioptions,
323                                       MemoryAllocator* allocator = nullptr);
324 
325 // This is an extension to UncompressBlockContents that accepts
326 // a specific compression type. This is used by un-wrapped blocks
327 // with no compression header.
328 extern Status UncompressBlockContentsForCompressionType(
329     const UncompressionInfo& info, const char* data, size_t n,
330     BlockContents* contents, uint32_t compress_format_version,
331     const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr);
332 
// Implementation details follow.  Clients should ignore them.
334 
335 // TODO(andrewkr): we should prefer one way of representing a null/uninitialized
336 // BlockHandle. Currently we use zeros for null and use negation-of-zeros for
337 // uninitialized.
BlockHandle()338 inline BlockHandle::BlockHandle()
339     : BlockHandle(~static_cast<uint64_t>(0), ~static_cast<uint64_t>(0)) {}
340 
BlockHandle(uint64_t _offset,uint64_t _size)341 inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
342     : offset_(_offset), size_(_size) {}
343 
344 }  // namespace ROCKSDB_NAMESPACE
345