1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #pragma once
11 #include <algorithm>
12 #include <set>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 #include "db/dbformat.h"
17 #include "memory/arena.h"
18 #include "rocksdb/cache.h"
19 #include "table/table_reader.h"
20 #include "util/autovector.h"
21 
22 namespace rocksdb {
23 
24 class VersionSet;
25 
26 constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
27 constexpr uint64_t kInvalidBlobFileNumber = 0;
28 constexpr uint64_t kUnknownOldestAncesterTime = 0;
29 constexpr uint64_t kUnknownFileCreationTime = 0;
30 
31 extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
32 
33 // A copyable structure contains information needed to read data from an SST
34 // file. It can contain a pointer to a table reader opened for the file, or
35 // file number and size, which can be used to create a new table reader for it.
36 // The behavior is undefined when a copied of the structure is used when the
37 // file is not in any live version any more.
38 struct FileDescriptor {
39   // Table reader in table_reader_handle
40   TableReader* table_reader;
41   uint64_t packed_number_and_path_id;
42   uint64_t file_size;  // File size in bytes
43   SequenceNumber smallest_seqno;  // The smallest seqno in this file
44   SequenceNumber largest_seqno;   // The largest seqno in this file
45 
FileDescriptorFileDescriptor46   FileDescriptor() : FileDescriptor(0, 0, 0) {}
47 
FileDescriptorFileDescriptor48   FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
49       : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
50 
FileDescriptorFileDescriptor51   FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
52                  SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
53       : table_reader(nullptr),
54         packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
55         file_size(_file_size),
56         smallest_seqno(_smallest_seqno),
57         largest_seqno(_largest_seqno) {}
58 
FileDescriptorFileDescriptor59   FileDescriptor(const FileDescriptor& fd) { *this = fd; }
60 
61   FileDescriptor& operator=(const FileDescriptor& fd) {
62     table_reader = fd.table_reader;
63     packed_number_and_path_id = fd.packed_number_and_path_id;
64     file_size = fd.file_size;
65     smallest_seqno = fd.smallest_seqno;
66     largest_seqno = fd.largest_seqno;
67     return *this;
68   }
69 
GetNumberFileDescriptor70   uint64_t GetNumber() const {
71     return packed_number_and_path_id & kFileNumberMask;
72   }
GetPathIdFileDescriptor73   uint32_t GetPathId() const {
74     return static_cast<uint32_t>(
75         packed_number_and_path_id / (kFileNumberMask + 1));
76   }
GetFileSizeFileDescriptor77   uint64_t GetFileSize() const { return file_size; }
78 };
79 
80 struct FileSampledStats {
FileSampledStatsFileSampledStats81   FileSampledStats() : num_reads_sampled(0) {}
FileSampledStatsFileSampledStats82   FileSampledStats(const FileSampledStats& other) { *this = other; }
83   FileSampledStats& operator=(const FileSampledStats& other) {
84     num_reads_sampled = other.num_reads_sampled.load();
85     return *this;
86   }
87 
88   // number of user reads to this file.
89   mutable std::atomic<uint64_t> num_reads_sampled;
90 };
91 
92 struct FileMetaData {
93   FileDescriptor fd;
94   InternalKey smallest;            // Smallest internal key served by table
95   InternalKey largest;             // Largest internal key served by table
96 
97   // Needs to be disposed when refs becomes 0.
98   Cache::Handle* table_reader_handle = nullptr;
99 
100   FileSampledStats stats;
101 
102   // Stats for compensating deletion entries during compaction
103 
104   // File size compensated by deletion entry.
105   // This is updated in Version::UpdateAccumulatedStats() first time when the
106   // file is created or loaded.  After it is updated (!= 0), it is immutable.
107   uint64_t compensated_file_size = 0;
108   // These values can mutate, but they can only be read or written from
109   // single-threaded LogAndApply thread
110   uint64_t num_entries = 0;     // the number of entries.
111   uint64_t num_deletions = 0;   // the number of deletion entries.
112   uint64_t raw_key_size = 0;    // total uncompressed key size.
113   uint64_t raw_value_size = 0;  // total uncompressed value size.
114 
115   int refs = 0;  // Reference count
116 
117   bool being_compacted = false;       // Is this file undergoing compaction?
118   bool init_stats_from_file = false;  // true if the data-entry stats of this
119                                       // file has initialized from file.
120 
121   bool marked_for_compaction = false;  // True if client asked us nicely to
122                                        // compact this file.
123 
124   // Used only in BlobDB. The file number of the oldest blob file this SST file
125   // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
126   uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
127 
128   // The file could be the compaction output from other SST files, which could
129   // in turn be outputs for compact older SST files. We track the memtable
130   // flush timestamp for the oldest SST file that eventaully contribute data
131   // to this file. 0 means the information is not available.
132   uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
133 
134   // Unix time when the SST file is created.
135   uint64_t file_creation_time = kUnknownFileCreationTime;
136 
137   FileMetaData() = default;
138 
FileMetaDataFileMetaData139   FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
140                const InternalKey& smallest_key, const InternalKey& largest_key,
141                const SequenceNumber& smallest_seq,
142                const SequenceNumber& largest_seq, bool marked_for_compact,
143                uint64_t oldest_blob_file, uint64_t _oldest_ancester_time,
144                uint64_t _file_creation_time)
145       : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
146         smallest(smallest_key),
147         largest(largest_key),
148         marked_for_compaction(marked_for_compact),
149         oldest_blob_file_number(oldest_blob_file),
150         oldest_ancester_time(_oldest_ancester_time),
151         file_creation_time(_file_creation_time) {
152     TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
153   }
154 
155   // REQUIRED: Keys must be given to the function in sorted order (it expects
156   // the last key to be the largest).
157   void UpdateBoundaries(const Slice& key, const Slice& value,
158                         SequenceNumber seqno, ValueType value_type);
159 
160   // Unlike UpdateBoundaries, ranges do not need to be presented in any
161   // particular order.
UpdateBoundariesForRangeFileMetaData162   void UpdateBoundariesForRange(const InternalKey& start,
163                                 const InternalKey& end, SequenceNumber seqno,
164                                 const InternalKeyComparator& icmp) {
165     if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
166       smallest = start;
167     }
168     if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
169       largest = end;
170     }
171     fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
172     fd.largest_seqno = std::max(fd.largest_seqno, seqno);
173   }
174 
175   // Try to get oldest ancester time from the class itself or table properties
176   // if table reader is already pinned.
177   // 0 means the information is not available.
TryGetOldestAncesterTimeFileMetaData178   uint64_t TryGetOldestAncesterTime() {
179     if (oldest_ancester_time != kUnknownOldestAncesterTime) {
180       return oldest_ancester_time;
181     } else if (fd.table_reader != nullptr &&
182                fd.table_reader->GetTableProperties() != nullptr) {
183       return fd.table_reader->GetTableProperties()->creation_time;
184     }
185     return kUnknownOldestAncesterTime;
186   }
187 
TryGetFileCreationTimeFileMetaData188   uint64_t TryGetFileCreationTime() {
189     if (file_creation_time != kUnknownFileCreationTime) {
190       return file_creation_time;
191     } else if (fd.table_reader != nullptr &&
192                fd.table_reader->GetTableProperties() != nullptr) {
193       return fd.table_reader->GetTableProperties()->file_creation_time;
194     }
195     return kUnknownFileCreationTime;
196   }
197 };
198 
199 // A compressed copy of file meta data that just contain minimum data needed
200 // to server read operations, while still keeping the pointer to full metadata
201 // of the file in case it is needed.
202 struct FdWithKeyRange {
203   FileDescriptor fd;
204   FileMetaData* file_metadata;  // Point to all metadata
205   Slice smallest_key;    // slice that contain smallest key
206   Slice largest_key;     // slice that contain largest key
207 
FdWithKeyRangeFdWithKeyRange208   FdWithKeyRange()
209       : fd(),
210         file_metadata(nullptr),
211         smallest_key(),
212         largest_key() {
213   }
214 
FdWithKeyRangeFdWithKeyRange215   FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
216                  FileMetaData* _file_metadata)
217       : fd(_fd),
218         file_metadata(_file_metadata),
219         smallest_key(_smallest_key),
220         largest_key(_largest_key) {}
221 };
222 
223 // Data structure to store an array of FdWithKeyRange in one level
224 // Actual data is guaranteed to be stored closely
225 struct LevelFilesBrief {
226   size_t num_files;
227   FdWithKeyRange* files;
LevelFilesBriefLevelFilesBrief228   LevelFilesBrief() {
229     num_files = 0;
230     files = nullptr;
231   }
232 };
233 
234 // The state of a DB at any given time is referred to as a Version.
235 // Any modification to the Version is considered a Version Edit. A Version is
236 // constructed by joining a sequence of Version Edits. Version Edits are written
237 // to the MANIFEST file.
238 class VersionEdit {
239  public:
VersionEdit()240   VersionEdit() { Clear(); }
~VersionEdit()241   ~VersionEdit() { }
242 
243   void Clear();
244 
SetDBId(const std::string & db_id)245   void SetDBId(const std::string& db_id) {
246     has_db_id_ = true;
247     db_id_ = db_id;
248   }
249 
SetComparatorName(const Slice & name)250   void SetComparatorName(const Slice& name) {
251     has_comparator_ = true;
252     comparator_ = name.ToString();
253   }
SetLogNumber(uint64_t num)254   void SetLogNumber(uint64_t num) {
255     has_log_number_ = true;
256     log_number_ = num;
257   }
SetPrevLogNumber(uint64_t num)258   void SetPrevLogNumber(uint64_t num) {
259     has_prev_log_number_ = true;
260     prev_log_number_ = num;
261   }
SetNextFile(uint64_t num)262   void SetNextFile(uint64_t num) {
263     has_next_file_number_ = true;
264     next_file_number_ = num;
265   }
SetLastSequence(SequenceNumber seq)266   void SetLastSequence(SequenceNumber seq) {
267     has_last_sequence_ = true;
268     last_sequence_ = seq;
269   }
SetMaxColumnFamily(uint32_t max_column_family)270   void SetMaxColumnFamily(uint32_t max_column_family) {
271     has_max_column_family_ = true;
272     max_column_family_ = max_column_family;
273   }
SetMinLogNumberToKeep(uint64_t num)274   void SetMinLogNumberToKeep(uint64_t num) {
275     has_min_log_number_to_keep_ = true;
276     min_log_number_to_keep_ = num;
277   }
278 
has_db_id()279   bool has_db_id() { return has_db_id_; }
280 
has_log_number()281   bool has_log_number() { return has_log_number_; }
282 
log_number()283   uint64_t log_number() { return log_number_; }
284 
has_next_file_number()285   bool has_next_file_number() const { return has_next_file_number_; }
286 
next_file_number()287   uint64_t next_file_number() const { return next_file_number_; }
288 
289   // Add the specified file at the specified number.
290   // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
291   // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
292   // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
293   // referred to by this file if any, kInvalidBlobFileNumber otherwise.
AddFile(int level,uint64_t file,uint32_t file_path_id,uint64_t file_size,const InternalKey & smallest,const InternalKey & largest,const SequenceNumber & smallest_seqno,const SequenceNumber & largest_seqno,bool marked_for_compaction,uint64_t oldest_blob_file_number,uint64_t oldest_ancester_time,uint64_t file_creation_time)294   void AddFile(int level, uint64_t file, uint32_t file_path_id,
295                uint64_t file_size, const InternalKey& smallest,
296                const InternalKey& largest, const SequenceNumber& smallest_seqno,
297                const SequenceNumber& largest_seqno, bool marked_for_compaction,
298                uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time,
299                uint64_t file_creation_time) {
300     assert(smallest_seqno <= largest_seqno);
301     new_files_.emplace_back(
302         level, FileMetaData(file, file_path_id, file_size, smallest, largest,
303                             smallest_seqno, largest_seqno,
304                             marked_for_compaction, oldest_blob_file_number,
305                             oldest_ancester_time, file_creation_time));
306   }
307 
AddFile(int level,const FileMetaData & f)308   void AddFile(int level, const FileMetaData& f) {
309     assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
310     new_files_.emplace_back(level, f);
311   }
312 
313   // Delete the specified "file" from the specified "level".
DeleteFile(int level,uint64_t file)314   void DeleteFile(int level, uint64_t file) {
315     deleted_files_.insert({level, file});
316   }
317 
318   // Number of edits
NumEntries()319   size_t NumEntries() { return new_files_.size() + deleted_files_.size(); }
320 
IsColumnFamilyManipulation()321   bool IsColumnFamilyManipulation() {
322     return is_column_family_add_ || is_column_family_drop_;
323   }
324 
SetColumnFamily(uint32_t column_family_id)325   void SetColumnFamily(uint32_t column_family_id) {
326     column_family_ = column_family_id;
327   }
328 
329   // set column family ID by calling SetColumnFamily()
AddColumnFamily(const std::string & name)330   void AddColumnFamily(const std::string& name) {
331     assert(!is_column_family_drop_);
332     assert(!is_column_family_add_);
333     assert(NumEntries() == 0);
334     is_column_family_add_ = true;
335     column_family_name_ = name;
336   }
337 
338   // set column family ID by calling SetColumnFamily()
DropColumnFamily()339   void DropColumnFamily() {
340     assert(!is_column_family_drop_);
341     assert(!is_column_family_add_);
342     assert(NumEntries() == 0);
343     is_column_family_drop_ = true;
344   }
345 
346   // return true on success.
347   bool EncodeTo(std::string* dst) const;
348   Status DecodeFrom(const Slice& src);
349 
350   const char* DecodeNewFile4From(Slice* input);
351 
352   typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
353 
GetDeletedFiles()354   const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }
GetNewFiles()355   const std::vector<std::pair<int, FileMetaData>>& GetNewFiles() {
356     return new_files_;
357   }
358 
MarkAtomicGroup(uint32_t remaining_entries)359   void MarkAtomicGroup(uint32_t remaining_entries) {
360     is_in_atomic_group_ = true;
361     remaining_entries_ = remaining_entries;
362   }
363 
364   std::string DebugString(bool hex_key = false) const;
365   std::string DebugJSON(int edit_num, bool hex_key = false) const;
366 
GetDbId()367   const std::string GetDbId() { return db_id_; }
368 
369  private:
370   friend class ReactiveVersionSet;
371   friend class VersionSet;
372   friend class Version;
373   friend class AtomicGroupReadBuffer;
374 
375   bool GetLevel(Slice* input, int* level, const char** msg);
376 
377   int max_level_;
378   std::string db_id_;
379   std::string comparator_;
380   uint64_t log_number_;
381   uint64_t prev_log_number_;
382   uint64_t next_file_number_;
383   uint32_t max_column_family_;
384   // The most recent WAL log number that is deleted
385   uint64_t min_log_number_to_keep_;
386   SequenceNumber last_sequence_;
387   bool has_db_id_;
388   bool has_comparator_;
389   bool has_log_number_;
390   bool has_prev_log_number_;
391   bool has_next_file_number_;
392   bool has_last_sequence_;
393   bool has_max_column_family_;
394   bool has_min_log_number_to_keep_;
395 
396   DeletedFileSet deleted_files_;
397   std::vector<std::pair<int, FileMetaData>> new_files_;
398 
399   // Each version edit record should have column_family_ set
400   // If it's not set, it is default (0)
401   uint32_t column_family_;
402   // a version edit can be either column_family add or
403   // column_family drop. If it's column family add,
404   // it also includes column family name.
405   bool is_column_family_drop_;
406   bool is_column_family_add_;
407   std::string column_family_name_;
408 
409   bool is_in_atomic_group_;
410   uint32_t remaining_entries_;
411 };
412 
413 }  // namespace rocksdb
414