1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 // 6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 7 // Use of this source code is governed by a BSD-style license that can be 8 // found in the LICENSE file. See the AUTHORS file for names of contributors. 9 10 #pragma once 11 #include <algorithm> 12 #include <set> 13 #include <string> 14 #include <utility> 15 #include <vector> 16 #include "db/dbformat.h" 17 #include "memory/arena.h" 18 #include "rocksdb/cache.h" 19 #include "table/table_reader.h" 20 #include "util/autovector.h" 21 22 namespace rocksdb { 23 24 class VersionSet; 25 26 constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; 27 constexpr uint64_t kInvalidBlobFileNumber = 0; 28 constexpr uint64_t kUnknownOldestAncesterTime = 0; 29 constexpr uint64_t kUnknownFileCreationTime = 0; 30 31 extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); 32 33 // A copyable structure contains information needed to read data from an SST 34 // file. It can contain a pointer to a table reader opened for the file, or 35 // file number and size, which can be used to create a new table reader for it. 36 // The behavior is undefined when a copied of the structure is used when the 37 // file is not in any live version any more. 38 struct FileDescriptor { 39 // Table reader in table_reader_handle 40 TableReader* table_reader; 41 uint64_t packed_number_and_path_id; 42 uint64_t file_size; // File size in bytes 43 SequenceNumber smallest_seqno; // The smallest seqno in this file 44 SequenceNumber largest_seqno; // The largest seqno in this file 45 FileDescriptorFileDescriptor46 FileDescriptor() : FileDescriptor(0, 0, 0) {} 47 FileDescriptorFileDescriptor48 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) 49 : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {} 50 FileDescriptorFileDescriptor51 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size, 52 SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno) 53 : table_reader(nullptr), 54 packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), 55 file_size(_file_size), 56 smallest_seqno(_smallest_seqno), 57 largest_seqno(_largest_seqno) {} 58 FileDescriptorFileDescriptor59 FileDescriptor(const FileDescriptor& fd) { *this = fd; } 60 61 FileDescriptor& operator=(const FileDescriptor& fd) { 62 table_reader = fd.table_reader; 63 packed_number_and_path_id = fd.packed_number_and_path_id; 64 file_size = fd.file_size; 65 smallest_seqno = fd.smallest_seqno; 66 largest_seqno = fd.largest_seqno; 67 return *this; 68 } 69 GetNumberFileDescriptor70 uint64_t GetNumber() const { 71 return packed_number_and_path_id & kFileNumberMask; 72 } GetPathIdFileDescriptor73 uint32_t GetPathId() const { 74 return static_cast<uint32_t>( 75 packed_number_and_path_id / (kFileNumberMask + 1)); 76 } GetFileSizeFileDescriptor77 uint64_t GetFileSize() const { return file_size; } 78 }; 79 80 struct FileSampledStats { FileSampledStatsFileSampledStats81 FileSampledStats() : num_reads_sampled(0) {} FileSampledStatsFileSampledStats82 FileSampledStats(const FileSampledStats& other) { *this = other; } 83 FileSampledStats& operator=(const FileSampledStats& other) { 84 num_reads_sampled = other.num_reads_sampled.load(); 85 return *this; 86 } 87 88 // number of user reads to this file. 89 mutable std::atomic<uint64_t> num_reads_sampled; 90 }; 91 92 struct FileMetaData { 93 FileDescriptor fd; 94 InternalKey smallest; // Smallest internal key served by table 95 InternalKey largest; // Largest internal key served by table 96 97 // Needs to be disposed when refs becomes 0. 98 Cache::Handle* table_reader_handle = nullptr; 99 100 FileSampledStats stats; 101 102 // Stats for compensating deletion entries during compaction 103 104 // File size compensated by deletion entry. 105 // This is updated in Version::UpdateAccumulatedStats() first time when the 106 // file is created or loaded. After it is updated (!= 0), it is immutable. 107 uint64_t compensated_file_size = 0; 108 // These values can mutate, but they can only be read or written from 109 // single-threaded LogAndApply thread 110 uint64_t num_entries = 0; // the number of entries. 111 uint64_t num_deletions = 0; // the number of deletion entries. 112 uint64_t raw_key_size = 0; // total uncompressed key size. 113 uint64_t raw_value_size = 0; // total uncompressed value size. 114 115 int refs = 0; // Reference count 116 117 bool being_compacted = false; // Is this file undergoing compaction? 118 bool init_stats_from_file = false; // true if the data-entry stats of this 119 // file has initialized from file. 120 121 bool marked_for_compaction = false; // True if client asked us nicely to 122 // compact this file. 123 124 // Used only in BlobDB. The file number of the oldest blob file this SST file 125 // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. 126 uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; 127 128 // The file could be the compaction output from other SST files, which could 129 // in turn be outputs for compact older SST files. We track the memtable 130 // flush timestamp for the oldest SST file that eventaully contribute data 131 // to this file. 0 means the information is not available. 132 uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; 133 134 // Unix time when the SST file is created. 135 uint64_t file_creation_time = kUnknownFileCreationTime; 136 137 FileMetaData() = default; 138 FileMetaDataFileMetaData139 FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, 140 const InternalKey& smallest_key, const InternalKey& largest_key, 141 const SequenceNumber& smallest_seq, 142 const SequenceNumber& largest_seq, bool marked_for_compact, 143 uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, 144 uint64_t _file_creation_time) 145 : fd(file, file_path_id, file_size, smallest_seq, largest_seq), 146 smallest(smallest_key), 147 largest(largest_key), 148 marked_for_compaction(marked_for_compact), 149 oldest_blob_file_number(oldest_blob_file), 150 oldest_ancester_time(_oldest_ancester_time), 151 file_creation_time(_file_creation_time) { 152 TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); 153 } 154 155 // REQUIRED: Keys must be given to the function in sorted order (it expects 156 // the last key to be the largest). 157 void UpdateBoundaries(const Slice& key, const Slice& value, 158 SequenceNumber seqno, ValueType value_type); 159 160 // Unlike UpdateBoundaries, ranges do not need to be presented in any 161 // particular order. UpdateBoundariesForRangeFileMetaData162 void UpdateBoundariesForRange(const InternalKey& start, 163 const InternalKey& end, SequenceNumber seqno, 164 const InternalKeyComparator& icmp) { 165 if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) { 166 smallest = start; 167 } 168 if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { 169 largest = end; 170 } 171 fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); 172 fd.largest_seqno = std::max(fd.largest_seqno, seqno); 173 } 174 175 // Try to get oldest ancester time from the class itself or table properties 176 // if table reader is already pinned. 177 // 0 means the information is not available. TryGetOldestAncesterTimeFileMetaData178 uint64_t TryGetOldestAncesterTime() { 179 if (oldest_ancester_time != kUnknownOldestAncesterTime) { 180 return oldest_ancester_time; 181 } else if (fd.table_reader != nullptr && 182 fd.table_reader->GetTableProperties() != nullptr) { 183 return fd.table_reader->GetTableProperties()->creation_time; 184 } 185 return kUnknownOldestAncesterTime; 186 } 187 TryGetFileCreationTimeFileMetaData188 uint64_t TryGetFileCreationTime() { 189 if (file_creation_time != kUnknownFileCreationTime) { 190 return file_creation_time; 191 } else if (fd.table_reader != nullptr && 192 fd.table_reader->GetTableProperties() != nullptr) { 193 return fd.table_reader->GetTableProperties()->file_creation_time; 194 } 195 return kUnknownFileCreationTime; 196 } 197 }; 198 199 // A compressed copy of file meta data that just contain minimum data needed 200 // to server read operations, while still keeping the pointer to full metadata 201 // of the file in case it is needed. 202 struct FdWithKeyRange { 203 FileDescriptor fd; 204 FileMetaData* file_metadata; // Point to all metadata 205 Slice smallest_key; // slice that contain smallest key 206 Slice largest_key; // slice that contain largest key 207 FdWithKeyRangeFdWithKeyRange208 FdWithKeyRange() 209 : fd(), 210 file_metadata(nullptr), 211 smallest_key(), 212 largest_key() { 213 } 214 FdWithKeyRangeFdWithKeyRange215 FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key, 216 FileMetaData* _file_metadata) 217 : fd(_fd), 218 file_metadata(_file_metadata), 219 smallest_key(_smallest_key), 220 largest_key(_largest_key) {} 221 }; 222 223 // Data structure to store an array of FdWithKeyRange in one level 224 // Actual data is guaranteed to be stored closely 225 struct LevelFilesBrief { 226 size_t num_files; 227 FdWithKeyRange* files; LevelFilesBriefLevelFilesBrief228 LevelFilesBrief() { 229 num_files = 0; 230 files = nullptr; 231 } 232 }; 233 234 // The state of a DB at any given time is referred to as a Version. 235 // Any modification to the Version is considered a Version Edit. A Version is 236 // constructed by joining a sequence of Version Edits. Version Edits are written 237 // to the MANIFEST file. 238 class VersionEdit { 239 public: VersionEdit()240 VersionEdit() { Clear(); } ~VersionEdit()241 ~VersionEdit() { } 242 243 void Clear(); 244 SetDBId(const std::string & db_id)245 void SetDBId(const std::string& db_id) { 246 has_db_id_ = true; 247 db_id_ = db_id; 248 } 249 SetComparatorName(const Slice & name)250 void SetComparatorName(const Slice& name) { 251 has_comparator_ = true; 252 comparator_ = name.ToString(); 253 } SetLogNumber(uint64_t num)254 void SetLogNumber(uint64_t num) { 255 has_log_number_ = true; 256 log_number_ = num; 257 } SetPrevLogNumber(uint64_t num)258 void SetPrevLogNumber(uint64_t num) { 259 has_prev_log_number_ = true; 260 prev_log_number_ = num; 261 } SetNextFile(uint64_t num)262 void SetNextFile(uint64_t num) { 263 has_next_file_number_ = true; 264 next_file_number_ = num; 265 } SetLastSequence(SequenceNumber seq)266 void SetLastSequence(SequenceNumber seq) { 267 has_last_sequence_ = true; 268 last_sequence_ = seq; 269 } SetMaxColumnFamily(uint32_t max_column_family)270 void SetMaxColumnFamily(uint32_t max_column_family) { 271 has_max_column_family_ = true; 272 max_column_family_ = max_column_family; 273 } SetMinLogNumberToKeep(uint64_t num)274 void SetMinLogNumberToKeep(uint64_t num) { 275 has_min_log_number_to_keep_ = true; 276 min_log_number_to_keep_ = num; 277 } 278 has_db_id()279 bool has_db_id() { return has_db_id_; } 280 has_log_number()281 bool has_log_number() { return has_log_number_; } 282 log_number()283 uint64_t log_number() { return log_number_; } 284 has_next_file_number()285 bool has_next_file_number() const { return has_next_file_number_; } 286 next_file_number()287 uint64_t next_file_number() const { return next_file_number_; } 288 289 // Add the specified file at the specified number. 290 // REQUIRES: This version has not been saved (see VersionSet::SaveTo) 291 // REQUIRES: "smallest" and "largest" are smallest and largest keys in file 292 // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file 293 // referred to by this file if any, kInvalidBlobFileNumber otherwise. AddFile(int level,uint64_t file,uint32_t file_path_id,uint64_t file_size,const InternalKey & smallest,const InternalKey & largest,const SequenceNumber & smallest_seqno,const SequenceNumber & largest_seqno,bool marked_for_compaction,uint64_t oldest_blob_file_number,uint64_t oldest_ancester_time,uint64_t file_creation_time)294 void AddFile(int level, uint64_t file, uint32_t file_path_id, 295 uint64_t file_size, const InternalKey& smallest, 296 const InternalKey& largest, const SequenceNumber& smallest_seqno, 297 const SequenceNumber& largest_seqno, bool marked_for_compaction, 298 uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, 299 uint64_t file_creation_time) { 300 assert(smallest_seqno <= largest_seqno); 301 new_files_.emplace_back( 302 level, FileMetaData(file, file_path_id, file_size, smallest, largest, 303 smallest_seqno, largest_seqno, 304 marked_for_compaction, oldest_blob_file_number, 305 oldest_ancester_time, file_creation_time)); 306 } 307 AddFile(int level,const FileMetaData & f)308 void AddFile(int level, const FileMetaData& f) { 309 assert(f.fd.smallest_seqno <= f.fd.largest_seqno); 310 new_files_.emplace_back(level, f); 311 } 312 313 // Delete the specified "file" from the specified "level". DeleteFile(int level,uint64_t file)314 void DeleteFile(int level, uint64_t file) { 315 deleted_files_.insert({level, file}); 316 } 317 318 // Number of edits NumEntries()319 size_t NumEntries() { return new_files_.size() + deleted_files_.size(); } 320 IsColumnFamilyManipulation()321 bool IsColumnFamilyManipulation() { 322 return is_column_family_add_ || is_column_family_drop_; 323 } 324 SetColumnFamily(uint32_t column_family_id)325 void SetColumnFamily(uint32_t column_family_id) { 326 column_family_ = column_family_id; 327 } 328 329 // set column family ID by calling SetColumnFamily() AddColumnFamily(const std::string & name)330 void AddColumnFamily(const std::string& name) { 331 assert(!is_column_family_drop_); 332 assert(!is_column_family_add_); 333 assert(NumEntries() == 0); 334 is_column_family_add_ = true; 335 column_family_name_ = name; 336 } 337 338 // set column family ID by calling SetColumnFamily() DropColumnFamily()339 void DropColumnFamily() { 340 assert(!is_column_family_drop_); 341 assert(!is_column_family_add_); 342 assert(NumEntries() == 0); 343 is_column_family_drop_ = true; 344 } 345 346 // return true on success. 347 bool EncodeTo(std::string* dst) const; 348 Status DecodeFrom(const Slice& src); 349 350 const char* DecodeNewFile4From(Slice* input); 351 352 typedef std::set<std::pair<int, uint64_t>> DeletedFileSet; 353 GetDeletedFiles()354 const DeletedFileSet& GetDeletedFiles() { return deleted_files_; } GetNewFiles()355 const std::vector<std::pair<int, FileMetaData>>& GetNewFiles() { 356 return new_files_; 357 } 358 MarkAtomicGroup(uint32_t remaining_entries)359 void MarkAtomicGroup(uint32_t remaining_entries) { 360 is_in_atomic_group_ = true; 361 remaining_entries_ = remaining_entries; 362 } 363 364 std::string DebugString(bool hex_key = false) const; 365 std::string DebugJSON(int edit_num, bool hex_key = false) const; 366 GetDbId()367 const std::string GetDbId() { return db_id_; } 368 369 private: 370 friend class ReactiveVersionSet; 371 friend class VersionSet; 372 friend class Version; 373 friend class AtomicGroupReadBuffer; 374 375 bool GetLevel(Slice* input, int* level, const char** msg); 376 377 int max_level_; 378 std::string db_id_; 379 std::string comparator_; 380 uint64_t log_number_; 381 uint64_t prev_log_number_; 382 uint64_t next_file_number_; 383 uint32_t max_column_family_; 384 // The most recent WAL log number that is deleted 385 uint64_t min_log_number_to_keep_; 386 SequenceNumber last_sequence_; 387 bool has_db_id_; 388 bool has_comparator_; 389 bool has_log_number_; 390 bool has_prev_log_number_; 391 bool has_next_file_number_; 392 bool has_last_sequence_; 393 bool has_max_column_family_; 394 bool has_min_log_number_to_keep_; 395 396 DeletedFileSet deleted_files_; 397 std::vector<std::pair<int, FileMetaData>> new_files_; 398 399 // Each version edit record should have column_family_ set 400 // If it's not set, it is default (0) 401 uint32_t column_family_; 402 // a version edit can be either column_family add or 403 // column_family drop. If it's column family add, 404 // it also includes column family name. 405 bool is_column_family_drop_; 406 bool is_column_family_add_; 407 std::string column_family_name_; 408 409 bool is_in_atomic_group_; 410 uint32_t remaining_entries_; 411 }; 412 413 } // namespace rocksdb 414