1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 // 5 // The representation of a DBImpl consists of a set of Versions. The 6 // newest version is called "current". Older versions may be kept 7 // around to provide a consistent view to live iterators. 8 // 9 // Each Version keeps track of a set of Table files per level. The 10 // entire set of versions is maintained in a VersionSet. 11 // 12 // Version,VersionSet are thread-compatible, but require external 13 // synchronization on all accesses. 14 15 #ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ 16 #define STORAGE_LEVELDB_DB_VERSION_SET_H_ 17 18 #include <map> 19 #include <set> 20 #include <vector> 21 #include "db/dbformat.h" 22 #include "db/version_edit.h" 23 #include "port/port.h" 24 #include "port/thread_annotations.h" 25 26 namespace leveldb { 27 28 namespace log { class Writer; } 29 30 class Compaction; 31 class Iterator; 32 class MemTable; 33 class TableBuilder; 34 class TableCache; 35 class Version; 36 class VersionSet; 37 class WritableFile; 38 39 // Return the smallest index i such that files[i]->largest >= key. 40 // Return files.size() if there is no such file. 41 // REQUIRES: "files" contains a sorted list of non-overlapping files. 42 extern int FindFile(const InternalKeyComparator& icmp, 43 const std::vector<FileMetaData*>& files, 44 const Slice& key); 45 46 // Returns true iff some file in "files" overlaps the user key range 47 // [*smallest,*largest]. 48 // smallest==NULL represents a key smaller than all keys in the DB. 49 // largest==NULL represents a key largest than all keys in the DB. 50 // REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges 51 // in sorted order. 52 extern bool SomeFileOverlapsRange( 53 const InternalKeyComparator& icmp, 54 bool disjoint_sorted_files, 55 const std::vector<FileMetaData*>& files, 56 const Slice* smallest_user_key, 57 const Slice* largest_user_key); 58 59 class Version { 60 public: 61 // Append to *iters a sequence of iterators that will 62 // yield the contents of this Version when merged together. 63 // REQUIRES: This version has been saved (see VersionSet::SaveTo) 64 void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters); 65 66 // Lookup the value for key. If found, store it in *val and 67 // return OK. Else return a non-OK status. Fills *stats. 68 // REQUIRES: lock is not held 69 struct GetStats { 70 FileMetaData* seek_file; 71 int seek_file_level; 72 }; 73 Status Get(const ReadOptions&, const LookupKey& key, std::string* val, 74 GetStats* stats); 75 76 // Adds "stats" into the current state. Returns true if a new 77 // compaction may need to be triggered, false otherwise. 78 // REQUIRES: lock is held 79 bool UpdateStats(const GetStats& stats); 80 81 // Record a sample of bytes read at the specified internal key. 82 // Samples are taken approximately once every config::kReadBytesPeriod 83 // bytes. Returns true if a new compaction may need to be triggered. 84 // REQUIRES: lock is held 85 bool RecordReadSample(Slice key); 86 87 // Reference count management (so Versions do not disappear out from 88 // under live iterators) 89 void Ref(); 90 void Unref(); 91 92 void GetOverlappingInputs( 93 int level, 94 const InternalKey* begin, // NULL means before all keys 95 const InternalKey* end, // NULL means after all keys 96 std::vector<FileMetaData*>* inputs); 97 98 // Returns true iff some file in the specified level overlaps 99 // some part of [*smallest_user_key,*largest_user_key]. 100 // smallest_user_key==NULL represents a key smaller than all keys in the DB. 101 // largest_user_key==NULL represents a key largest than all keys in the DB. 102 bool OverlapInLevel(int level, 103 const Slice* smallest_user_key, 104 const Slice* largest_user_key); 105 106 // Return the level at which we should place a new memtable compaction 107 // result that covers the range [smallest_user_key,largest_user_key]. 108 int PickLevelForMemTableOutput(const Slice& smallest_user_key, 109 const Slice& largest_user_key); 110 NumFiles(int level)111 int NumFiles(int level) const { return files_[level].size(); } 112 113 // Return a human readable string that describes this version's contents. 114 std::string DebugString() const; 115 116 private: 117 friend class Compaction; 118 friend class VersionSet; 119 120 class LevelFileNumIterator; 121 Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; 122 123 // Call func(arg, level, f) for every file that overlaps user_key in 124 // order from newest to oldest. If an invocation of func returns 125 // false, makes no more calls. 126 // 127 // REQUIRES: user portion of internal_key == user_key. 128 void ForEachOverlapping(Slice user_key, Slice internal_key, 129 void* arg, 130 bool (*func)(void*, int, FileMetaData*)); 131 132 VersionSet* vset_; // VersionSet to which this Version belongs 133 Version* next_; // Next version in linked list 134 Version* prev_; // Previous version in linked list 135 int refs_; // Number of live refs to this version 136 137 // List of files per level 138 std::vector<FileMetaData*> files_[config::kNumLevels]; 139 140 // Next file to compact based on seek stats. 141 FileMetaData* file_to_compact_; 142 int file_to_compact_level_; 143 144 // Level that should be compacted next and its compaction score. 145 // Score < 1 means compaction is not strictly needed. These fields 146 // are initialized by Finalize(). 147 double compaction_score_; 148 int compaction_level_; 149 Version(VersionSet * vset)150 explicit Version(VersionSet* vset) 151 : vset_(vset), next_(this), prev_(this), refs_(0), 152 file_to_compact_(NULL), 153 file_to_compact_level_(-1), 154 compaction_score_(-1), 155 compaction_level_(-1) { 156 } 157 158 ~Version(); 159 160 // No copying allowed 161 Version(const Version&); 162 void operator=(const Version&); 163 }; 164 165 class VersionSet { 166 public: 167 VersionSet(const std::string& dbname, 168 const Options* options, 169 TableCache* table_cache, 170 const InternalKeyComparator*); 171 ~VersionSet(); 172 173 // Apply *edit to the current version to form a new descriptor that 174 // is both saved to persistent state and installed as the new 175 // current version. Will release *mu while actually writing to the file. 176 // REQUIRES: *mu is held on entry. 177 // REQUIRES: no other thread concurrently calls LogAndApply() 178 Status LogAndApply(VersionEdit* edit, port::Mutex* mu) 179 EXCLUSIVE_LOCKS_REQUIRED(mu); 180 181 // Recover the last saved descriptor from persistent storage. 182 Status Recover(bool *save_manifest); 183 184 // Return the current version. current()185 Version* current() const { return current_; } 186 187 // Return the current manifest file number ManifestFileNumber()188 uint64_t ManifestFileNumber() const { return manifest_file_number_; } 189 190 // Allocate and return a new file number NewFileNumber()191 uint64_t NewFileNumber() { return next_file_number_++; } 192 193 // Arrange to reuse "file_number" unless a newer file number has 194 // already been allocated. 195 // REQUIRES: "file_number" was returned by a call to NewFileNumber(). ReuseFileNumber(uint64_t file_number)196 void ReuseFileNumber(uint64_t file_number) { 197 if (next_file_number_ == file_number + 1) { 198 next_file_number_ = file_number; 199 } 200 } 201 202 // Return the number of Table files at the specified level. 203 int NumLevelFiles(int level) const; 204 205 // Return the combined file size of all files at the specified level. 206 int64_t NumLevelBytes(int level) const; 207 208 // Return the last sequence number. LastSequence()209 uint64_t LastSequence() const { return last_sequence_; } 210 211 // Set the last sequence number to s. SetLastSequence(uint64_t s)212 void SetLastSequence(uint64_t s) { 213 assert(s >= last_sequence_); 214 last_sequence_ = s; 215 } 216 217 // Mark the specified file number as used. 218 void MarkFileNumberUsed(uint64_t number); 219 220 // Return the current log file number. LogNumber()221 uint64_t LogNumber() const { return log_number_; } 222 223 // Return the log file number for the log file that is currently 224 // being compacted, or zero if there is no such log file. PrevLogNumber()225 uint64_t PrevLogNumber() const { return prev_log_number_; } 226 227 // Pick level and inputs for a new compaction. 228 // Returns NULL if there is no compaction to be done. 229 // Otherwise returns a pointer to a heap-allocated object that 230 // describes the compaction. Caller should delete the result. 231 Compaction* PickCompaction(); 232 233 // Return a compaction object for compacting the range [begin,end] in 234 // the specified level. Returns NULL if there is nothing in that 235 // level that overlaps the specified range. Caller should delete 236 // the result. 237 Compaction* CompactRange( 238 int level, 239 const InternalKey* begin, 240 const InternalKey* end); 241 242 // Return the maximum overlapping data (in bytes) at next level for any 243 // file at a level >= 1. 244 int64_t MaxNextLevelOverlappingBytes(); 245 246 // Create an iterator that reads over the compaction inputs for "*c". 247 // The caller should delete the iterator when no longer needed. 248 Iterator* MakeInputIterator(Compaction* c); 249 250 // Returns true iff some level needs a compaction. NeedsCompaction()251 bool NeedsCompaction() const { 252 Version* v = current_; 253 return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL); 254 } 255 256 // Add all files listed in any live version to *live. 257 // May also mutate some internal state. 258 void AddLiveFiles(std::set<uint64_t>* live); 259 260 // Return the approximate offset in the database of the data for 261 // "key" as of version "v". 262 uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); 263 264 // Return a human-readable short (single-line) summary of the number 265 // of files per level. Uses *scratch as backing store. 266 struct LevelSummaryStorage { 267 char buffer[100]; 268 }; 269 const char* LevelSummary(LevelSummaryStorage* scratch) const; 270 271 private: 272 class Builder; 273 274 friend class Compaction; 275 friend class Version; 276 277 bool ReuseManifest(const std::string& dscname, const std::string& dscbase); 278 279 void Finalize(Version* v); 280 281 void GetRange(const std::vector<FileMetaData*>& inputs, 282 InternalKey* smallest, 283 InternalKey* largest); 284 285 void GetRange2(const std::vector<FileMetaData*>& inputs1, 286 const std::vector<FileMetaData*>& inputs2, 287 InternalKey* smallest, 288 InternalKey* largest); 289 290 void SetupOtherInputs(Compaction* c); 291 292 // Save current contents to *log 293 Status WriteSnapshot(log::Writer* log); 294 295 void AppendVersion(Version* v); 296 297 Env* const env_; 298 const std::string dbname_; 299 const Options* const options_; 300 TableCache* const table_cache_; 301 const InternalKeyComparator icmp_; 302 uint64_t next_file_number_; 303 uint64_t manifest_file_number_; 304 uint64_t last_sequence_; 305 uint64_t log_number_; 306 uint64_t prev_log_number_; // 0 or backing store for memtable being compacted 307 308 // Opened lazily 309 WritableFile* descriptor_file_; 310 log::Writer* descriptor_log_; 311 Version dummy_versions_; // Head of circular doubly-linked list of versions. 312 Version* current_; // == dummy_versions_.prev_ 313 314 // Per-level key at which the next compaction at that level should start. 315 // Either an empty string, or a valid InternalKey. 316 std::string compact_pointer_[config::kNumLevels]; 317 318 // No copying allowed 319 VersionSet(const VersionSet&); 320 void operator=(const VersionSet&); 321 }; 322 323 // A Compaction encapsulates information about a compaction. 324 class Compaction { 325 public: 326 ~Compaction(); 327 328 // Return the level that is being compacted. Inputs from "level" 329 // and "level+1" will be merged to produce a set of "level+1" files. level()330 int level() const { return level_; } 331 332 // Return the object that holds the edits to the descriptor done 333 // by this compaction. edit()334 VersionEdit* edit() { return &edit_; } 335 336 // "which" must be either 0 or 1 num_input_files(int which)337 int num_input_files(int which) const { return inputs_[which].size(); } 338 339 // Return the ith input file at "level()+which" ("which" must be 0 or 1). input(int which,int i)340 FileMetaData* input(int which, int i) const { return inputs_[which][i]; } 341 342 // Maximum size of files to build during this compaction. MaxOutputFileSize()343 uint64_t MaxOutputFileSize() const { return max_output_file_size_; } 344 345 // Is this a trivial compaction that can be implemented by just 346 // moving a single input file to the next level (no merging or splitting) 347 bool IsTrivialMove() const; 348 349 // Add all inputs to this compaction as delete operations to *edit. 350 void AddInputDeletions(VersionEdit* edit); 351 352 // Returns true if the information we have available guarantees that 353 // the compaction is producing data in "level+1" for which no data exists 354 // in levels greater than "level+1". 355 bool IsBaseLevelForKey(const Slice& user_key); 356 357 // Returns true iff we should stop building the current output 358 // before processing "internal_key". 359 bool ShouldStopBefore(const Slice& internal_key); 360 361 // Release the input version for the compaction, once the compaction 362 // is successful. 363 void ReleaseInputs(); 364 365 private: 366 friend class Version; 367 friend class VersionSet; 368 369 Compaction(const Options* options, int level); 370 371 int level_; 372 uint64_t max_output_file_size_; 373 Version* input_version_; 374 VersionEdit edit_; 375 376 // Each compaction reads inputs from "level_" and "level_+1" 377 std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs 378 379 // State used to check for number of overlapping grandparent files 380 // (parent == level_ + 1, grandparent == level_ + 2) 381 std::vector<FileMetaData*> grandparents_; 382 size_t grandparent_index_; // Index in grandparent_starts_ 383 bool seen_key_; // Some output key has been seen 384 int64_t overlapped_bytes_; // Bytes of overlap between current output 385 // and grandparent files 386 387 // State for implementing IsBaseLevelForKey 388 389 // level_ptrs_ holds indices into input_version_->levels_: our state 390 // is that we are positioned at one of the file ranges for each 391 // higher level than the ones involved in this compaction (i.e. for 392 // all L >= level_ + 2). 393 size_t level_ptrs_[config::kNumLevels]; 394 }; 395 396 } // namespace leveldb 397 398 #endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ 399