1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 // 6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 7 // Use of this source code is governed by a BSD-style license that can be 8 // found in the LICENSE file. See the AUTHORS file for names of contributors. 9 10 #pragma once 11 12 #include <unordered_map> 13 #include <string> 14 #include <vector> 15 #include <atomic> 16 17 #include "db/memtable_list.h" 18 #include "db/table_cache.h" 19 #include "db/table_properties_collector.h" 20 #include "db/write_batch_internal.h" 21 #include "db/write_controller.h" 22 #include "options/cf_options.h" 23 #include "rocksdb/compaction_job_stats.h" 24 #include "rocksdb/db.h" 25 #include "rocksdb/env.h" 26 #include "rocksdb/options.h" 27 #include "trace_replay/block_cache_tracer.h" 28 #include "util/thread_local.h" 29 30 namespace ROCKSDB_NAMESPACE { 31 32 class Version; 33 class VersionSet; 34 class VersionStorageInfo; 35 class MemTable; 36 class MemTableListVersion; 37 class CompactionPicker; 38 class Compaction; 39 class InternalKey; 40 class InternalStats; 41 class ColumnFamilyData; 42 class DBImpl; 43 class LogBuffer; 44 class InstrumentedMutex; 45 class InstrumentedMutexLock; 46 struct SuperVersionContext; 47 48 extern const double kIncSlowdownRatio; 49 // This file contains a list of data structures for managing column family 50 // level metadata. 
//
// The basic relationships among classes declared here are illustrated as
// follows:
//
//       +----------------------+    +----------------------+   +--------+
//   +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 |   | DBImpl |
//   |   +----------------------+ |  +----------------------+   +----+---+
//   | +--------------------------+                                  |
//   | |                               +-----------------------------+
//   | |                               |
//   | | +-----------------------------v-------------------------------+
//   | | |                                                             |
//   | | |                       ColumnFamilySet                       |
//   | | |                                                             |
//   | | +-------------+--------------------------+----------------+---+
//   | |               |                          |                |
//   | +-------------------------------------+    |                |
//   |                 |                     |    |                v
//   |   +-------------v-------------+ +-----v----v---------+
//   |   |                           | |                    |
//   |   |     ColumnFamilyData 1    | | ColumnFamilyData 2 |    ......
//   |   |                           | |                    |
//   +--->                           | |                    |
//       |                 +---------+ |                    |
//       |                 | MemTable| |                    |
//       |                 |  List   | |                    |
//       +--------+---+--+-+----+----+ +--------------------++
//                |   |  |      |
//                |   |  |      |
//                |   |  |      +-----------------------+
//                |   |  +-----------+                  |
//                v   +--------+     |                  |
//       +--------+--------+   |     |                  |
//       |                 |   |     |       +----------v----------+
// +---> |SuperVersion 1.a +----------------->                     |
//       |                 +------+  |       | MemTableListVersion |
//       +---+-------------+   |  |  |       |                     |
//           |                 |  |  |       +----+------------+---+
//           |      current    |  |  |            |            |
//           |   +-------------+  |  |mem         |            |
//           |   |                |  |            |            |
//         +-v---v-------+    +---v--v---+  +-----v----+  +----v-----+
//         |             |    |          |  |          |  |          |
//         | Version 1.a |    | memtable |  | memtable |  | memtable |
//         |             |    |   1.a    |  |   1.b    |  |   1.c    |
//         +-------------+    |          |  |          |  |          |
//                            +----------+  +----------+  +----------+
//
// DBImpl keeps a ColumnFamilySet, which references all column families by
// pointing to respective ColumnFamilyData object of each column family.
// This is how DBImpl can list and operate on all the column families.
// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
// when a user executes a query, it can directly find memtables and Version
// as well as SuperVersion to the column family, without going through
// ColumnFamilySet.
//
// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
// and SST files) indirectly, while ongoing operations may hold references
// to a current or an out-of-date SuperVersion, which in turn points to a
// point-in-time view of the LSM-tree. This guarantees the memtables and SST
// files being operated on will not go away, until the SuperVersion is
// unreferenced to 0 and destroyed.
//
// The following graph illustrates a possible set of referencing relationships:
//
// Column       +--------------+      current       +-----------+
// Family +---->+              +------------------->+           |
//  Data        | SuperVersion +----------+         | Version A |
//              |      3       |   imm    |         |           |
// Iter2 +----->+              |  +-------v------+  +-----------+
//              +-----+--------+  | MemtableList +----------------> Empty
//                    |           |   Version r  |  +-----------+
//                    |           +--------------+  |           |
//                    +------------------+   current| Version B |
//              +--------------+         |   +----->+           |
//              |              |         |   |      +-----+-----+
// Compaction +>+ SuperVersion +-------------+            ^
//    Job       |      2       +------+  |                |current
//              |              +----+ |  |     mem        |    +------------+
//              +--------------+    | |  +--------------------->            |
//                                  | +------------------------> MemTable a |
//                                  |          mem        |    |            |
//              +--------------+    |                     |    +------------+
//              |              +--------------------------+
//  Iter1 +-----> SuperVersion |    |                          +------------+
//              |      1       +------------------------------>+            |
//              |              +-+  |        mem               | MemTable b |
//              +--------------+ |  |                          |            |
//                               |  |    +--------------+      +-----^------+
//                               |  |imm | MemtableList |            |
//                               |  +--->+  Version s   +------------+
//                               |       +--------------+
//                               |       +--------------+
//                               |       | MemtableList |
//                               +------>+ Version t    +-------->
Empty 146 // imm +--------------+ 147 // 148 // In this example, even if the current LSM-tree consists of Version A and 149 // memtable a, which is also referenced by SuperVersion, two older SuperVersion 150 // SuperVersion2 and Superversion1 still exist, and are referenced by a 151 // compaction job and an old iterator Iter1, respectively. SuperVersion2 152 // contains Version B, memtable a and memtable b; SuperVersion1 contains 153 // Version B and memtable b (mutable). As a result, Version B and memtable b 154 // are prevented from being destroyed or deleted. 155 156 // ColumnFamilyHandleImpl is the class that clients use to access different 157 // column families. It has non-trivial destructor, which gets called when client 158 // is done using the column family 159 class ColumnFamilyHandleImpl : public ColumnFamilyHandle { 160 public: 161 // create while holding the mutex 162 ColumnFamilyHandleImpl( 163 ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex); 164 // destroy without mutex 165 virtual ~ColumnFamilyHandleImpl(); cfd()166 virtual ColumnFamilyData* cfd() const { return cfd_; } 167 168 virtual uint32_t GetID() const override; 169 virtual const std::string& GetName() const override; 170 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override; 171 virtual const Comparator* GetComparator() const override; 172 173 private: 174 ColumnFamilyData* cfd_; 175 DBImpl* db_; 176 InstrumentedMutex* mutex_; 177 }; 178 179 // Does not ref-count ColumnFamilyData 180 // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter 181 // calls DBImpl methods. When this happens, MemTableInserter need access to 182 // ColumnFamilyHandle (same as the client would need). 
In that case, we feed 183 // MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl 184 // methods 185 class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { 186 public: ColumnFamilyHandleInternal()187 ColumnFamilyHandleInternal() 188 : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {} 189 SetCFD(ColumnFamilyData * _cfd)190 void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } cfd()191 virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } 192 193 private: 194 ColumnFamilyData* internal_cfd_; 195 }; 196 197 // holds references to memtable, all immutable memtables and version 198 struct SuperVersion { 199 // Accessing members of this class is not thread-safe and requires external 200 // synchronization (ie db mutex held or on write thread). 201 ColumnFamilyData* cfd; 202 MemTable* mem; 203 MemTableListVersion* imm; 204 Version* current; 205 MutableCFOptions mutable_cf_options; 206 // Version number of the current SuperVersion 207 uint64_t version_number; 208 WriteStallCondition write_stall_condition; 209 210 InstrumentedMutex* db_mutex; 211 212 // should be called outside the mutex 213 SuperVersion() = default; 214 ~SuperVersion(); 215 SuperVersion* Ref(); 216 // If Unref() returns true, Cleanup() should be called with mutex held 217 // before deleting this SuperVersion. 218 bool Unref(); 219 220 // call these two methods with db mutex held 221 // Cleanup unrefs mem, imm and current. Also, it stores all memtables 222 // that needs to be deleted in to_delete vector. Unrefing those 223 // objects needs to be done in the mutex 224 void Cleanup(); 225 void Init(ColumnFamilyData* new_cfd, MemTable* new_mem, 226 MemTableListVersion* new_imm, Version* new_current); 227 228 // The value of dummy is not actually used. kSVInUse takes its address as a 229 // mark in the thread local storage to indicate the SuperVersion is in use 230 // by thread. 
This way, the value of kSVInUse is guaranteed to have no 231 // conflict with SuperVersion object address and portable on different 232 // platform. 233 static int dummy; 234 static void* const kSVInUse; 235 static void* const kSVObsolete; 236 237 private: 238 std::atomic<uint32_t> refs; 239 // We need to_delete because during Cleanup(), imm->Unref() returns 240 // all memtables that we need to free through this vector. We then 241 // delete all those memtables outside of mutex, during destruction 242 autovector<MemTable*> to_delete; 243 }; 244 245 extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); 246 247 extern Status CheckConcurrentWritesSupported( 248 const ColumnFamilyOptions& cf_options); 249 250 extern Status CheckCFPathsSupported(const DBOptions& db_options, 251 const ColumnFamilyOptions& cf_options); 252 253 extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, 254 const ColumnFamilyOptions& src); 255 // Wrap user defined table proproties collector factories `from cf_options` 256 // into internal ones in int_tbl_prop_collector_factories. Add a system internal 257 // one too. 258 extern void GetIntTblPropCollectorFactory( 259 const ImmutableCFOptions& ioptions, 260 std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* 261 int_tbl_prop_collector_factories); 262 263 class ColumnFamilySet; 264 265 // This class keeps all the data that a column family needs. 266 // Most methods require DB mutex held, unless otherwise noted 267 class ColumnFamilyData { 268 public: 269 ~ColumnFamilyData(); 270 271 // thread-safe GetID()272 uint32_t GetID() const { return id_; } 273 // thread-safe GetName()274 const std::string& GetName() const { return name_; } 275 276 // Ref() can only be called from a context where the caller can guarantee 277 // that ColumnFamilyData is alive (while holding a non-zero ref already, 278 // holding a DB mutex, or as the leader in a write batch group). 
Ref()279 void Ref() { refs_.fetch_add(1); } 280 281 // Unref decreases the reference count, but does not handle deletion 282 // when the count goes to 0. If this method returns true then the 283 // caller should delete the instance immediately, or later, by calling 284 // FreeDeadColumnFamilies(). Unref() can only be called while holding 285 // a DB mutex, or during single-threaded recovery. Unref()286 bool Unref() { 287 int old_refs = refs_.fetch_sub(1); 288 assert(old_refs > 0); 289 return old_refs == 1; 290 } 291 292 // UnrefAndTryDelete() decreases the reference count and do free if needed, 293 // return true if this is freed else false, UnrefAndTryDelete() can only 294 // be called while holding a DB mutex, or during single-threaded recovery. 295 bool UnrefAndTryDelete(); 296 297 // SetDropped() can only be called under following conditions: 298 // 1) Holding a DB mutex, 299 // 2) from single-threaded write thread, AND 300 // 3) from single-threaded VersionSet::LogAndApply() 301 // After dropping column family no other operation on that column family 302 // will be executed. All the files and memory will be, however, kept around 303 // until client drops the column family handle. That way, client can still 304 // access data from dropped column family. 305 // Column family can be dropped and still alive. In that state: 306 // *) Compaction and flush is not executed on the dropped column family. 307 // *) Client can continue reading from column family. 
Writes will fail unless 308 // WriteOptions::ignore_missing_column_families is true 309 // When the dropped column family is unreferenced, then we: 310 // *) Remove column family from the linked list maintained by ColumnFamilySet 311 // *) delete all memory associated with that column family 312 // *) delete all the files associated with that column family 313 void SetDropped(); IsDropped()314 bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); } 315 316 // thread-safe NumberLevels()317 int NumberLevels() const { return ioptions_.num_levels; } 318 SetLogNumber(uint64_t log_number)319 void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } GetLogNumber()320 uint64_t GetLogNumber() const { return log_number_; } 321 SetFlushReason(FlushReason flush_reason)322 void SetFlushReason(FlushReason flush_reason) { 323 flush_reason_ = flush_reason; 324 } GetFlushReason()325 FlushReason GetFlushReason() const { return flush_reason_; } 326 // thread-safe 327 const FileOptions* soptions() const; ioptions()328 const ImmutableCFOptions* ioptions() const { return &ioptions_; } 329 // REQUIRES: DB mutex held 330 // This returns the MutableCFOptions used by current SuperVersion 331 // You should use this API to reference MutableCFOptions most of the time. GetCurrentMutableCFOptions()332 const MutableCFOptions* GetCurrentMutableCFOptions() const { 333 return &(super_version_->mutable_cf_options); 334 } 335 // REQUIRES: DB mutex held 336 // This returns the latest MutableCFOptions, which may be not in effect yet. GetLatestMutableCFOptions()337 const MutableCFOptions* GetLatestMutableCFOptions() const { 338 return &mutable_cf_options_; 339 } 340 341 // REQUIRES: DB mutex held 342 // Build ColumnFamiliesOptions with immutable options and latest mutable 343 // options. 
344 ColumnFamilyOptions GetLatestCFOptions() const; 345 is_delete_range_supported()346 bool is_delete_range_supported() { return is_delete_range_supported_; } 347 348 // Validate CF options against DB options 349 static Status ValidateOptions(const DBOptions& db_options, 350 const ColumnFamilyOptions& cf_options); 351 #ifndef ROCKSDB_LITE 352 // REQUIRES: DB mutex held 353 Status SetOptions( 354 const DBOptions& db_options, 355 const std::unordered_map<std::string, std::string>& options_map); 356 #endif // ROCKSDB_LITE 357 internal_stats()358 InternalStats* internal_stats() { return internal_stats_.get(); } 359 imm()360 MemTableList* imm() { return &imm_; } mem()361 MemTable* mem() { return mem_; } current()362 Version* current() { return current_; } dummy_versions()363 Version* dummy_versions() { return dummy_versions_; } 364 void SetCurrent(Version* _current); 365 uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held 366 uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held 367 uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held SetMemtable(MemTable * new_mem)368 void SetMemtable(MemTable* new_mem) { 369 uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; 370 new_mem->SetID(memtable_id); 371 mem_ = new_mem; 372 } 373 374 // calculate the oldest log needed for the durability of this column family 375 uint64_t OldestLogToKeep(); 376 377 // See Memtable constructor for explanation of earliest_seq param. 
378 MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options, 379 SequenceNumber earliest_seq); 380 void CreateNewMemtable(const MutableCFOptions& mutable_cf_options, 381 SequenceNumber earliest_seq); 382 table_cache()383 TableCache* table_cache() const { return table_cache_.get(); } 384 385 // See documentation in compaction_picker.h 386 // REQUIRES: DB mutex held 387 bool NeedsCompaction() const; 388 // REQUIRES: DB mutex held 389 Compaction* PickCompaction(const MutableCFOptions& mutable_options, 390 LogBuffer* log_buffer); 391 392 // Check if the passed range overlap with any running compactions. 393 // REQUIRES: DB mutex held 394 bool RangeOverlapWithCompaction(const Slice& smallest_user_key, 395 const Slice& largest_user_key, 396 int level) const; 397 398 // Check if the passed ranges overlap with any unflushed memtables 399 // (immutable or mutable). 400 // 401 // @param super_version A referenced SuperVersion that will be held for the 402 // duration of this function. 403 // 404 // Thread-safe 405 Status RangesOverlapWithMemtables(const autovector<Range>& ranges, 406 SuperVersion* super_version, bool* overlap); 407 408 // A flag to tell a manual compaction is to compact all levels together 409 // instead of a specific level. 410 static const int kCompactAllLevels; 411 // A flag to tell a manual compaction's output is base level. 
412 static const int kCompactToBaseLevel; 413 // REQUIRES: DB mutex held 414 Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, 415 int input_level, int output_level, 416 const CompactRangeOptions& compact_range_options, 417 const InternalKey* begin, const InternalKey* end, 418 InternalKey** compaction_end, bool* manual_conflict, 419 uint64_t max_file_num_to_ignore); 420 compaction_picker()421 CompactionPicker* compaction_picker() { return compaction_picker_.get(); } 422 // thread-safe user_comparator()423 const Comparator* user_comparator() const { 424 return internal_comparator_.user_comparator(); 425 } 426 // thread-safe internal_comparator()427 const InternalKeyComparator& internal_comparator() const { 428 return internal_comparator_; 429 } 430 431 const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* int_tbl_prop_collector_factories()432 int_tbl_prop_collector_factories() const { 433 return &int_tbl_prop_collector_factories_; 434 } 435 GetSuperVersion()436 SuperVersion* GetSuperVersion() { return super_version_; } 437 // thread-safe 438 // Return a already referenced SuperVersion to be used safely. 439 SuperVersion* GetReferencedSuperVersion(DBImpl* db); 440 // thread-safe 441 // Get SuperVersion stored in thread local storage. If it does not exist, 442 // get a reference from a current SuperVersion. 443 SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); 444 // Try to return SuperVersion back to thread local storage. Retrun true on 445 // success and false on failure. It fails when the thread local storage 446 // contains anything other than SuperVersion::kSVInUse flag. 
447 bool ReturnThreadLocalSuperVersion(SuperVersion* sv); 448 // thread-safe GetSuperVersionNumber()449 uint64_t GetSuperVersionNumber() const { 450 return super_version_number_.load(); 451 } 452 // will return a pointer to SuperVersion* if previous SuperVersion 453 // if its reference count is zero and needs deletion or nullptr if not 454 // As argument takes a pointer to allocated SuperVersion to enable 455 // the clients to allocate SuperVersion outside of mutex. 456 // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() 457 void InstallSuperVersion(SuperVersionContext* sv_context, 458 InstrumentedMutex* db_mutex, 459 const MutableCFOptions& mutable_cf_options); 460 void InstallSuperVersion(SuperVersionContext* sv_context, 461 InstrumentedMutex* db_mutex); 462 463 void ResetThreadLocalSuperVersions(); 464 465 // Protected by DB mutex set_queued_for_flush(bool value)466 void set_queued_for_flush(bool value) { queued_for_flush_ = value; } set_queued_for_compaction(bool value)467 void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; } queued_for_flush()468 bool queued_for_flush() { return queued_for_flush_; } queued_for_compaction()469 bool queued_for_compaction() { return queued_for_compaction_; } 470 471 enum class WriteStallCause { 472 kNone, 473 kMemtableLimit, 474 kL0FileCountLimit, 475 kPendingCompactionBytes, 476 }; 477 static std::pair<WriteStallCondition, WriteStallCause> 478 GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, 479 uint64_t num_compaction_needed_bytes, 480 const MutableCFOptions& mutable_cf_options); 481 482 // Recalculate some small conditions, which are changed only during 483 // compaction, adding new memtable and/or 484 // recalculation of compaction score. 
These values are used in 485 // DBImpl::MakeRoomForWrite function to decide, if it need to make 486 // a write stall 487 WriteStallCondition RecalculateWriteStallConditions( 488 const MutableCFOptions& mutable_cf_options); 489 set_initialized()490 void set_initialized() { initialized_.store(true); } 491 initialized()492 bool initialized() const { return initialized_.load(); } 493 initial_cf_options()494 const ColumnFamilyOptions& initial_cf_options() { 495 return initial_cf_options_; 496 } 497 498 Env::WriteLifeTimeHint CalculateSSTWriteHint(int level); 499 500 // created_dirs remembers directory created, so that we don't need to call 501 // the same data creation operation again. 502 Status AddDirectories( 503 std::map<std::string, std::shared_ptr<Directory>>* created_dirs); 504 505 Directory* GetDataDir(size_t path_id) const; 506 TEST_GetLocalSV()507 ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } 508 509 private: 510 friend class ColumnFamilySet; 511 ColumnFamilyData(uint32_t id, const std::string& name, 512 Version* dummy_versions, Cache* table_cache, 513 WriteBufferManager* write_buffer_manager, 514 const ColumnFamilyOptions& options, 515 const ImmutableDBOptions& db_options, 516 const FileOptions& file_options, 517 ColumnFamilySet* column_family_set, 518 BlockCacheTracer* const block_cache_tracer); 519 520 uint32_t id_; 521 const std::string name_; 522 Version* dummy_versions_; // Head of circular doubly-linked list of versions. 
523 Version* current_; // == dummy_versions->prev_ 524 525 std::atomic<int> refs_; // outstanding references to ColumnFamilyData 526 std::atomic<bool> initialized_; 527 std::atomic<bool> dropped_; // true if client dropped it 528 529 const InternalKeyComparator internal_comparator_; 530 std::vector<std::unique_ptr<IntTblPropCollectorFactory>> 531 int_tbl_prop_collector_factories_; 532 533 const ColumnFamilyOptions initial_cf_options_; 534 const ImmutableCFOptions ioptions_; 535 MutableCFOptions mutable_cf_options_; 536 537 const bool is_delete_range_supported_; 538 539 std::unique_ptr<TableCache> table_cache_; 540 541 std::unique_ptr<InternalStats> internal_stats_; 542 543 WriteBufferManager* write_buffer_manager_; 544 545 MemTable* mem_; 546 MemTableList imm_; 547 SuperVersion* super_version_; 548 549 // An ordinal representing the current SuperVersion. Updated by 550 // InstallSuperVersion(), i.e. incremented every time super_version_ 551 // changes. 552 std::atomic<uint64_t> super_version_number_; 553 554 // Thread's local copy of SuperVersion pointer 555 // This needs to be destructed before mutex_ 556 std::unique_ptr<ThreadLocalPtr> local_sv_; 557 558 // pointers for a circular linked list. we use it to support iterations over 559 // all column families that are alive (note: dropped column families can also 560 // be alive as long as client holds a reference) 561 ColumnFamilyData* next_; 562 ColumnFamilyData* prev_; 563 564 // This is the earliest log file number that contains data from this 565 // Column Family. 
All earlier log files must be ignored and not 566 // recovered from 567 uint64_t log_number_; 568 569 std::atomic<FlushReason> flush_reason_; 570 571 // An object that keeps all the compaction stats 572 // and picks the next compaction 573 std::unique_ptr<CompactionPicker> compaction_picker_; 574 575 ColumnFamilySet* column_family_set_; 576 577 std::unique_ptr<WriteControllerToken> write_controller_token_; 578 579 // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ 580 bool queued_for_flush_; 581 582 // If true --> this ColumnFamily is currently present in 583 // DBImpl::compaction_queue_ 584 bool queued_for_compaction_; 585 586 uint64_t prev_compaction_needed_bytes_; 587 588 // if the database was opened with 2pc enabled 589 bool allow_2pc_; 590 591 // Memtable id to track flush. 592 std::atomic<uint64_t> last_memtable_id_; 593 594 // Directories corresponding to cf_paths. 595 std::vector<std::shared_ptr<Directory>> data_dirs_; 596 }; 597 598 // ColumnFamilySet has interesting thread-safety requirements 599 // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB 600 // mutex AND executed in the write thread. 601 // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND 602 // single-threaded write thread. It is also called during Recovery and in 603 // DumpManifest(). 604 // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be 605 // held and it needs to be executed from the write thread. SetDropped() also 606 // guarantees that it will be called only from single-threaded LogAndApply(), 607 // but this condition is not that important. 608 // * Iteration -- hold DB mutex, but you can release it in the body of 609 // iteration. 
If you release DB mutex in body, reference the column 610 // family before the mutex and unreference after you unlock, since the column 611 // family might get dropped when the DB mutex is released 612 // * GetDefault() -- thread safe 613 // * GetColumnFamily() -- either inside of DB mutex or from a write thread 614 // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), 615 // NumberOfColumnFamilies -- inside of DB mutex 616 class ColumnFamilySet { 617 public: 618 // ColumnFamilySet supports iteration 619 class iterator { 620 public: iterator(ColumnFamilyData * cfd)621 explicit iterator(ColumnFamilyData* cfd) 622 : current_(cfd) {} 623 iterator& operator++() { 624 // dropped column families might still be included in this iteration 625 // (we're only removing them when client drops the last reference to the 626 // column family). 627 // dummy is never dead, so this will never be infinite 628 do { 629 current_ = current_->next_; 630 } while (current_->refs_.load(std::memory_order_relaxed) == 0); 631 return *this; 632 } 633 bool operator!=(const iterator& other) { 634 return this->current_ != other.current_; 635 } 636 ColumnFamilyData* operator*() { return current_; } 637 638 private: 639 ColumnFamilyData* current_; 640 }; 641 642 ColumnFamilySet(const std::string& dbname, 643 const ImmutableDBOptions* db_options, 644 const FileOptions& file_options, Cache* table_cache, 645 WriteBufferManager* write_buffer_manager, 646 WriteController* write_controller, 647 BlockCacheTracer* const block_cache_tracer); 648 ~ColumnFamilySet(); 649 650 ColumnFamilyData* GetDefault() const; 651 // GetColumnFamily() calls return nullptr if column family is not found 652 ColumnFamilyData* GetColumnFamily(uint32_t id) const; 653 ColumnFamilyData* GetColumnFamily(const std::string& name) const; 654 // this call will return the next available column family ID. 
it guarantees 655 // that there is no column family with id greater than or equal to the 656 // returned value in the current running instance or anytime in RocksDB 657 // instance history. 658 uint32_t GetNextColumnFamilyID(); 659 uint32_t GetMaxColumnFamily(); 660 void UpdateMaxColumnFamily(uint32_t new_max_column_family); 661 size_t NumberOfColumnFamilies() const; 662 663 ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id, 664 Version* dummy_version, 665 const ColumnFamilyOptions& options); 666 begin()667 iterator begin() { return iterator(dummy_cfd_->next_); } end()668 iterator end() { return iterator(dummy_cfd_); } 669 670 // REQUIRES: DB mutex held 671 // Don't call while iterating over ColumnFamilySet 672 void FreeDeadColumnFamilies(); 673 get_table_cache()674 Cache* get_table_cache() { return table_cache_; } 675 676 private: 677 friend class ColumnFamilyData; 678 // helper function that gets called from cfd destructor 679 // REQUIRES: DB mutex held 680 void RemoveColumnFamily(ColumnFamilyData* cfd); 681 682 // column_families_ and column_family_data_ need to be protected: 683 // * when mutating both conditions have to be satisfied: 684 // 1. DB mutex locked 685 // 2. thread currently in single-threaded write thread 686 // * when reading, at least one condition needs to be satisfied: 687 // 1. DB mutex locked 688 // 2. accessed from a single-threaded write thread 689 std::unordered_map<std::string, uint32_t> column_families_; 690 std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_; 691 692 uint32_t max_column_family_; 693 ColumnFamilyData* dummy_cfd_; 694 // We don't hold the refcount here, since default column family always exists 695 // We are also not responsible for cleaning up default_cfd_cache_. 
This is 696 // just a cache that makes common case (accessing default column family) 697 // faster 698 ColumnFamilyData* default_cfd_cache_; 699 700 const std::string db_name_; 701 const ImmutableDBOptions* const db_options_; 702 const FileOptions file_options_; 703 Cache* table_cache_; 704 WriteBufferManager* write_buffer_manager_; 705 WriteController* write_controller_; 706 BlockCacheTracer* const block_cache_tracer_; 707 }; 708 709 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access 710 // memtables of different column families (specified by ID in the write batch) 711 class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { 712 public: ColumnFamilyMemTablesImpl(ColumnFamilySet * column_family_set)713 explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) 714 : column_family_set_(column_family_set), current_(nullptr) {} 715 716 // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed 717 // with the arguments used to construct *orig. 
ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl * orig)718 explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) 719 : column_family_set_(orig->column_family_set_), current_(nullptr) {} 720 721 // sets current_ to ColumnFamilyData with column_family_id 722 // returns false if column family doesn't exist 723 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be 724 // under a DB mutex OR from a write thread 725 bool Seek(uint32_t column_family_id) override; 726 727 // Returns log number of the selected column family 728 // REQUIRES: under a DB mutex OR from a write thread 729 uint64_t GetLogNumber() const override; 730 731 // REQUIRES: Seek() called first 732 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be 733 // under a DB mutex OR from a write thread 734 virtual MemTable* GetMemTable() const override; 735 736 // Returns column family handle for the selected column family 737 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be 738 // under a DB mutex OR from a write thread 739 virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; 740 741 // Cannot be called while another thread is calling Seek(). 742 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be 743 // under a DB mutex OR from a write thread current()744 virtual ColumnFamilyData* current() override { return current_; } 745 746 private: 747 ColumnFamilySet* column_family_set_; 748 ColumnFamilyData* current_; 749 ColumnFamilyHandleInternal handle_; 750 }; 751 752 extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); 753 754 extern const Comparator* GetColumnFamilyUserComparator( 755 ColumnFamilyHandle* column_family); 756 757 } // namespace ROCKSDB_NAMESPACE 758